In [17]:
import pandas as pd
from pathlib import Path
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
data_path = Path.cwd().parent / "data" / "data.csv"

In [3]:
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,review,sentiment
0,Bad. Personal opinion? The folks who made it? ...,negative
1,This movie is obviously low-budget & filmed in...,positive
2,"Yes, this movie has kids going to space camp a...",negative
3,"Before I begin, let me tell you how GREAT this...",positive
4,The Vampire Bat is set in the small German vil...,negative


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     1000 non-null   object
 1   sentiment  1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [5]:
def remove_stop_words(text):
    """Remove stop words from the text."""
    stop_words = set(stopwords.words("english"))
    text = [word for word in str(text).split() if word not in stop_words]
    return " ".join(text)

In [6]:
def removing_numbers(text):
    """Remove numbers from the text."""
    text = ''.join([char for char in text if not char.isdigit()])
    return text

In [7]:
def lower_case(text):
    """Convert text to lower case."""
    return text.lower()

In [8]:
def lemmatization(text):
    """Lemmatize the text."""
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

In [9]:
def removing_punctuations(text):
    """Remove punctuations from the text."""
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('Ø›', "")
    text = text.replace(';', "")
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [10]:
def removing_urls(text):
    """Remove URLs from the text."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [None]:
def normalize_text(df):
    """Normalize the text data."""
    df['review'] = df['review'].apply(lower_case)
    df['review'] = df['review'].apply(remove_stop_words)
    df['review'] = df['review'].apply(removing_numbers)
    df['review'] = df['review'].apply(removing_punctuations)
    df['review'] = df['review'].apply(removing_urls)
    df['review'] = df['review'].apply(lemmatization)
    return df

In [12]:
df = normalize_text(df)
df.head()

Unnamed: 0,review,sentiment
0,bad personal opinion folk made it knew made it...,negative
1,movie obviously low budget filmed british colu...,positive
2,yes movie kid going space camp start okay enou...,negative
3,begin let tell great stand up special sound pl...,positive
4,vampire bat set small german village klineschl...,negative


In [14]:
sentiment_mapper = {"positive" : 1 , "negative" : 0}
df['sentiment'] = df['sentiment'].map(sentiment_mapper)
df.head()

Unnamed: 0,review,sentiment
0,bad personal opinion folk made it knew made it...,0
1,movie obviously low budget filmed british colu...,1
2,yes movie kid going space camp start okay enou...,0
3,begin let tell great stand up special sound pl...,1
4,vampire bat set small german village klineschl...,0


In [16]:
vectorizer = CountVectorizer(max_features=100)
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)