In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
os.getcwd()

'C:\\Users\\91705\\Desktop\\Projects\\Sentiment Analysis\\notebooks'

In [3]:
home_dir = os.path.expanduser('~')
print(home_dir)

csv_path = os.path.expanduser(r'~\Desktop\Projects\Sentiment Analysis\data\raw\IMDB Dataset.csv')
print(csv_path)


C:\Users\91705
C:\Users\91705\Desktop\Projects\Sentiment Analysis\data\raw\IMDB Dataset.csv


In [4]:
data = pd.read_csv(csv_path)
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
data.duplicated().sum()

418

In [6]:
data = data.drop_duplicates(subset = 'review')
data.shape

(49582, 2)

In [7]:
data.duplicated().sum()

0

In [8]:
data.isna().sum() 

review       0
sentiment    0
dtype: int64

In [9]:
data = data.reset_index(drop=True)
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49577,I thought this movie did a down right good job...,positive
49578,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49579,I am a Catholic taught in parochial elementary...,negative
49580,I'm going to have to disagree with the previou...,negative


In [10]:
print(data.shape)

(49582, 2)


## Text Cleaning

In [11]:
import re

def text_clean(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)           # Removing html tags
    text = re.sub(r'[^a-zA-Z\s]','', text)      # Removing punctuations and numbers
    text = re.sub(r'\s+', ' ', text).strip()    # Removed extra steps
    return text

In [12]:
data['cleaned_reviews'] = data['review'].apply(text_clean)

In [13]:
data[['review','cleaned_reviews']].head()

Unnamed: 0,review,cleaned_reviews
0,One of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,a wonderful little production the filming tech...
2,I thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,basically theres a family where a little boy j...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love in the time of money is a ...


## Tokenization

In [14]:
from nltk.tokenize import word_tokenize

data['tokens'] = data['cleaned_reviews'].apply(word_tokenize)

In [15]:
data['tokens']

0        [one, of, the, other, reviewers, has, mentione...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, this, was, a, wonderful, way, to,...
3        [basically, theres, a, family, where, a, littl...
4        [petter, matteis, love, in, the, time, of, mon...
                               ...                        
49577    [i, thought, this, movie, did, a, down, right,...
49578    [bad, plot, bad, dialogue, bad, acting, idioti...
49579    [i, am, a, catholic, taught, in, parochial, el...
49580    [im, going, to, have, to, disagree, with, the,...
49581    [no, one, expects, the, star, trek, movies, to...
Name: tokens, Length: 49582, dtype: object

## Stopwords Removal

In [16]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In Natural Language Processing (NLP), stop words are common words (like "the", "a", "is") that are often removed from text data because they don't carry significant meaning for many NLP tasks. This process, known as stop word removal, can improve the efficiency and accuracy of subsequent analysis.

Examples of stop words:
- Articles: "a", "an", "the"
- Conjunctions: "and", "but", "or"
- Prepositions: "in", "on", "at", "with"
- Pronouns: "he", "she", "it", "they"
- Common verbs: "is", "am", "are", "was", "were" 

In [17]:
data['tokens'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])
data['tokens']

0        [one, reviewers, mentioned, watching, oz, epis...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, theres, family, little, boy, jake,...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49577    [thought, movie, right, good, job, wasnt, crea...
49578    [bad, plot, bad, dialogue, bad, acting, idioti...
49579    [catholic, taught, parochial, elementary, scho...
49580    [im, going, disagree, previous, comment, side,...
49581    [one, expects, star, trek, movies, high, art, ...
Name: tokens, Length: 49582, dtype: object

In [18]:
data['final_text'] = data['tokens'].apply(lambda tokens: ' '.join(tokens))
data['final_text']

0        one reviewers mentioned watching oz episode yo...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        basically theres family little boy jake thinks...
4        petter matteis love time money visually stunni...
                               ...                        
49577    thought movie right good job wasnt creative or...
49578    bad plot bad dialogue bad acting idiotic direc...
49579    catholic taught parochial elementary schools n...
49580    im going disagree previous comment side maltin...
49581    one expects star trek movies high art fans exp...
Name: final_text, Length: 49582, dtype: object

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['final_text']).toarray()

y = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values

In [29]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
y

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [31]:
from scipy import sparse
X_sparse = sparse.csr_matrix(X)

# sparse.save_npz('C:/Users/91705//Desktop/Projects/Sentiment Analysis/data/processed/x_data_sparse.npz', X)

In [32]:
import joblib

def save_processed_data(X, filname):
    """Saves data with automatic path handling"""
    expanded_path = os.path.expanduser(filname)
    os.makedirs(os.path.dirname(expanded_path), exist_ok = True)
    joblib.dump(X, expanded_path)


save_processed_data(X_sparse, '~/Desktop/Projects/Sentiment Analysis/data/processed/X_data.pkl')
# save_processed_data(y, '~/Desktop/Projects/Sentiment Analysis/data/processed/y_data.pkl')
# save_processed_data(vectorizer, '~/Desktop/Projects/Sentiment Analysis/data/processed/tf_idf_vectorizer.pkl')



In [33]:
import os

# filePath = os.path.expanduser('C:/Users/91705/')
size_in_mb = os.path.getsize('C:/Users/91705//Desktop/Projects/Sentiment Analysis/data/processed/x_data.pkl') / (1024 * 1024)
print(f"Size of x_data.pkl: {size_in_mb:.2f} MB")


Size of x_data.pkl: 43.93 MB
