## Import Statements

In [2]:
import re
import string
from pathlib import Path

import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Download the NLTK stopword corpus only when it is not already installed.
# Calling nltk.download unconditionally needs network access on every run and
# prints a log that includes the local data directory.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

# Keep the English stopwords in a set for fast membership tests while filtering
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\VICTUS
[nltk_data]     16\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load the dataset

In [4]:
# Location of the raw IMDB reviews CSV, relative to this notebook
PATH = "../data/IMDB Dataset.csv"
# Read the entire file into a DataFrame
df = pd.read_csv(filepath_or_buffer=PATH)

In [5]:
# Preview the first five rows to confirm the columns loaded as expected
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Start of feature engineering

1. Lowercase the text, strip punctuation, and remove 'br' (HTML line-break remnants) and stopwords

In [6]:
# Matches HTML line-break tags such as <br>, <br/> and <br /> (text is
# lowercased before this is applied).
_BR_TAG_RE = re.compile(r'<br\s*/?>')
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_review(text, stopword_set=None):
    """Normalize one review into a space-separated string of content tokens.

    Steps, in order: lowercase, replace HTML ``<br>`` tags with a space,
    delete ASCII punctuation, tokenize on word characters, then drop
    stopwords and any literal ``'br'`` token.

    Parameters
    ----------
    text : str
        Raw review text.
    stopword_set : collection of str, optional
        Tokens to discard. When omitted, the notebook-level ``stop_words``
        set is used.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if none remain).
    """
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower()
    # Tags must become a space BEFORE punctuation is deleted. Otherwise
    # "film.<br />next" collapses to "filmbr next" and the glued "br"
    # escapes the 'br' filter below.
    text = _BR_TAG_RE.sub(' ', text)
    text = text.translate(_PUNCT_TABLE)

    # NOTE(review): apostrophes are deleted above, so a contraction such as
    # "don't" becomes "dont" and will not match a stopword entry spelled with
    # an apostrophe — confirm whether that is acceptable.
    words = re.findall(r'\b\w+\b', text)

    # The explicit 'br' check is kept for stray "br" tokens not written as tags.
    return ' '.join(
        word for word in words
        if word not in stopword_set and word != 'br'
    )

In [7]:
# Coerce every review to str, then clean each one element-wise
df['clean_review'] = df['review'].astype(str).map(clean_review)

2. Convert Sentiment to Binary Labels

In [8]:
# Encode sentiment as a binary target: positive -> 1, negative -> 0
SENTIMENT_TO_LABEL = {'positive': 1, 'negative': 0}
df['label'] = df['sentiment'].map(SENTIMENT_TO_LABEL)

# Series.map turns any value that is not a key of the dict into NaN. Fail
# loudly here instead of passing NaN labels to later steps.
unmapped = df.loc[df['label'].isna(), 'sentiment'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected sentiment values: {unmapped.tolist()}")

In [9]:
# Spot-check that each sentiment string sits next to its numeric label
df.loc[:, ['sentiment', 'label']].head(n=5)

Unnamed: 0,sentiment,label
0,positive,1
1,positive,1
2,positive,1
3,negative,0
4,positive,1


3. TF-IDF Vectorization

Convert the cleaned text into numerical features using TF-IDF vectorization, keeping a vocabulary of at most 10,000 terms (one feature dimension per term).

In [10]:
# Upper bound on the vocabulary size, i.e. the number of feature columns
MAX_FEATURES = 10_000
tfidf = TfidfVectorizer(max_features=MAX_FEATURES)

# Target vector: the binary sentiment labels
y = df['label']

# Learn the vocabulary and IDF weights, and vectorize the cleaned reviews.
# NOTE(review): the vectorizer is fitted on every row of df; if a train/test
# split happens later, the test rows will already have influenced the
# vocabulary and IDF weights — confirm this is intended.
X = tfidf.fit_transform(df['clean_review'])

In [12]:
# Sanity check on the feature matrix dimensions: (rows, feature columns)
X.shape

(50000, 10000)

Save the fitted TF-IDF vectorizer to a file

In [13]:
# Persist the fitted vectorizer so it can be reloaded later without refitting.
# joblib and Path are imported in the notebook's top import cell.
VECTORIZER_PATH = '../models/tfidf_vectorizer.pkl'

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
Path(VECTORIZER_PATH).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(tfidf, VECTORIZER_PATH)

['../models/tfidf_vectorizer.pkl']