In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from tqdm import tqdm

# Suppress every Python warning for the rest of the session so that library
# warnings do not clutter the cell outputs.
# NOTE(review): this is a blanket filter, so numerical RuntimeWarnings and
# deprecation notices are hidden too; consider narrowing it to specific
# categories/modules once the noisy source is known.
warnings.filterwarnings("ignore")

In [3]:
# Load the pre-processed review corpus from disk and preview its first rows.
reviews_data = pd.read_csv(filepath_or_buffer="result_df.csv")
reviews_data.head(n=5)

Unnamed: 0,Text,Score
0,bought several vitality canned dog food produc...,1
1,product arrived labeled jumbo salted peanuts p...,0
2,confection around centuries light pillowy citr...,1
3,looking secret ingredient robitussin believe f...,0
4,great taffy great price wide assortment yummy ...,1


In [4]:
# Count the missing entries per column (column-wise sum of the null mask)
# and print the resulting Series.
nan_counts = reviews_data.isna().sum(axis=0)
print(nan_counts)

Text     724
Score      0
dtype: int64


In [5]:

# Discard every row that has at least one missing value; the original
# index labels of the surviving rows are kept as they are.
reviews_data = reviews_data.dropna(axis=0, how="any")

# Preview the first rows of the cleaned frame
reviews_data.head(n=5)

Unnamed: 0,Text,Score
0,bought several vitality canned dog food produc...,1
1,product arrived labeled jumbo salted peanuts p...,0
2,confection around centuries light pillowy citr...,1
3,looking secret ingredient robitussin believe f...,0
4,great taffy great price wide assortment yummy ...,1


In [6]:
# Recompute the per-column count of missing values and print it.
nan_counts = reviews_data.isna().sum(axis=0)
print(nan_counts)

Text     0
Score    0
dtype: int64


In [7]:
# Preview the first rows of the frame before selecting features and labels.
reviews_data.head()

Unnamed: 0,Text,Score
0,bought several vitality canned dog food produc...,1
1,product arrived labeled jumbo salted peanuts p...,0
2,confection around centuries light pillowy citr...,1
3,looking secret ingredient robitussin believe f...,0
4,great taffy great price wide assortment yummy ...,1


In [8]:
# Split the frame into the model input (review text) and the target (score),
# then print both shapes as a sanity check that they line up.
X, y = reviews_data['Text'], reviews_data['Score']
print(X.shape, y.shape)

(242690,) (242690,)


In [9]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 30% of all reviews as the final test set.
# NOTE(review): the split is not stratified on y; if the Score classes are
# imbalanced, consider passing stratify=... -- confirm against the label
# distribution before changing, since it alters every downstream artifact.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=40
)

# Stage 2: carve 30% of the remaining training portion off as the
# cross-validation set; the other 70% stays as the actual training set.
X_train, X_cv, y_train, y_cv = train_test_split(
    X_train, y_train, test_size=0.3, random_state=40
)

# Report the (features, labels) shapes of each partition.
for split_label, split_X, split_y in (
    ("Training set shape:", X_train, y_train),
    ("Cross-validation set shape:", X_cv, y_cv),
    ("Test set shape:", X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)

Training set shape: (118918,) (118918,)
Cross-validation set shape: (50965,) (50965,)
Test set shape: (72807,) (72807,)


In [10]:
# Persist the label Series of each partition as a pickle file in the
# working directory (written in train -> cv -> test order).
for label_path, label_series in (
    ('y_train.pkl', y_train),
    ('y_cv.pkl', y_cv),
    ('y_test.pkl', y_test),
):
    with open(label_path, 'wb') as label_file:
        pickle.dump(label_series, label_file)

## 1. BoW Vectorization

In [10]:
# Bag-of-words on unigrams, with min_df=5 as the document-frequency cut-off.
count_vect = CountVectorizer(min_df=5, ngram_range=(1, 1))

# The vocabulary is fitted on the training split only; the CV and test
# splits are merely transformed with it. tqdm wraps each input so that
# a progress bar is shown while the documents are consumed.
print("Applying BoW vectorizer to training data...")
X_train_bow = count_vect.fit_transform(tqdm(X_train))

print("Applying BoW vectorizer to CV data...")
X_cv_bow = count_vect.transform(tqdm(X_cv))

print("Applying BoW vectorizer to test data...")
X_test_bow = count_vect.transform(tqdm(X_test))

# Pickle the three BoW matrices (train -> cv -> test).
for bow_path, bow_matrix in (
    ('X_train_bow.pkl', X_train_bow),
    ('X_cv_bow.pkl', X_cv_bow),
    ('X_test_bow.pkl', X_test_bow),
):
    with open(bow_path, 'wb') as bow_file:
        pickle.dump(bow_matrix, bow_file)

print("BoW vectorization complete and saved.")

Applying BoW vectorizer to training data...


100%|███████████████████████████████████████████████████████████████████████| 118918/118918 [00:02<00:00, 45316.47it/s]


Applying BoW vectorizer to CV data...


100%|█████████████████████████████████████████████████████████████████████████| 50965/50965 [00:01<00:00, 45981.26it/s]


Applying BoW vectorizer to test data...


100%|█████████████████████████████████████████████████████████████████████████| 72807/72807 [00:01<00:00, 44559.91it/s]


BoW vectorization complete and saved.


## 2. TF-IDF Vectorization

In [11]:
# TF-IDF on unigrams, with min_df=5 as the document-frequency cut-off.
tfidf_vect = TfidfVectorizer(min_df=5, ngram_range=(1, 1))

# Vocabulary and IDF weights are fitted on the training split only; the CV
# and test splits are transformed with the already-fitted vectorizer.
# tqdm wraps each input to show progress while documents are consumed.
print("Applying TF-IDF vectorizer to training data...")
X_train_tfidf = tfidf_vect.fit_transform(tqdm(X_train))

print("Applying TF-IDF vectorizer to CV data...")
X_cv_tfidf = tfidf_vect.transform(tqdm(X_cv))

print("Applying TF-IDF vectorizer to test data...")
X_test_tfidf = tfidf_vect.transform(tqdm(X_test))

# Pickle the three TF-IDF matrices (train -> cv -> test).
for tfidf_path, tfidf_matrix in (
    ('X_train_tfidf.pkl', X_train_tfidf),
    ('X_cv_tfidf.pkl', X_cv_tfidf),
    ('X_test_tfidf.pkl', X_test_tfidf),
):
    with open(tfidf_path, 'wb') as tfidf_file:
        pickle.dump(tfidf_matrix, tfidf_file)

print("TF-IDF vectorization complete and saved.")

Applying TF-IDF vectorizer to training data...


100%|███████████████████████████████████████████████████████████████████████| 118918/118918 [00:02<00:00, 44973.35it/s]


Applying TF-IDF vectorizer to CV data...


100%|█████████████████████████████████████████████████████████████████████████| 50965/50965 [00:01<00:00, 44483.06it/s]


Applying TF-IDF vectorizer to test data...


100%|█████████████████████████████████████████████████████████████████████████| 72807/72807 [00:01<00:00, 44450.35it/s]


TF-IDF vectorization complete and saved.


## 3. Word2Vec Vectorization

In [12]:
# Whitespace-tokenize every review of each split; the three token lists are
# produced in train -> cv -> test order and unpacked into their own names.
X_train_tokens, X_cv_tokens, X_test_tokens = (
    [text.split() for text in split]
    for split in (X_train, X_cv, X_test)
)

# Fit the Word2Vec embedding on the training tokens only.
print("Training Word2Vec model...")
w2v_params = dict(vector_size=100, window=5, min_count=5, workers=4)
w2v_model = Word2Vec(sentences=tqdm(X_train_tokens), **w2v_params)

# Vectorize the training, CV, and test data using the Word2Vec model
def vectorize_w2v(tokens, model):
    """Embed each tokenized document as the mean of its known word vectors.

    Parameters
    ----------
    tokens : iterable of list of str
        One token list per document.
    model : object
        Must expose ``wv`` (supporting ``word in model.wv`` and
        ``model.wv[word]``) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_documents, model.vector_size)``. A document with
        no token present in ``model.wv`` (including an empty document) is
        represented by an all-zero row.
    """
    keyed_vectors = model.wv
    dim = model.vector_size

    vectorized = []
    for token_list in tqdm(tokens):
        known = [keyed_vectors[word] for word in token_list if word in keyed_vectors]
        if known:
            vectorized.append(np.mean(known, axis=0))
        else:
            # Handle the "no known word" case explicitly instead of calling
            # np.mean on an empty list, which yields a NaN scalar and a
            # "Mean of empty slice" RuntimeWarning.
            vectorized.append(np.zeros(dim))

    if not vectorized:
        # Keep the result 2-D even when there are no documents at all.
        return np.empty((0, dim))
    return np.array(vectorized)

# Embed each split with the trained model (train -> cv -> test).
print("Applying Word2Vec vectorizer to training data...")
X_train_w2v = vectorize_w2v(tokens=X_train_tokens, model=w2v_model)

print("Applying Word2Vec vectorizer to CV data...")
X_cv_w2v = vectorize_w2v(tokens=X_cv_tokens, model=w2v_model)

print("Applying Word2Vec vectorizer to test data...")
X_test_w2v = vectorize_w2v(tokens=X_test_tokens, model=w2v_model)

# Pickle the three embedding matrices in the same order.
for w2v_path, w2v_matrix in (
    ('X_train_w2v.pkl', X_train_w2v),
    ('X_cv_w2v.pkl', X_cv_w2v),
    ('X_test_w2v.pkl', X_test_w2v),
):
    with open(w2v_path, 'wb') as w2v_file:
        pickle.dump(w2v_matrix, w2v_file)

print("Word2Vec vectorization complete and saved.")

Training Word2Vec model...


100%|██████████████████████████████████████████████████████████████████████| 118918/118918 [00:00<00:00, 172815.76it/s]


Applying Word2Vec vectorizer to training data...


100%|███████████████████████████████████████████████████████████████████████| 118918/118918 [00:10<00:00, 11564.07it/s]


Applying Word2Vec vectorizer to CV data...


100%|█████████████████████████████████████████████████████████████████████████| 50965/50965 [00:04<00:00, 11502.42it/s]


Applying Word2Vec vectorizer to test data...


100%|█████████████████████████████████████████████████████████████████████████| 72807/72807 [00:06<00:00, 12083.49it/s]


Word2Vec vectorization complete and saved.
