## Import libraries and load data

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib

# Location of the preprocessed review dataset
path = '../BDS_project/final_new_data_processed.csv'

# Read the whole CSV into a DataFrame
data = pd.read_csv(path)

# Target: the sentiment label of every row, taken as the column's underlying array
y = data['review_sentiment'].values

# Features: every column except the raw review, rating, date and the label itself
X = data.drop(columns=['review', 'rating', 'date', 'review_sentiment'])

# First split: hold out 30% of all rows as the test set, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Second split: carve 30% of the remaining training rows off as a cross-validation
# set, again stratified on the label (X_train / y_train are rebound to the smaller set)
X_train, X_cv, y_train, y_cv = train_test_split(
    X_train,
    y_train,
    test_size=0.30,
    stratify=y_train,
    random_state=42,
)

## Get BoW Vectorizer

In [None]:
# Unigram bag-of-words vectorizer; min_df=10 drops terms that appear in fewer
# than 10 training documents. fit() returns the vectorizer itself, so the
# construction and the fit on the training 'cleaned_review' column are chained.
vect_bow_1 = CountVectorizer(ngram_range=(1, 1), min_df=10).fit(
    X_train['cleaned_review'].values
)

# Persist the fitted vectorizer so it can be reloaded later without re-fitting
joblib.dump(vect_bow_1, '../BDS_project/vectorizer_bow.pkl')

## Get TF-IDF Vectorizer

In [None]:
# Unigram TF-IDF vectorizer; min_df=10 drops terms that appear in fewer than
# 10 training documents. fit() returns the vectorizer itself, so the
# construction and the fit on the training 'cleaned_review' column are chained.
vect_tfidf_1 = TfidfVectorizer(ngram_range=(1, 1), min_df=10).fit(
    X_train['cleaned_review'].values
)

# Persist the fitted vectorizer so it can be reloaded later without re-fitting
joblib.dump(vect_tfidf_1, '../BDS_project/vectorizer_tfidf.pkl')

## Get n-gram + BoW Vectorizer

In [None]:
# Discover every 2- to 4-gram that occurs in at least 10 training reviews
ngram_vec_bow = CountVectorizer(ngram_range=(2, 4), min_df=10).fit(
    X_train['cleaned_review'].values
)

# Keep only the n-grams in which 'no' appears as a whole space-separated token
vocab = [
    phrase
    for phrase in ngram_vec_bow.get_feature_names_out()
    if 'no' in phrase.split(' ')
]

# Replace the discovery vectorizer with one restricted to that fixed vocabulary.
# This new instance is configured with the vocabulary but is not fitted again here.
ngram_vec_bow = CountVectorizer(vocabulary=vocab, ngram_range=(2, 4))

# Persist the vocabulary-restricted vectorizer for future use
joblib.dump(ngram_vec_bow, '../BDS_project/ngram_vec_bow.pkl')

## Get n-gram + TF-IDF Vectorizer

In [None]:
# TF-IDF vectorizer over 2- to 4-grams, restricted to the fixed `vocab` list
# (the n-grams containing the token 'no' selected in the n-gram BoW step)
ngram_vec_tfidf = TfidfVectorizer(ngram_range=(2, 4), vocabulary=vocab)

# Fit on the training reviews so the learned IDF weights reflect how often each
# n-gram actually occurs across training documents. Fitting on `vocab` itself
# would treat every n-gram string as a one-phrase "document" and yield IDF
# values unrelated to the corpus. The vocabulary (and therefore the feature
# columns and their order) is fixed by the constructor and is not changed by fit.
ngram_vec_tfidf.fit(X_train['cleaned_review'].values)

# Save the fitted vectorizer to a file for future use
joblib.dump(ngram_vec_tfidf, '../BDS_project/ngram_vec_tfidf.pkl')