## Imports and loading data

In [None]:
# Importing standard libraries
import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import regex as re
import seaborn as sns
import warnings
from tqdm import tqdm

# Importing sklearn libraries
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss, roc_auc_score, roc_curve
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, Normalizer, StandardScaler
from sklearn.svm import LinearSVC, SVC

# Importing nltk libraries
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Importing scipy library
from scipy.sparse import hstack

# Importing xgboost library
from xgboost import XGBClassifier

# Suppressing warnings
warnings.filterwarnings('ignore')

# Installing xgboost
!pip install xgboost

# Read the pre-processed review data from disk
path = '../BDS_project/final_new_data_processed.csv'
data = pd.read_csv(path)

# Features are everything except the raw text / rating / date and the target itself;
# the target is the 'review_sentiment' column as a plain array
X = data.drop(columns=['review', 'rating', 'date', 'review_sentiment'])
y = data['review_sentiment'].values

# Both splits are stratified on the label, hold out 30% and use the same seed
split_kwargs = dict(test_size=0.30, random_state=42)

# First split: 70% train / 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_kwargs)

# Second split: carve a cross-validation set (30%) out of the remaining training data
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, stratify=y_train, **split_kwargs)

## Initialization of data

In [None]:
# Columns fed to the first normalizer, assembled from three groups
stat_cols = ['usefulCount', 'sentiment_score', 'sentiment_score_clean', 'word_count',
             'unique_word_count', 'char_length', 'count_punctuations',
             'stopword_count', 'mean_word_len', 'subj_count', 'obj_count']
# presumably per-review counts of named-entity types - TODO confirm
entity_cols = ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC',
               'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT',
               'QUANTITY', 'TIME', 'WORK_OF_ART']
# Columns literally named '0' ... '19'
numbered_cols = [str(i) for i in range(20)]
columns = stat_cols + entity_cols + numbered_cols

# NOTE(review): scikit-learn's Normalizer rescales each ROW to unit norm (it does not
# learn per-column statistics) - confirm that row-wise scaling is what is wanted here.
normalizer = Normalizer()
X_train_num_1 = normalizer.fit_transform(X_train[columns])
X_test_num_1 = normalizer.transform(X_test[columns])
X_cv_num_1 = normalizer.transform(X_cv[columns])

# Raw (un-normalized) sentiment scores, kept as separate 2-column arrays
sentiment_cols = ['sentiment_score', 'sentiment_score_clean']
X_train_sent_score = X_train[sentiment_cols].values
X_test_sent_score = X_test[sentiment_cols].values
X_cv_sent_score = X_cv[sentiment_cols].values

# Side-by-side: normalized numeric block followed by the raw sentiment scores
X_tr_1 = np.concatenate([X_train_num_1, X_train_sent_score], axis=1)
X_te_1 = np.concatenate([X_test_num_1, X_test_sent_score], axis=1)
X_cv_1 = np.concatenate([X_cv_num_1, X_cv_sent_score], axis=1)

In [None]:
# Integer-encode 'year'. The encoder is fitted on the full feature frame X so that
# every value occurring in any of the three splits is known to transform().
lab_enc_year = LabelEncoder().fit(X['year'].values)

# One (n_rows, 1) column per split, ready to be stacked next to the other features
X_train_year, X_test_year, X_cv_year = (
    lab_enc_year.transform(frame['year'].values).reshape(-1, 1)
    for frame in (X_train, X_test, X_cv)
)

In [None]:
# Columns removed from all three splits before the second normalization pass.
# Note: this rebinds X_train / X_test / X_cv, so re-running the cell on the already
# reduced frames raises a KeyError.
drop_col = ['subj_count', 'obj_count', 'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY',
 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', '9', '14']

X_train = X_train.drop(columns=drop_col)
X_test = X_test.drop(columns=drop_col)
X_cv = X_cv.drop(columns=drop_col)

In [None]:
# Numeric columns that survive the column drop: the text statistics, two upper-case
# columns and the numbered columns '0'..'19' without '9' and '14'
imp_columns = (
    ['usefulCount', 'word_count', 'unique_word_count', 'char_length', 'count_punctuations',
     'stopword_count', 'mean_word_len', 'TIME', 'WORK_OF_ART']
    + [str(i) for i in range(20) if i not in (9, 14)]
)

# Fresh normalizer for the reduced column set (replaces the earlier `normalizer`)
normalizer = Normalizer()

# fit_transform on the training split, plain transform on the held-out splits
X_train_num_2 = normalizer.fit_transform(X_train[imp_columns])
X_test_num_2, X_cv_num_2 = (
    normalizer.transform(frame[imp_columns]) for frame in (X_test, X_cv)
)


In [None]:
# Integer-encode 'condition'. The encoder is fitted on the full feature frame X so
# that every value occurring in any of the three splits is known to transform().
lab_enc_cond = LabelEncoder().fit(X['condition'].values)

# One (n_rows, 1) column per split, ready to be stacked next to the other features
X_train_condition, X_test_condition, X_cv_condition = (
    lab_enc_cond.transform(frame['condition'].values).reshape(-1, 1)
    for frame in (X_train, X_test, X_cv)
)

## Bag of Words (BoW) Vectorization + XGBoost Classifier

In [2]:
# Load the previously saved bag-of-words vectorizer.
# (The bare name `load` is not defined anywhere in this notebook; joblib.load is used
# for every other pickle here.)
vect_bow_1 = joblib.load('../BDS_project/vectorizer_bow.pkl')

# Transform the 'cleaned_review' column of the training, testing, and cross-validation data
X_train_review_bow_1 = vect_bow_1.transform(X_train['cleaned_review'].values)
X_test_review_bow_1 = vect_bow_1.transform(X_test['cleaned_review'].values)
X_cv_review_bow_1 = vect_bow_1.transform(X_cv['cleaned_review'].values)

# The label-encoded 'condition' columns (X_train_condition, X_test_condition,
# X_cv_condition) are reused from the encoding cell that precedes this section;
# they are not recomputed here.

# Concatenate the normalized data, sentiment scores, condition, year, and bag-of-words features for each dataset
X_tr_2 = hstack((X_train_num_2, X_train_sent_score, X_train_condition, X_train_year, X_train_review_bow_1)).tocsr()
X_te_2 = hstack((X_test_num_2, X_test_sent_score, X_test_condition, X_test_year, X_test_review_bow_1)).tocsr()
X_cv_2 = hstack((X_cv_num_2, X_cv_sent_score, X_cv_condition, X_cv_year, X_cv_review_bow_1)).tocsr()

In [None]:
# Hyperparameters of the XGBoost model trained on the BoW feature matrix
xgb_params_2 = dict(
    n_estimators=1000,
    subsample=1,
    max_depth=10,
    learning_rate=0.1,
    colsample_bytree=0.1,
    nthread=-1,
    objective='binary:logistic',
    random_state=0,
)
x_cfl_2 = XGBClassifier(**xgb_params_2)

# Logloss is tracked on the training and cross-validation splits while fitting;
# training stops after 20 rounds without improvement
eval_set = [(X_tr_2, y_train), (X_cv_2, y_cv)]
x_cfl_2.fit(
    X_tr_2,
    y_train,
    eval_set=eval_set,
    eval_metric='logloss',
    verbose=0,
    early_stopping_rounds=20,
)

# Sigmoid calibration wrapped around the classifier, fitted on the training data.
# NOTE(review): with the default `cv`, CalibratedClassifierCV refits clones of
# x_cfl_2 on folds of X_tr_2 without the early-stopping arguments used above -
# confirm this is intended.
x_sig_clf_2 = CalibratedClassifierCV(x_cfl_2, method="sigmoid").fit(X_tr_2, y_train)

# Report metrics (model_metrics is not defined in this part of the notebook;
# presumably it is provided elsewhere)
model_metrics(x_sig_clf_2, X_tr_2, X_te_2, X_cv_2)

# Persist the calibrated model for the ensemble step
joblib.dump(x_sig_clf_2, '../BDS_project/Bow_model.pkl')

## Term Frequency - Inverse Document Frequency (TF-IDF) Vectorization + XGBoost Classifier

In [None]:
# Restore the fitted TF-IDF vectorizer from disk
vect_tfidf_1 = joblib.load('../BDS_project/vectorizer_tfidf.pkl')

# Vectorize the 'cleaned_review' text of each split with the same vectorizer
X_train_review_tfidf_1, X_test_review_tfidf_1, X_cv_review_tfidf_1 = (
    vect_tfidf_1.transform(frame['cleaned_review'].values)
    for frame in (X_train, X_test, X_cv)
)

# Per split: normalized numerics | sentiment scores | condition | year | TF-IDF, as CSR
X_tr_3 = hstack([
    X_train_num_2, X_train_sent_score, X_train_condition, X_train_year, X_train_review_tfidf_1,
]).tocsr()
X_te_3 = hstack([
    X_test_num_2, X_test_sent_score, X_test_condition, X_test_year, X_test_review_tfidf_1,
]).tocsr()
X_cv_3 = hstack([
    X_cv_num_2, X_cv_sent_score, X_cv_condition, X_cv_year, X_cv_review_tfidf_1,
]).tocsr()


In [None]:
# Hyperparameters of the XGBoost model trained on the TF-IDF feature matrix
xgb_params_3 = dict(
    n_estimators=3000,
    subsample=0.3,
    max_depth=7,
    learning_rate=0.05,
    colsample_bytree=1,
    nthread=-1,
    objective='binary:logistic',
    random_state=0,
)
x_cfl_3 = XGBClassifier(**xgb_params_3)

# Logloss is tracked on the training and cross-validation splits while fitting;
# training stops after 30 rounds without improvement
eval_set = [(X_tr_3, y_train), (X_cv_3, y_cv)]
x_cfl_3.fit(
    X_tr_3,
    y_train,
    eval_set=eval_set,
    eval_metric='logloss',
    verbose=0,
    early_stopping_rounds=30,
)

# Sigmoid calibration wrapped around the classifier, fitted on the training data.
# NOTE(review): with the default `cv`, CalibratedClassifierCV refits clones of
# x_cfl_3 on folds of X_tr_3 without the early-stopping arguments used above -
# confirm this is intended.
x_sig_clf_3 = CalibratedClassifierCV(x_cfl_3, method="sigmoid").fit(X_tr_3, y_train)

# Report metrics (model_metrics is not defined in this part of the notebook;
# presumably it is provided elsewhere)
model_metrics(x_sig_clf_3, X_tr_3, X_te_3, X_cv_3)

# Persist the calibrated model for the ensemble step
joblib.dump(x_sig_clf_3, '../BDS_project/tfidf_model.pkl')

## N-Gram + BoW Vectorization + XGBoost Classifier

In [None]:
# Restore the fitted n-gram bag-of-words vectorizer from disk
vec_bow = joblib.load('../BDS_project/ngram_vec_bow.pkl')

# Vectorize the 'cleaned_review' text of each split with the same vectorizer
X_train_review_bow_ngram, X_test_review_bow_ngram, X_cv_review_bow_ngram = (
    vec_bow.transform(frame['cleaned_review'].values)
    for frame in (X_train, X_test, X_cv)
)

# Per split: normalized numerics | sentiment scores | condition | year | n-gram BoW, as CSR
X_tr_4 = hstack([
    X_train_num_2, X_train_sent_score, X_train_condition, X_train_year, X_train_review_bow_ngram,
]).tocsr()
X_te_4 = hstack([
    X_test_num_2, X_test_sent_score, X_test_condition, X_test_year, X_test_review_bow_ngram,
]).tocsr()
X_cv_4 = hstack([
    X_cv_num_2, X_cv_sent_score, X_cv_condition, X_cv_year, X_cv_review_bow_ngram,
]).tocsr()

In [None]:
# Hyperparameters of the XGBoost model trained on the n-gram BoW feature matrix
xgb_params_4 = dict(
    n_estimators=3000,
    subsample=0.5,
    max_depth=10,
    learning_rate=0.03,
    colsample_bytree=0.3,
    nthread=-1,
    objective='binary:logistic',
    random_state=0,
)
x_cfl_4 = XGBClassifier(**xgb_params_4)

# Logloss is tracked on the training and cross-validation splits while fitting;
# training stops after 30 rounds without improvement
eval_set = [(X_tr_4, y_train), (X_cv_4, y_cv)]
x_cfl_4.fit(
    X_tr_4,
    y_train,
    eval_set=eval_set,
    eval_metric='logloss',
    verbose=0,
    early_stopping_rounds=30,
)

# Sigmoid calibration wrapped around the classifier, fitted on the training data.
# NOTE(review): with the default `cv`, CalibratedClassifierCV refits clones of
# x_cfl_4 on folds of X_tr_4 without the early-stopping arguments used above -
# confirm this is intended.
x_sig_clf_4 = CalibratedClassifierCV(x_cfl_4, method="sigmoid").fit(X_tr_4, y_train)

# Report metrics (model_metrics is not defined in this part of the notebook;
# presumably it is provided elsewhere)
model_metrics(x_sig_clf_4, X_tr_4, X_te_4, X_cv_4)

# Persist the calibrated model for the ensemble step
joblib.dump(x_sig_clf_4, '../BDS_project/ngram_bow_model.pkl')

## N-Gram + TF-IDF + XGBoost Classifier

In [None]:
# Restore the fitted n-gram TF-IDF vectorizer from disk
vec_tfidf = joblib.load('../BDS_project/ngram_vec_tfidf.pkl')

# Vectorize the 'cleaned_review' text of each split with the same vectorizer
X_train_review_tfidf_ngram, X_test_review_tfidf_ngram, X_cv_review_tfidf_ngram = (
    vec_tfidf.transform(frame['cleaned_review'].values)
    for frame in (X_train, X_test, X_cv)
)

# Per split: normalized numerics | sentiment scores | condition | year | n-gram TF-IDF, as CSR
X_tr_5 = hstack([
    X_train_num_2, X_train_sent_score, X_train_condition, X_train_year, X_train_review_tfidf_ngram,
]).tocsr()
X_te_5 = hstack([
    X_test_num_2, X_test_sent_score, X_test_condition, X_test_year, X_test_review_tfidf_ngram,
]).tocsr()
X_cv_5 = hstack([
    X_cv_num_2, X_cv_sent_score, X_cv_condition, X_cv_year, X_cv_review_tfidf_ngram,
]).tocsr()

In [None]:
# Hyperparameters of the XGBoost model trained on the n-gram TF-IDF feature matrix
xgb_params_5 = dict(
    n_estimators=3000,
    subsample=0.5,
    max_depth=10,
    learning_rate=0.2,
    colsample_bytree=0.1,
    nthread=-1,
    objective='binary:logistic',
    random_state=0,
)
x_cfl_5 = XGBClassifier(**xgb_params_5)

# Logloss is tracked on the training and cross-validation splits while fitting;
# training stops after 30 rounds without improvement
eval_set = [(X_tr_5, y_train), (X_cv_5, y_cv)]
x_cfl_5.fit(
    X_tr_5,
    y_train,
    eval_set=eval_set,
    eval_metric='logloss',
    verbose=0,
    early_stopping_rounds=30,
)

# Sigmoid calibration wrapped around the classifier, fitted on the training data.
# NOTE(review): with the default `cv`, CalibratedClassifierCV refits clones of
# x_cfl_5 on folds of X_tr_5 without the early-stopping arguments used above -
# confirm this is intended.
x_sig_clf_5 = CalibratedClassifierCV(x_cfl_5, method="sigmoid").fit(X_tr_5, y_train)

# Report metrics (model_metrics is not defined in this part of the notebook;
# presumably it is provided elsewhere)
model_metrics(x_sig_clf_5, X_tr_5, X_te_5, X_cv_5)

# Persist the calibrated model for the ensemble step
joblib.dump(x_sig_clf_5, '../BDS_project/ngram_tfidf_model.pkl')

## Word2Vec + XGBoost Classifier

In [None]:
# Word2Vec implementation from gensim
from gensim.models import Word2Vec

# Whitespace-tokenize every cleaned review in the full dataset (with a progress bar)
# NOTE(review): this iterates over `data`, i.e. all splits including test and CV
# reviews - confirm that training the embeddings on held-out text is acceptable.
sentences = [review.split() for review in tqdm(data['cleaned_review'])]

# 300-dimensional embeddings; min_count=1 keeps every token that occurs at least once
# NOTE(review): no seed is passed and 12 workers are used, so repeated runs may not
# produce identical vectors - confirm whether reproducibility matters here.
model = Word2Vec(sentences, vector_size=300, workers=12, min_count=1)

# Show the model summary
print(model)

# Round-trip through disk: write the model, then read it back into `model`
model.save('word2vec.bin')
model = Word2Vec.load('word2vec.bin')

# Define a function to create word2vec vectors for the reviews
def create_w2v(df, feature):
    """Build one summed word-vector per row of ``df[feature]``.

    Each text is split on whitespace; the vectors of all tokens known to the
    notebook-level Word2Vec ``model`` are added up (a sum, not a mean). Tokens
    missing from the vocabulary are skipped, so a text without any known token
    yields an all-zero row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    feature : str
        Name of the column with whitespace-separated text.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), model.wv.vector_size)``.
    """
    keyed_vectors = model.wv
    # Take the width from the model instead of hard-coding 300, so the function
    # keeps working if the embedding size is changed where the model is trained.
    dim = keyed_vectors.vector_size

    w2v_vector = []
    for review in df[feature]:
        vector = np.zeros(dim)
        for word in review.split():
            if word in keyed_vectors.key_to_index:
                vector += keyed_vectors[word]
        w2v_vector.append(vector)

    # reshape keeps the result 2-D even for an empty frame, so it can still be
    # concatenated column-wise with the other feature blocks
    return np.array(w2v_vector).reshape(-1, dim)

# Summed word vectors of the 'cleaned_review' text, one array per split
X_train_review_w2v, X_test_review_w2v, X_cv_review_w2v = (
    create_w2v(frame, 'cleaned_review') for frame in (X_train, X_test, X_cv)
)

# Per split (dense): normalized numerics | sentiment scores | condition | year | word2vec
X_tr_6 = np.concatenate(
    [X_train_num_2, X_train_sent_score, X_train_condition, X_train_year, X_train_review_w2v],
    axis=1,
)
X_te_6 = np.concatenate(
    [X_test_num_2, X_test_sent_score, X_test_condition, X_test_year, X_test_review_w2v],
    axis=1,
)
X_cv_6 = np.concatenate(
    [X_cv_num_2, X_cv_sent_score, X_cv_condition, X_cv_year, X_cv_review_w2v],
    axis=1,
)

In [None]:
# Hyperparameters of the XGBoost model trained on the word2vec feature matrix
xgb_params_6 = dict(
    n_estimators=3000,
    subsample=1,
    max_depth=7,
    learning_rate=0.05,
    colsample_bytree=1,
    nthread=-1,
    objective='binary:logistic',
    random_state=0,
)
x_cfl_6 = XGBClassifier(**xgb_params_6)

# Logloss is tracked on the training and cross-validation splits while fitting;
# training stops after 30 rounds without improvement
eval_set = [(X_tr_6, y_train), (X_cv_6, y_cv)]
x_cfl_6.fit(
    X_tr_6,
    y_train,
    eval_set=eval_set,
    eval_metric='logloss',
    verbose=0,
    early_stopping_rounds=30,
)

# Sigmoid calibration wrapped around the classifier, fitted on the training data.
# NOTE(review): with the default `cv`, CalibratedClassifierCV refits clones of
# x_cfl_6 on folds of X_tr_6 without the early-stopping arguments used above -
# confirm this is intended.
x_sig_clf_6 = CalibratedClassifierCV(x_cfl_6, method="sigmoid").fit(X_tr_6, y_train)

# Report metrics (model_metrics is not defined in this part of the notebook;
# presumably it is provided elsewhere)
model_metrics(x_sig_clf_6, X_tr_6, X_te_6, X_cv_6)

# Persist the calibrated model for the ensemble step
joblib.dump(x_sig_clf_6, '../BDS_project/W2V Model.pkl')

## Ensemble learning approach for recommendation system

In [None]:
# Loading the models
model_1 = joblib.load('../BDS_project/Bow_model.pkl')
model_2 = joblib.load('../BDS_project/tfidf_model.pkl')
model_3 = joblib.load('../BDS_project/ngram_bow_model.pkl')
model_4 = joblib.load('../BDS_project/ngram_tfidf_model.pkl')
model_5 = joblib.load('../BDS_project/W2V Model.pkl')

# Predicting using the models and adding the predictions as new columns in the test data.
# Each model must be given the test matrix built with the SAME features it was trained on:
#   Bow_model         <- X_tr_2  -> predict on X_te_2
#   tfidf_model       <- X_tr_3  -> predict on X_te_3
#   ngram_bow_model   <- X_tr_4  -> predict on X_te_4
#   ngram_tfidf_model <- X_tr_5  -> predict on X_te_5
#   W2V Model         <- X_tr_6  -> predict on X_te_6
# (The previous pairing was shifted by one and fed X_te_1, which no saved model was
# trained on, to the BoW model.)
X_test['model1'] = model_1.predict(X_te_2)
X_test['model2'] = model_2.predict(X_te_3)
X_test['model3'] = model_3.predict(X_te_4)
X_test['model4'] = model_4.predict(X_te_5)
X_test['model5'] = model_5.predict(X_te_6)

def adjust_column(data, feature):
    """Min-max scale ``data[feature]`` to the range [0, 1], in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column; it is modified in place.
    feature : str
        Name of the numeric column to rescale.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``feature`` replaced by its scaled values.
        If every value in the column is identical the column becomes 0.0 rather
        than NaN (the unguarded formula would divide zero by zero).
    """
    min_value = data[feature].min()
    value_range = data[feature].max() - min_value
    if value_range == 0:
        # Constant column: value - min is 0 for every row; missing values stay missing
        data[feature] = (data[feature] - min_value).astype(float)
    else:
        data[feature] = (data[feature] - min_value) / value_range
    return data

# Rescale 'usefulCount' of the test split to [0, 1] (min/max taken from the test split itself)
X_test = adjust_column(X_test, 'usefulCount')

# Add up the five model predictions for each review ...
prediction_total = (
    X_test['model1']
    + X_test['model2']
    + X_test['model3']
    + X_test['model4']
    + X_test['model5']
)

# ... and weight that total by the scaled usefulness count to get the recommendation score
X_test['rec_score'] = prediction_total * X_test['usefulCount']

# Write the test split, including prediction and score columns, to CSV without the index
X_test.to_csv('../BDS_project/validation_data.csv', index=False)