# Task 2: Word2Vec - Average Sentence Embedding Approach to Sentence Classification

In [None]:
import pandas as pd
import numpy as np
import pickle, csv
import project2Lib

## for plotting data distribution
import matplotlib.pyplot as plt
import seaborn as sns

#for text pre-processing
import re, string
import nltk
from gensim.models import Word2Vec
import gensim.models

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# For performance metrics
from sklearn.metrics import classification_report, f1_score, accuracy_score, plot_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay


from tensorflow.keras import models, layers, preprocessing, Sequential
from tensorflow.keras import backend as K
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint


# Seed NumPy's global random number generator with a fixed value so that
# code drawing from it behaves the same from one run to the next.
np.random.seed(1)


## Loading preprocessed data

### Choosing one of the preprocessing options

In [None]:
# Preprocessing variant to work with. Each mode maps to the suffix that is
# embedded in the pickle file names under PreprocessedData/.
mode = 1

_SUFFIX_BY_MODE = {
    0: "lemmatization_noph",
    1: "lemmatization",
    2: "_noph",
    3: "_",
    4: "stemming_noph",
    5: "stemming",
}

# A mode that is not in the table falls back to the empty suffix.
suffix = _SUFFIX_BY_MODE.get(mode, "")

In [None]:
def _load_split(split):
    """Unpickle the preprocessed *split* for the currently selected ``suffix``."""
    return pd.read_pickle(f"PreprocessedData/{split}_{suffix}_w2v.pkl")


# Load the three preprocessed splits.
train_data = _load_split("train")
dev_data = _load_split("dev")
test_data = _load_split("test")

## To prepare data:

The `X_*_vec_line` arrays prepend the relative line number (`line_relative`) as an extra first column, for models that use it as an auxiliary input.

In [None]:
def _prepend_line_feature(frame, vectors):
    """Return *vectors* with ``frame["line_relative"]`` inserted as column 0."""
    line_column = np.reshape(frame["line_relative"].values, (-1, 1))
    return np.concatenate((line_column, vectors), axis=1)


# For every split: the stacked "avg_vectors" entries, the same matrix with
# the relative line number prepended, and the labels.

# --- train ---
X_train_vec = np.stack(train_data["avg_vectors"].values)
X_train_vec_line = _prepend_line_feature(train_data, X_train_vec)
Y_train = train_data["label"].values

# --- dev ---
X_dev_vec = np.stack(dev_data["avg_vectors"].values)
X_dev_vec_line = _prepend_line_feature(dev_data, X_dev_vec)
Y_dev = dev_data["label"].values

# --- test ---
X_test_vec = np.stack(test_data["avg_vectors"].values)
X_test_vec_line = _prepend_line_feature(test_data, X_test_vec)
Y_test = test_data["label"].values

# Classifiers Based on Average Sentence Vectors

## Logistic Regression

### Without Line Numbers

In [None]:
# Logistic regression fitted on the averaged sentence vectors only
# (no line-number feature).
logisticRegr1 = LogisticRegression(n_jobs=-1, verbose=1)
logisticRegr1.fit(X=X_train_vec, y=Y_train)

In [None]:
# Persist the fitted model (trained without the line-number feature).
save_name = f'./TrainedModels/logreg_w2v_noline_{suffix}.sav'
# The context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open(save_name, 'wb') as model_file:
    pickle.dump(logisticRegr1, model_file)

# Evaluate on the test split.
Y_pred = logisticRegr1.predict(X_test_vec)

print("Accuracy: " ,accuracy_score(Y_test, Y_pred))
print("F1 Score: " ,f1_score(Y_test, Y_pred, average='weighted') )
# Confusion matrix normalised over the true labels (rows).
cm = confusion_matrix(Y_test, Y_pred, normalize = "true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

### With Line Numbers

In [None]:
# Logistic regression fitted on the sentence vectors with the relative line
# number prepended as an extra feature column.
logisticRegr2 = LogisticRegression(n_jobs=-1, verbose=1)
logisticRegr2.fit(X=X_train_vec_line, y=Y_train)

In [None]:
# Persist the model that was trained WITH the line-number feature.
# This must be logisticRegr2: saving logisticRegr1 here would store the
# no-line model under the "line" file name.
save_name = f'./TrainedModels/logreg_w2v_line_{suffix}.sav'
# The context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open(save_name, 'wb') as model_file:
    pickle.dump(logisticRegr2, model_file)

# Evaluate on the test split (features include the line-number column).
Y_pred = logisticRegr2.predict(X_test_vec_line)

print("Accuracy: " ,accuracy_score(Y_test, Y_pred))
print("F1 Score: " ,f1_score(Y_test, Y_pred, average='weighted') )
# Confusion matrix normalised over the true labels (rows).
cm = confusion_matrix(Y_test, Y_pred, normalize = "true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

## Random Forest

### Without Line Numbers

In [None]:
# Random forest fitted on the averaged sentence vectors only (no line-number
# feature). n_jobs=-1 matches the other estimators in this notebook instead
# of hard-coding a worker count.
rf = RandomForestClassifier(verbose=1, n_jobs=-1)
rf_model = rf.fit(X_train_vec, Y_train)

In [None]:
# Score the random forest (no line-number feature) on the test split.
Y_pred = rf_model.predict(X_test_vec)

_accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy: ", _accuracy)
_weighted_f1 = f1_score(y_true=Y_test, y_pred=Y_pred, average="weighted")
print("F1 Score: ", _weighted_f1)

# Confusion matrix normalised over the true labels (rows).
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred, normalize="true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

### With Line Numbers

In [None]:
# Random forest fitted on the sentence vectors with the relative line number
# prepended as an extra feature column.
rf = RandomForestClassifier(n_jobs=-1, verbose=1)
rf_model2 = rf.fit(X=X_train_vec_line, y=Y_train)

In [None]:
# Score the random forest (with line-number feature) on the test split.
Y_pred = rf_model2.predict(X_test_vec_line)

_accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy: ", _accuracy)
_weighted_f1 = f1_score(y_true=Y_test, y_pred=Y_pred, average="weighted")
print("F1 Score: ", _weighted_f1)

# Confusion matrix normalised over the true labels (rows).
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred, normalize="true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

## XGBoost

### Without Line Numbers

In [None]:
# XGBoost classifier with default settings, fitted on the averaged sentence
# vectors only (no line-number feature).
xg_class = xgb.XGBClassifier()
xg_model = xg_class.fit(X=X_train_vec, y=Y_train)

In [None]:
# Score the XGBoost model (no line-number feature) on the test split.
Y_pred = xg_model.predict(X_test_vec)

_accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy: ", _accuracy)
_weighted_f1 = f1_score(y_true=Y_test, y_pred=Y_pred, average="weighted")
print("F1 Score: ", _weighted_f1)

# Confusion matrix normalised over the true labels (rows).
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred, normalize="true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

### With Line Numbers

In [None]:
# XGBoost classifier with default settings, fitted on the sentence vectors
# with the relative line number prepended as an extra feature column.
# NOTE: this rebinds xg_class / xg_model, so the model trained without the
# line-number feature is no longer reachable under these names.
xg_class = xgb.XGBClassifier()
xg_model = xg_class.fit(X=X_train_vec_line, y=Y_train)

In [None]:
# Score the XGBoost model (with line-number feature) on the test split.
Y_pred = xg_model.predict(X_test_vec_line)

_accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy: ", _accuracy)
_weighted_f1 = f1_score(y_true=Y_test, y_pred=Y_pred, average="weighted")
print("F1 Score: ", _weighted_f1)

# Confusion matrix normalised over the true labels (rows).
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred, normalize="true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

# Classification Using Small Dataset 

In [None]:
def _load_small_split(split):
    """Unpickle the small preprocessed *split* for the selected ``suffix``."""
    return pd.read_pickle(f"PreprocessedData/{split}_{suffix}_w2v_small.pkl")


# Load the three splits of the small dataset.
train_data_small = _load_small_split("train")
dev_data_small = _load_small_split("dev")
test_data_small = _load_small_split("test")

In [None]:
# Preview the first three entries of the small training split.
train_data_small[:3]

In [None]:
# Feature matrices: the stacked "avg_vectors" entries of each small split.
# NOTE: the line_relative-augmented variants (X_*_vec_line_small) are not
# built for the small dataset; only the plain vectors are used.
X_train_vec_small = np.stack(train_data_small["avg_vectors"].values)
X_dev_vec_small = np.stack(dev_data_small["avg_vectors"].values)
X_test_vec_small = np.stack(test_data_small["avg_vectors"].values)

# Matching label arrays.
Y_train_small = train_data_small["label"].values
Y_dev_small = dev_data_small["label"].values
Y_test_small = test_data_small["label"].values

In [None]:
# Logistic regression fitted on the small training split (averaged sentence
# vectors only, no line-number feature).
logisticRegr1_small = LogisticRegression(n_jobs=-1, verbose=1)
logisticRegr1_small.fit(X=X_train_vec_small, y=Y_train_small)

In [None]:
# Persist the logistic regression fitted on the small dataset.
save_name = f'./TrainedModels/logreg_w2v_noline_{suffix}_small.sav'
# The context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open(save_name, 'wb') as model_file:
    pickle.dump(logisticRegr1_small, model_file)

# Evaluate on the small test split.
Y_pred_small = logisticRegr1_small.predict(X_test_vec_small)

print("Accuracy: " ,accuracy_score(Y_test_small, Y_pred_small))
print("F1 Score: " ,f1_score(Y_test_small, Y_pred_small, average='weighted') )
# Confusion matrix normalised over the true labels (rows).
cm = confusion_matrix(Y_test_small, Y_pred_small, normalize = "true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()