In [206]:
import pandas as pd
import numpy as np
import re 
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer

In [207]:
trainDataframe = pd.read_csv("train.csv")
testDataframe = pd.read_csv("test.csv")

# X_train, Y_train extraction

In [208]:
df = trainDataframe
df = df["review"]
#Preprocess the data. Remove the regular expression term from each example and replace it.
for i, row in enumerate(df):
    trainDataframe.iloc[i , 0] = re.sub("<br />", " ", row)

In [209]:
#Use label encoder to change the reviews to numeric values
labeler = preprocessing.LabelEncoder()
labeler.fit(trainDataframe["sentiment"])
labeled_column = labeler.transform(trainDataframe["sentiment"].to_numpy())
trainDataframe["sentiment"] = labeled_column
y_trainTextdf = labeled_column

In [210]:
vectorizer = CountVectorizer(binary=True)
vectorizer.fit(trainDataframe["review"]) 
X_train = vectorizer.transform(trainDataframe["review"])
feature_names_test = vectorizer.get_feature_names_out()
X_trainTextdf = pd.DataFrame.sparse.from_spmatrix(X_train, columns=feature_names_test)

In [211]:
X_trainTextdf.shape

(2000, 25135)

In [212]:
y_trainTextdf.shape

(2000,)

In [213]:
X_trainTextdf.head()

Unnamed: 0,00,000,007,00am,01pm,02,04,06,07,08,...,zwick,zzzzzzzzzzzzzzzzzz,álvaro,ángel,æon,élan,être,ís,ísnt,île
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# X_test and Y_test extraction: Use the vocabulary of the train set

In [214]:
df = testDataframe
df = df["review"]

for i, row in enumerate(df):
    testDataframe.iloc[i , 0] = re.sub("<br />", " ", row)

In [215]:
labeler.fit(testDataframe["sentiment"])
labeled_column = labeler.transform(testDataframe["sentiment"].to_numpy())
testDataframe["sentiment"] = labeled_column
y_testTextdf = labeled_column

X_test = vectorizer.transform(testDataframe["review"])
feature_names_test = vectorizer.get_feature_names_out()
X_testTextdf = pd.DataFrame.sparse.from_spmatrix(X_test, columns=feature_names_test)


In [216]:
X_testTextdf.head(10)

Unnamed: 0,00,000,007,00am,01pm,02,04,06,07,08,...,zwick,zzzzzzzzzzzzzzzzzz,álvaro,ángel,æon,élan,être,ís,ísnt,île
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Calculate information gain for each feature wrt "sentiment"

In [217]:
from sklearn.feature_selection import mutual_info_classif

In [203]:
informationGain = mutual_info_classif(X_trainTextdf, trainDataframe["sentiment"], random_state = 8)

In [204]:
column_names = X_trainTextdf.columns
gainDf = pd.DataFrame(columns = column_names)
gainDf.loc["gain"] = informationGain

In [205]:
gainDf.head()

Unnamed: 0,00,000,007,00am,01pm,02,04,06,07,08,...,zwick,zzzzzzzzzzzzzzzzzz,álvaro,ángel,æon,élan,être,ís,ísnt,île
gain,2.505041e-08,0.000216,0.000689,0.000344,0.000349,0.000349,0.000349,0.000344,0.000344,0.000344,...,0.000344,0.000349,0.000344,0.000344,0.000349,0.000349,0.000349,0.000344,0.000349,0.000349


# Sort the information Gain columns based on gain value.

In [234]:
sorted_gainDf = gainDf.sort_values(by = 'gain', axis = 1, kind = 'stable')
sorted_gainDf.head()

Unnamed: 0,seen,100th,105,1794,1800,1800s,1840,1914,1934,1963,...,no,boring,stupid,terrible,great,excellent,awful,waste,worst,bad
gain,7.534844e-09,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,...,0.015746,0.015948,0.016204,0.016234,0.020785,0.02297,0.02389,0.025341,0.035207,0.043831


In [156]:
leastImportant = sorted_gainDf.columns[0:10]
mostImportant = sorted_gainDf.columns[-11:]

In [157]:
print(leastImportant)

Index(['seen', '100th', '105', '1794', '1800', '1800s', '1840', '1914', '1934',
       '1963'],
      dtype='object')


In [158]:
print(mostImportant)

Index(['wonderful', 'no', 'boring', 'stupid', 'terrible', 'great', 'excellent',
       'awful', 'waste', 'worst', 'bad'],
      dtype='object')


# First Trial without feature selection

In [253]:
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score, recall_score, precision_score

In [254]:
X_train = X_trainTextdf.to_numpy()
y_train = y_trainTextdf
clf = tree.DecisionTreeClassifier(criterion = "entropy", random_state = 8)
clf = clf.fit(X_train, y_train)

In [255]:
X_train.shape

(2000, 25135)

In [256]:
X_test = X_testTextdf.to_numpy()
y_test = y_testTextdf

In [257]:
preds = clf.predict(X_test)



In [258]:
preds.shape

(500,)

In [259]:
confusion_matrix(preds, y_test)

array([[151,  82],
       [ 88, 179]])

In [260]:
accuracy_score(preds, y_test)

0.66

In [261]:
f1_score(preds, y_test)

0.678030303030303

In [244]:
precision_score(preds, y_test)


0.685823754789272

In [245]:
recall_score(preds, y_test)

0.6704119850187266

# Second trial: With feature selection best 90 percent 

In [229]:
columns_needed = -1 *sorted_gainDf.shape[1] * 0.9
mostImportantColumns = sorted_gainDf.columns[int(columns_needed):] 

In [230]:
bestGainDf = gainDf.loc[:, mostImportantColumns]

In [231]:
bestGainDf.shape

(1, 22621)

# Second Trial 

In [232]:
X_train = X_trainTextdf[bestGainDf.columns]
y_train = y_trainTextdf
clf = tree.DecisionTreeClassifier(criterion = "entropy", random_state = 8)
clf = clf.fit(X_train, y_train)

In [174]:
X_test = X_testTextdf[bestGainDf.columns]
preds = clf.predict(X_test)

In [175]:
confusion_matrix(preds, y_testTextdf)

array([[151,  84],
       [ 88, 177]])

In [176]:
accuracy_score(preds, y_testTextdf)

0.656

In [177]:
f1_score(preds, y_testTextdf)

0.6730038022813687

In [178]:
recall_score(preds, y_testTextdf)

0.6679245283018868

In [179]:
precision_score(preds, y_testTextdf)

0.6781609195402298

# Third trial: Remove columns that have 0 gain then try different percentages for removal of data

In [262]:
df = sorted_gainDf.loc[:, (sorted_gainDf > 0).any(axis=0)]

In [263]:
df.head()

Unnamed: 0,seen,100th,105,1794,1800,1800s,1840,1914,1934,1963,...,no,boring,stupid,terrible,great,excellent,awful,waste,worst,bad
gain,7.534844e-09,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,1.251267e-08,...,0.015746,0.015948,0.016204,0.016234,0.020785,0.02297,0.02389,0.025341,0.035207,0.043831


In [264]:
percentage_selected = 0.4
columns_needed = -1 *df.shape[1] * percentage_selected
mostImportantColumns = df.columns[int(columns_needed):]
bestGainDf = df.loc[:, mostImportantColumns]

In [183]:
bestGainDf.head()

Unnamed: 0,sabotage,sabrina,sacha,sacks,sacrifices,sadden,saddle,sadhu,sadist,sagacious,...,no,boring,stupid,terrible,great,excellent,awful,waste,worst,bad
gain,0.000344,0.000344,0.000344,0.000344,0.000344,0.000344,0.000344,0.000344,0.000344,0.000344,...,0.015746,0.015948,0.016204,0.016234,0.020785,0.02297,0.02389,0.025341,0.035207,0.043831


# Third Trial 


In [246]:
X_train = X_trainTextdf[bestGainDf.columns]
y_train = y_trainTextdf
clf = tree.DecisionTreeClassifier(criterion = "entropy", random_state = 8)
clf = clf.fit(X_train, y_train)

In [247]:
X_test = X_testTextdf[bestGainDf.columns]
preds = clf.predict(X_test)

In [248]:
confusion_matrix(preds, y_testTextdf)

array([[151,  84],
       [ 88, 177]])

In [249]:
accuracy_score(preds, y_testTextdf)

0.656

In [250]:
f1_score(preds, y_testTextdf)

0.6730038022813687

In [251]:
recall_score(preds, y_testTextdf)

0.6679245283018868

In [252]:
precision_score(preds, y_testTextdf)

0.6781609195402298