<a href="https://colab.research.google.com/github/Roxxxers/TSIN01/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; the review CSV files are read from below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


#https://www.kaggle.com/arhamrumi/amazon-product-reviews
#https://www.kaggle.com/skathirmani/amazon-reviews

# 1.0 Import the reviews and modules
Here we import the modules and files that we are going to use. We also print a heatmap to see whether there are any null values in the dataframe.



In [None]:
import pandas as pd
import csv

#--------------------------------------TRAIN--------------------------------------#

# Read only the rating and review-body columns, rename them to the "Score"/"Text"
# names used by the test set, and cast the text to unicode strings.
train_csv_path = "/content/drive/MyDrive/TDDE16 PROJEKT/reviews/amazon/amazon_reviews_big.csv"
Train_Reviews = (
    pd.read_csv(train_csv_path, usecols=["overall", "reviewText"])
    .rename(columns={"reviewText": "Text", "overall": "Score"})
    .assign(Text=lambda frame: frame["Text"].astype("U").values)
)

#--------------------------------------TEST--------------------------------------#

# Missing scores are replaced by 0 before the integer cast; the text is cast to
# unicode strings in the same way as for the training set.
test_csv_path = "/content/drive/MyDrive/TDDE16 PROJEKT/reviews/amazon/Reviews.csv"
test_reviews = (
    pd.read_csv(test_csv_path, usecols=["Text", "Score"])
    .assign(
        Score=lambda frame: frame["Score"].fillna(0).astype(int),
        Text=lambda frame: frame["Text"].astype("U").values,
    )
)

#--------------------------------------------------------------------------------#

test_reviews


In [None]:
from sklearn.metrics import confusion_matrix
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np

def print_CM(Test,pred):
  """Plot the confusion matrix of `pred` against the true labels `Test` as a heatmap.

  Args:
    Test: the true labels.
    pred: the predicted labels, in the same order as `Test`.
  """
  # confusion_matrix takes (y_true, y_pred): rows are the true labels and columns
  # the predictions, which is what the axis labels below claim. The arguments
  # were previously swapped, which transposed the matrix relative to its labels.
  cm = confusion_matrix(Test, pred)
  plt.figure(figsize=(7,7))
  # fmt="d" prints the integer counts in full instead of the default '.2g'
  # format, which abbreviates large counts to scientific notation.
  sn.heatmap(cm, annot=True, fmt="d")
  plt.xlabel("Predicted")
  plt.ylabel("Truth")


# 1.1 Preprocessing

In this section we preprocess the dataframe and remove the noise from the reviews, to see how the noise affects the F1-score of the classifiers.

In [None]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer


# The tagger, parser and named-entity recognizer are disabled since they are not needed
# for this problem; disabling them speeds up processing.
# The entity recognizer's component name is "ner" ("entity" matches no pipeline
# component, so the recognizer was still being run).
# NOTE(review): on spaCy v3 the rule-based lemmatizer relies on part-of-speech tags;
# confirm that lemma quality is acceptable with "tagger" disabled.
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

#Will return words in its lemma form as long as they are not stop words or non-alphabetic.
def preprocess(text):
    """Run `text` through the spaCy pipeline and return the lemmas worth keeping.

    A token contributes its lemma when it is not flagged as a stop word and the
    lemma consists of alphabetic characters only.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        lemma = tok.lemma_
        if lemma.isalpha():
            kept_lemmas.append(lemma)
    return kept_lemmas

def df_preprocess(df):
  """Return a copy of `df` whose "Text" column holds the preprocessed review text.

  Each review is passed through `preprocess`, and the kept lemmas are joined
  with single spaces and lower-cased. All other columns (e.g. "Score") are
  carried over unchanged, and the input frame is not modified.

  Args:
    df: DataFrame with a "Text" column of strings.

  Returns:
    A new DataFrame with the same index and columns as `df`.
  """
  # The previous loop called df.append(...) and discarded the result. append
  # never modified the frame in place (and was removed in pandas 2.0), so the
  # raw text was returned untouched. Build the cleaned column once instead.
  processed = df.copy()
  processed["Text"] = [" ".join(preprocess(text)).lower() for text in df["Text"]]
  return processed

# Run df_preprocess over both datasets: the test reviews first, then the training reviews.
test = df_preprocess(test_reviews)
training = df_preprocess(Train_Reviews)

# Display the resulting test frame as the cell output.
test

In [None]:
 
#remove last row as it contains a NaN value
# tail(1) selects exactly that one row; a bare tail() selects the last five rows,
# which silently discarded four additional reviews.
test.drop(test.tail(1).index,inplace=True)

copy_test = test.copy(deep=True)
copy_training = training.copy(deep=True)

# Collapse the star ratings into sentiment classes: 1-2 negative, 3 neutral, 4-5 positive.
score_to_sentiment = {1 : "Negative", 2 : "Negative", 3 : "Neutral", 4 : "Positive", 5 : "Positive"}
copy_test = copy_test.replace({"Score" : score_to_sentiment})
copy_training = copy_training.replace({"Score" : score_to_sentiment})

# Neutral reviews are removed so that only the Negative/Positive classes remain.
copy_test.drop(copy_test.loc[copy_test['Score']=="Neutral"].index, inplace=True)
copy_training.drop(copy_training.loc[copy_training['Score']=="Neutral"].index, inplace=True)


copy_test

# 2.0 Baseline
For our testing we need to create a baseline with the different classifiers


2.0.1 Baseline values


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Inputs are the review texts and targets are the sentiment labels, for each split.
descTrain = copy_training['Text']
ResponseTrain = copy_training['Score']
descTest = copy_test['Text']
ResponseTest = copy_test['Score']

# Class names passed as target_names to the classification reports.
Response = ["Negative", "Positive"]

## 2.1 Naive Bayes

In this section we will be conducting different tests with the Naive Bayes classifier. I will be testing the Multinomial and Bernoulli classifiers.


2.1.1 Naive Bayes Multinomial


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Word counts feeding a multinomial Naive Bayes classifier, fitted on the training split.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('MultinomialNB', MultinomialNB()),
])
pipe.fit(descTrain, ResponseTrain)
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest, pred)

2.1.2 Naive Bayes Bernoulli

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Word counts feeding a Bernoulli Naive Bayes classifier, fitted on the training split.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('BernoulliNB', BernoulliNB()),
])
pipe.fit(descTrain, ResponseTrain)
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest, pred)

## 2.2 Random Forest 

In this section we will be conducting different tests with the Random Forest classifier. I will be testing with different numbers of trees: 50, 100 and 200.



2.2.1 Random forest 50 trees

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Word counts feeding a 50-tree random forest, fitted on the training split.
# random_state fixes the forest's internal randomness so that re-running the
# cell reproduces the same scores; without it every run gives different numbers.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=50, random_state=42)),
])
pipe.fit(descTrain, ResponseTrain)
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest,pred)

2.2.2 Random forest 100 trees

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Word counts feeding a 100-tree random forest, fitted on the training split.
# random_state fixes the forest's internal randomness so that re-running the
# cell reproduces the same scores; without it every run gives different numbers.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])
pipe.fit(descTrain, ResponseTrain)
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest,pred)

2.2.3 Random forest 200 trees

In [None]:
# Word counts feeding a 200-tree random forest, fitted on the training split.
# random_state fixes the forest's internal randomness so that re-running the
# cell reproduces the same scores; without it every run gives different numbers.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=200, random_state=42)),
])
pipe.fit(descTrain, ResponseTrain)
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest,pred)

## 2.3 Support Vector Machine


2.3.1 C-Support Vector Classification

In [None]:
from sklearn.svm import SVC

# Word counts feeding a C-support vector classifier with default settings, fitted on the training split.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('SVC', SVC()),
])
pipe.fit(descTrain, ResponseTrain)
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest, pred)

2.3.2 Linear Support Vector Classification

In [None]:
from sklearn.svm import LinearSVC

# Word counts feeding a linear support vector classifier (dual=False), fitted on the training split.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('LinearSVC', LinearSVC(dual=False)),
])
pipe.fit(descTrain, ResponseTrain)
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest, pred)

# 3.0 Balanced data set

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Undersample every sentiment class down to the size of the rarest class so the
# training set is balanced. .min() picks that size explicitly instead of relying
# on value_counts() ordering plus [-1], a positional lookup on a label-indexed
# Series that pandas has deprecated.
minority_sample = ResponseTrain.value_counts().min()

# Keep the first `minority_sample` reviews of each class listed in Response.
resampled = [copy_training[ResponseTrain == val][:minority_sample] for val in Response]

resampled_training = pd.concat(resampled, ignore_index=True)

# No second df_preprocess pass is needed here: these rows come from copy_training,
# which is derived from `training`, the output of df_preprocess.



## 3.1 Naive Bayes

3.1.1 Naive Bayes Multinomial

In [None]:
# Word counts feeding a multinomial Naive Bayes classifier, fitted on the balanced training set.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('MultinomialNB', MultinomialNB()),
])
pipe.fit(resampled_training["Text"], resampled_training["Score"])
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest, pred)

3.1.2 Naive Bayes Bernoulli

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Word counts feeding a Bernoulli Naive Bayes classifier, fitted on the balanced training set.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('BernoulliNB', BernoulliNB()),
])
pipe.fit(resampled_training["Text"], resampled_training["Score"])
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest, pred)

## 3.2 Random Forest

3.2.1 Random Forest 50 trees

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Word counts feeding a 50-tree random forest, fitted on the balanced training set.
# random_state fixes the forest's internal randomness so that re-running the
# cell reproduces the same scores; without it every run gives different numbers.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=50, random_state=42)),
])
pipe.fit(resampled_training["Text"], resampled_training["Score"])
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest,pred)

3.2.2 Random Forest 100 trees

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Word counts feeding a 100-tree random forest, fitted on the balanced training set.
# random_state fixes the forest's internal randomness so that re-running the
# cell reproduces the same scores; without it every run gives different numbers.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])
pipe.fit(resampled_training["Text"], resampled_training["Score"])
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest,pred)

3.2.3 Random Forest 200 trees

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Word counts feeding a 200-tree random forest, fitted on the balanced training set.
# random_state fixes the forest's internal randomness so that re-running the
# cell reproduces the same scores; without it every run gives different numbers.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=200, random_state=42)),
])
pipe.fit(resampled_training["Text"], resampled_training["Score"])
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest,pred)

## 3.3 Support Vector Machine

3.3.1 C-Support Vector Classification

In [None]:
# Word counts feeding a C-support vector classifier with default settings, fitted on the balanced training set.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('SVC', SVC()),
])
pipe.fit(resampled_training["Text"], resampled_training["Score"])
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest, pred)

3.3.2 Linear Support Vector Classification

In [None]:
# Word counts feeding a linear support vector classifier (dual=False), fitted on the balanced training set.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('LinearSVC', LinearSVC(dual=False)),
])
pipe.fit(resampled_training["Text"], resampled_training["Score"])
pred = pipe.predict(descTest)

# Report the per-class metrics on the test split, then plot the confusion matrix.
print(classification_report(ResponseTest, pred, target_names=Response))
print_CM(ResponseTest, pred)