In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [2]:
# Read the labelled review corpus from the working directory and preview
# the first rows to confirm the `label` / `review` columns loaded.
df = pd.read_csv("merged_dataset.csv")
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [3]:
# Size of the selected dataset, shown as a (rows, columns) tuple
df.shape

(20000, 2)

In [4]:
# Pre-processing the data

In [5]:
# --- Cleaning -------------------------------------------------------------
# Show how many values are missing per column before dropping them. The bare
# expression used previously was not the last line of the cell, so its result
# was silently discarded.
print(df.isnull().sum())

# Drop rows with any missing value. `df` is rebound rather than mutated in
# place so the cell gives the same result when re-run.
df = df.dropna()

# Flag reviews that are empty or contain only whitespace. Looking the column
# up by name (instead of unpacking itertuples) keeps this working if the frame
# carries extra columns, and `strip() == ''` also catches the empty string,
# which `str.isspace()` reports as False.
is_blank = df['review'].map(
    lambda rv: isinstance(rv, str) and rv.strip() == ''
).astype(bool)
df = df.loc[~is_blank]

# --- Train / test split -----------------------------------------------------
# Features are the raw review text, targets are the sentiment labels.
X = df['review']
y = df['label']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Model 1 :- Logistic Regression

In [6]:
# Train Model 1: TF-IDF features fed into a logistic-regression classifier.
# `LogisticRegression` is imported in the notebook's import cell together with
# the other scikit-learn names, rather than inside this analysis cell.
# NOTE(review): `lowercase=False` is set here while the Linear SVC pipeline
# uses TfidfVectorizer's default setting -- confirm the difference is
# intentional, since it means the two models are not compared on identical
# features.
lr_model = Pipeline([
    ('tfidf', TfidfVectorizer(lowercase=False)),
    ('clf', LogisticRegression(solver='lbfgs')),
])
lr_model.fit(X_train, y_train)

# Score the held-out split and collect per-class metrics as a dict.
predictions = lr_model.predict(X_test)
report = classification_report(y_test, predictions, output_dict=True)

# One row per class / average, rounded to two decimals for display.
df_report = pd.DataFrame(report).transpose().round(2)

# Render the table with a green gradient (darker = higher value).
cm = sns.light_palette("green", as_cmap=True)
df_report.style.background_gradient(cmap=cm)

Unnamed: 0,precision,recall,f1-score,support
neg,0.87,0.89,0.88,3316.0
pos,0.88,0.87,0.88,3284.0
accuracy,0.88,0.88,0.88,0.88
macro avg,0.88,0.88,0.88,6600.0
weighted avg,0.88,0.88,0.88,6600.0


# Model 2 :- Linear SVC

In [7]:
# Train Model 2: TF-IDF features fed into a linear support vector classifier.
my_model = Pipeline(
    [
        ('tfidf', TfidfVectorizer()),
        ('classifier', LinearSVC()),
    ]
)
my_model.fit(X_train, y_train)

# Predict on the held-out reviews and gather the metrics as a dict.
predictions = my_model.predict(X_test)
report = classification_report(y_test, predictions, output_dict=True)

# Rows = classes and averages, columns = metrics, rounded to two decimals.
df_report = pd.DataFrame(report).T.round(2)

# Display the table shaded with a light-green colour map.
cm = sns.light_palette("green", as_cmap=True)
df_report.style.background_gradient(cmap=cm)

Unnamed: 0,precision,recall,f1-score,support
neg,0.92,0.92,0.92,3316.0
pos,0.92,0.92,0.92,3284.0
accuracy,0.92,0.92,0.92,0.92
macro avg,0.92,0.92,0.92,6600.0
weighted avg,0.92,0.92,0.92,6600.0


# Model 3 :- Vader's Algorithm

In [8]:
# Load the labeled dataset (tab-separated, with `review` and `label` columns).
# NOTE(review): this is a different file from 'merged_dataset.csv' used for
# Models 1 and 2, so the VADER scores below are not measured on the same test
# split -- confirm this is intended before comparing the three models.
data = pd.read_csv('amazonreviews.tsv', sep='\t')

# Rows without a review or a label cannot be scored or evaluated; drop them so
# missing values never reach the analyzer.
data = data.dropna(subset=['review', 'label'])

# Initialize the Vader sentiment intensity analyzer.
# NOTE(review): presumably requires the NLTK `vader_lexicon` resource to be
# available in the environment -- confirm setup for a fresh kernel.
vader = SentimentIntensityAnalyzer()

# Ground-truth labels, in row order.
true_labels = data['label'].tolist()

# Compound polarity score for every review, computed column-wise instead of
# iterating with `iterrows()` and appending to lists one row at a time.
compound_scores = data['review'].map(
    lambda text: vader.polarity_scores(text)['compound']
)

# A non-negative compound score is treated as positive, so exactly-neutral
# reviews (score 0) are labelled 'pos'.
predicted_labels = ['pos' if score >= 0 else 'neg' for score in compound_scores]




In [9]:
# Compare VADER's predictions with the true labels and collect the metrics.
report = classification_report(
    true_labels,
    predicted_labels,
    output_dict=True,
)

# Rows = classes and averages, columns = metrics, rounded to two decimals.
df_report = pd.DataFrame(report).T.round(2)

# Display the table shaded with a light-green colour map.
cm = sns.light_palette("green", as_cmap=True)
df_report.style.background_gradient(cmap=cm)

Unnamed: 0,precision,recall,f1-score,support
neg,0.86,0.52,0.64,5097.0
pos,0.64,0.91,0.75,4903.0
accuracy,0.71,0.71,0.71,0.71
macro avg,0.75,0.71,0.7,10000.0
weighted avg,0.75,0.71,0.7,10000.0


# Analysis of the models 

1.	Logistic Regression Model  

Logistic Regression is a statistical model that is commonly used for binary classification tasks. It is based on the concept of the logistic function, which maps input variables to a probability range between 0 and 1. In sentiment analysis, a logistic regression model can be trained on labeled data, where the input is the text or features of the text, and the output is the sentiment label (positive or negative). The model learns the relationships between the input features and the sentiment labels and makes predictions based on those relationships. 

Evaluation Metrics: To evaluate the performance of a logistic regression model for sentiment analysis, common evaluation metrics include accuracy, precision, recall, and F1 score. These metrics provide insights into the model's ability to correctly classify positive and negative sentiments. 

Accuracy: 88%  Precision: 88%  Recall: 88%  F1-score: 88% (macro averages on the 6,600-review test split, as shown in the report above)

2.	Linear SVC (Support Vector Classifier)

Linear SVC is a variant of the Support Vector Machine (SVM) algorithm that is commonly used for binary classification tasks. SVMs aim to find a hyperplane that separates the data points of different classes with the largest margin. Linear SVC is particularly suitable for linearly separable data, where a hyperplane (a straight line in two dimensions) can effectively separate the classes.

Evaluation Metrics: Similar to logistic regression, evaluation metrics such as accuracy, precision, recall, and F1 score can be used to assess the performance of a Linear SVC model for sentiment analysis. 

Accuracy: 92%  Precision: 92%  Recall: 92%  F1-score: 92% (macro averages on the 6,600-review test split, as shown in the report above)

3. Vader's Model (VADER Sentiment Intensity Analyzer)

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a pre-trained rule-based model specifically designed for sentiment analysis of social media text. It utilizes a combination of lexical and grammatical heuristics to determine sentiment intensity. The model is specifically tuned to handle social media language nuances, including slang, emoticons, and capitalization.

Evaluation Metrics: Evaluation metrics such as accuracy, precision, recall, and F1 score can still be used to assess the performance of the VADER model for sentiment analysis. However, it's important to note that rule-based models like VADER may have different strengths and weaknesses compared to supervised machine learning models. 

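Accuracy: 71%  Precision: 75%  Recall: 71%  F1-score: 70% (macro averages; for the positive class alone: Precision: 64%, Recall: 91%, F1-score: 75%). Note that VADER was evaluated on the 10,000 reviews in `amazonreviews.tsv`, not on the test split used for the two models above, so these figures are not directly comparable.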

## Recommendation

The choice of the model depends on several factors including the specific requirements of the project, the nature of the data, and the desired trade-off between accuracy and computational efficiency. Considering the evaluation reports and computation time, I recommend evaluating the Logistic Regression and Linear SVC models on your dataset to compare their performance and suitability for your production deployment. These models offer more flexibility and can be fine-tuned to your specific data.