In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline 
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ryanr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ryanr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ryanr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Load the initial dataset
initial_dataset = pd.read_csv('amazonreviews.tsv', delimiter='\t')

# Perform data augmentation using NLTK
augmented_data = []
stopwords_set = set(stopwords.words('english'))

for text in initial_dataset['review']:
    tokens = word_tokenize(text)
    augmented_tokens = []
    
    # Synonym Replacement
    for token in tokens:
        synsets = wordnet.synsets(token)
        if synsets:
            synonyms = [synset.lemmas()[0].name() for synset in synsets]
            augmented_token = synonyms[0] if len(synonyms) == 1 else synonyms[1]
        else:
            augmented_token = token
        augmented_tokens.append(augmented_token)
    
    # Random Swap
    n = len(augmented_tokens)
    for i in range(n):
        if i < n-1 and i % 2 == 0:
            augmented_tokens[i], augmented_tokens[i+1] = augmented_tokens[i+1], augmented_tokens[i]
    
    # Random Deletion
    augmented_tokens = [token for token in augmented_tokens if token.lower() not in stopwords_set or token.lower() == 'not']
    augmented_text = ' '.join(augmented_tokens)
    augmented_data.append(augmented_text)

# Create a new DataFrame with augmented data
augmented_dataset = pd.DataFrame({'label': initial_dataset['label'],'review': augmented_data})

# Save the augmented dataset to a file
augmented_dataset.to_csv('augmented_dataset.tsv', sep='\t', index=False)

# Merge the two datasets into a single file
merged_dataset = pd.concat([initial_dataset, augmented_dataset], ignore_index=True)

# Save the merged dataset to a file
merged_dataset.to_csv('merged_dataset.csv', index=False)

In [13]:
df= pd.read_csv('merged_dataset.csv')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [14]:
# size of the selected dataset
df.shape

(20000, 2)

In [15]:
# Pre-processing the data

In [16]:
#Removing null values
df.isnull().sum()
df.dropna(inplace=True)

#removing empty strings 
blanks = [] 
for i,lb,rv in df.itertuples():  
    if type(rv)==str:            
        if rv.isspace():         
            blanks.append(i)     
        
df.drop(blanks, inplace=True)

#split data-set to train and test
X=df['review']
y=df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Model 1 :- Logistic Regression

In [17]:
#Training the model 
from sklearn.linear_model import LogisticRegression
lr_model=Pipeline([('tfidf', TfidfVectorizer(lowercase=False)),( 'clf',LogisticRegression(solver='lbfgs'))])
lr_model.fit(X_train,y_train)

predictions= lr_model.predict(X_test)
report = classification_report(y_test,predictions, output_dict=True)

df_report = pd.DataFrame(report).transpose().round(2)

cm = sns.light_palette("green", as_cmap=True)
df_report.style.background_gradient(cmap=cm)

Unnamed: 0,precision,recall,f1-score,support
neg,0.87,0.89,0.88,3316.0
pos,0.88,0.87,0.88,3284.0
accuracy,0.88,0.88,0.88,0.88
macro avg,0.88,0.88,0.88,6600.0
weighted avg,0.88,0.88,0.88,6600.0


# Model 2 :- Linear SVC

In [18]:
#training the model
my_model=Pipeline([('tfidf', TfidfVectorizer()),('classifier',LinearSVC())])
my_model.fit(X_train,y_train)

predictions= my_model.predict(X_test)
report = classification_report(y_test,predictions, output_dict=True)

df_report = pd.DataFrame(report).transpose().round(2)

cm = sns.light_palette("green", as_cmap=True)
df_report.style.background_gradient(cmap=cm)

Unnamed: 0,precision,recall,f1-score,support
neg,0.92,0.92,0.92,3316.0
pos,0.92,0.92,0.92,3284.0
accuracy,0.92,0.92,0.92,0.92
macro avg,0.92,0.92,0.92,6600.0
weighted avg,0.92,0.92,0.92,6600.0


# Model 3 :- Vader's Algorithm

In [19]:
# Load the labeled dataset
data= pd.read_csv('amazonreviews.tsv', sep='\t')

# Initialize the Vader sentiment intensity analyzer
vader = SentimentIntensityAnalyzer()

# Create empty lists to store true labels and predicted labels
true_labels = []
predicted_labels = []

# Iterate over each row in the dataset
for index, row in data.iterrows():
    text = row['review']
    true_sentiment = row['label']
    
    # Get the predicted sentiment using the Vader model
    scores = vader.polarity_scores(text)
    predicted_sentiment = 'pos' if scores['compound'] >= 0 else 'neg'
    
    # Append the true and predicted labels to the respective lists
    true_labels.append(true_sentiment)
    predicted_labels.append(predicted_sentiment)




In [20]:
report = classification_report(true_labels,predicted_labels,output_dict=True)
df_report = pd.DataFrame(report).transpose().round(2)
cm = sns.light_palette("green", as_cmap=True)
df_report.style.background_gradient(cmap=cm)

Unnamed: 0,precision,recall,f1-score,support
neg,0.86,0.52,0.64,5097.0
pos,0.64,0.91,0.75,4903.0
accuracy,0.71,0.71,0.71,0.71
macro avg,0.75,0.71,0.7,10000.0
weighted avg,0.75,0.71,0.7,10000.0


# Pre Augmentation

1) Logistic Regression Model

Accuracy: 85% Precision: 87% Recall: 85% F1-score: 85%
                
2) Linear SVC (Support Vector Classifier)

Accuracy: 87% Precision: 89% Recall: 87% F1-score: 88%
                
3) Vader's Model (VADER Sentiment Intensity Analyzer)

Accuracy: 70% Precision: 64% Recall: 91% F1-score: 75%


# Post Augmentation

1) Logistic Regression Model

Accuracy: 88% Precision: 88% Recall: 89% F1-score: 88%
                
2) Linear SVC (Support Vector Classifier)

Accuracy: 92% Precision: 92% Recall: 92% F1-score: 92%
                
3) Vader's Model (VADER Sentiment Intensity Analyzer)

Accuracy: 71% Precision: 86% Recall: 91% F1-score: 75%

# Recommendation 

The Support Vector Classifier (SVC) is better for sentiment analysis due to its ability to capture non-linear relationships in text data, handle high-dimensional feature spaces effectively, and handle imbalanced datasets. It is robust to outliers and can tolerate misclassifications with a soft margin. Additionally, SVC can be extended to handle non-linear sentiment analysis tasks in various data types, making it a versatile choice for sentiment analysis.

Data augmentation can initially be beneficial, continuously relying solely on data augmentation may not be the most effective approach to improving model accuracy. Instead, consider strategies such as increasing the dataset size, improving data quality, and exploring advanced feature engineering techniques or alternative models. These approaches can further enhance the accuracy of the models beyond what data augmentation alone can achieve.