In [1]:
#loading the libraries
import pandas as pd
import numpy as np
df = pd.read_csv('../data/reviews_data_dump/reviews_badminton/data.csv')


In [2]:
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb-21,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb-21,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr-21,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr-16,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [3]:
df.tail()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
8513,,,,,,,,5
8514,,,,,,,,2
8515,,,,,,,,4
8516,,,,,,,,1
8517,,,,,,,,4


In [4]:
df.shape

(8518, 8)

In [5]:
df.isnull().sum()

Reviewer Name       10
Review Title        10
Place of Review     50
Up Votes            10
Down Votes          10
Month              465
Review text          8
Ratings              0
dtype: int64

In [6]:
df['Review text'].fillna('', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Review text'].fillna('', inplace=True)


In [7]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#Download NLTK resources (if not already downloaded)
nltk.download('vader_lexicon')

# Initialize the sentiment analyzer
sid = SentimentIntensityAnalyzer()


# Function to analyze the sentiment of each review text and return binary classification
def analyze_sentiment_binary(review_text):
   
    # Tokenize text into sentences
    sentences = nltk.sent_tokenize(review_text)
    
    # Check if there are no sentences
    if not sentences:
        return 0  # Neutral sentiment
    
    # Initialize sentiment scores
    compound_score = 0
    
    # Get sentiment score for each sentence and aggregate
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        compound_score += ss['compound']
    
    # Normalize compound score
    normalized_score = compound_score / len(sentences)
    
    # Classify sentiment as binary
    if normalized_score >= 0:
        return 1  # Positive sentiment
    else:
        return 0  # Negative sentiment

# Apply sentiment analysis to each review text and add 'Sentiment' column
df['Sentiment'] = df['Review text'].apply(analyze_sentiment_binary)

# Display the first few rows with sentiment analysis results
print(df[['Review text', 'Sentiment']].head())


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                         Review text  Sentiment
0  Nice product, good quality, but price is now r...          1
1  They didn't supplied Yonex Mavis 350. Outside ...          0
2  Worst product. Damaged shuttlecocks packed in ...          0
3  Quite O. K. , but nowadays  the quality of the...          1
4  Over pricedJust â?¹620 ..from retailer.I didn'...          1


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Review text'], df['Sentiment'], test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)


(6814,) (1704,)


In [9]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

def preprocess_text(text):
    # Removing special characters and digits
    text = re.sub("[^a-zA-Z]", " ", text)
    # Change text to lowercase
    text = text.lower()
    # Tokenize into words
    tokens = text.split()
    # Remove stop words                
    clean_tokens = [t for t in tokens if not t in stopwords.words("english")]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]
    return " ".join(clean_tokens)


In [10]:
X_train_processed = X_train.apply(preprocess_text)
X_test_processed = X_test.apply(preprocess_text)


In [11]:
import mlflow

mlflow.set_experiment("Sentiment_Analysis")


2024/03/28 18:27:18 INFO mlflow.tracking.fluent: Experiment with name 'Sentiment_Analysis' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/shiva/OneDrive/Desktop/innomatics%20research%20labs/Flipkart/experiment%20tracking/mlruns/324248996580998105', creation_time=1711630638664, experiment_id='324248996580998105', last_update_time=1711630638664, lifecycle_stage='active', name='Sentiment_Analysis', tags={}>

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
import joblib
from joblib import Memory
import os
import warnings

warnings.filterwarnings('ignore')

# Define a memory object to cache intermediate results
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

# Define pipelines for different classifiers
pipelines = {
    'naive_bayes': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', MultinomialNB())
    ]),
    'decision_tree': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', DecisionTreeClassifier())
    ]),
    'random_forest': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', RandomForestClassifier())
    ]),
    'svm': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', SVC())
    ]),
    'knn': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', KNeighborsClassifier())
    ]),
    'logistic_regression': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', LogisticRegression())
    ])
}

# Define parameter grids for hyperparameter tuning
param_grids = {
    'naive_bayes': {
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__alpha': [1, 10]
    },
    'decision_tree': {
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__max_depth': [None, 5, 10]
    },
    'random_forest': {
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 5, 10]
    },
    'svm': {
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__gamma': ['scale', 'auto']
    },
    'knn': {
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__weights': ['uniform', 'distance']
    },
    'logistic_regression': {
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['lbfgs', 'liblinear']
    }
}


KeyError: 'Review text'