Sentiment Analysis and Modeling on Amazon Product Reviews


Importing Libraries


In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

Read the dataset


In [3]:
# Load the raw Amazon reviews export into a DataFrame.
# Forward slashes keep the relative path valid on Windows, Linux and macOS; the
# backslash form is only interpreted as a directory separator on Windows.
df = pd.read_csv("./input_data/Reviews.csv")
df

FileNotFoundError: [Errno 2] No such file or directory: '.\\input_data\\Reviews.csv'

In [None]:
# Keep an untouched deep copy of the loaded data; `df` itself is modified in place
# by later cells (column drop, de-duplication).
df2 = df.copy()
# Preview the first rows of the frame.
df.head()

Exploratory Data Analysis

In [None]:
# Number of rows and columns, as a (rows, columns) tuple
df.shape

In [None]:
# Summary of the frame: columns, dtypes and non-null counts
df.info()

In [None]:
# The customer reviews are what we analyse, so look at the "Text" column that holds them
df["Text"]

We will remove all the unnecessary columns and keep only two columns, Text and Score, to perform the sentiment analysis.

In [None]:
# Everything except the review text and its rating is irrelevant for this analysis,
# so those columns are removed from `df` in place.
unused_columns = [
    'Id', 'ProductId', 'UserId', 'ProfileName',
    'HelpfulnessNumerator', 'HelpfulnessDenominator',
    'Time', 'Summary',
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Confirm that only the two columns kept for the analysis remain
df.columns

In [None]:
# Count missing values per column
df.isna().sum()

In [None]:
# Count rows that duplicate an earlier row
df.duplicated().sum()

In [None]:
# Remove the duplicated rows from `df` in place
df.drop_duplicates(inplace= True)

In [None]:
# Shape of the frame after de-duplication
df.shape

Removing duplicates reduced the number of rows from 568454 to 393675.

Exploring Target Column Score

In [None]:
# Number of distinct values in the Score column
df['Score'].nunique()

In [None]:
# The distinct values that occur in the Score column
df['Score'].unique()

Distribution of Ratings in Score Column

In [None]:
# Share of each rating, expressed as a percentage of all rows in `df`
score_counts = df['Score'].value_counts()
score_counts.div(len(df)).mul(100)
 

In [None]:
# Bar chart of the rating counts; each bar is annotated with its share of all rows.
plt.figure(figsize=(10, 5))
bar_colors = sns.color_palette("husl", 9)
ax = sns.countplot(x=df['Score'], palette=bar_colors)
total = float(len(df))
label_box = dict(facecolor='none', edgecolor='black', boxstyle='round', linewidth=0.5)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.0
    share = (bar_height / total) * 100
    # Place the percentage label slightly above the top of the bar.
    ax.text(bar_center, bar_height + 75, '{:1.1f} %'.format(share), ha="center", bbox=label_box)
ax.set_title('Score Distribution', fontsize=20, y=1.05)
# Remove the top/right spines, then offset and trim the remaining ones.
sns.despine(right=True)
sns.despine(offset=5, trim=True)


In [None]:
# Pie chart of how many reviews fall into each rating.
score_values = df['Score'].value_counts()
pie_ax = plt.gca()  # current axes, created on demand
pie_ax.pie(score_values, labels=score_values.index)
pie_ax.set_title('Pie Score Distribution')
plt.show()

Observation:
We notice that 63.7 % of the reviews have the highest score rating, which indicates that a high percentage of the product reviews are positive.

The Distribution of score values is not balanced at all. 

Taking Samples from Score Column

In [None]:
# Balance the classes by drawing the same number of reviews (15000) from every rating.
# - `groupby(...).sample` selects rows directly, so the `Score` column is always kept;
#   it avoids `groupby(...).apply`, whose handling of grouping columns is deprecated
#   in recent pandas releases.
# - The fixed `random_state` makes the draw, and every result derived from it,
#   reproducible across kernel restarts.
# Raises ValueError if any rating has fewer than 15000 rows.
new_df = df.groupby('Score').sample(n=15000, random_state=42).reset_index(drop=True)

In [None]:
# Inspect the sampled frame
new_df

In [None]:
# (rows, columns) of the sampled frame
new_df.shape

By taking 15000 samples from each score we get 75000 rows, and the data becomes balanced.

To Ensure Score Distribution After Sampling

In [None]:
# Number of sampled rows per rating
new_df['Score'].value_counts()

In [None]:
# Bar chart of the rating counts after balancing; each bar is annotated with its
# share of the sampled frame.
plt.figure(figsize=(10, 5))
ax = sns.countplot(x=new_df['Score'], palette=sns.color_palette("husl", 9))
# The percentages must be relative to the frame being plotted (`new_df`). Dividing
# by len(df) would report each class as a fraction of the full, unsampled data.
total = float(len(new_df))
label_box = dict(facecolor='none', edgecolor='black', boxstyle='round', linewidth=0.5)
for p in ax.patches:
    height = p.get_height()
    # Centre the label horizontally on the bar and place it slightly above its top.
    ax.text(p.get_x() + p.get_width() / 2., height + 75,
            '{:1.1f} %'.format((height / total) * 100), ha='center', bbox=label_box)
ax.set_title('New Score Distribution', fontsize=20, y=1.05)
sns.despine(right=True)
sns.despine(offset=5, trim=True)



As shown, the Score column is now balanced: every value occurs equally often, so the model won't be biased toward a specific value.

Text Preprocessing

In [None]:
# import libraries for preprocessing
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [None]:
import nltk
# Fetch the English stop-word corpus that is loaded via `stopwords.words('english')`
nltk.download('stopwords')

In [None]:
# English stop words as a set, used for membership tests in clean_text
stop_words =set(stopwords.words('english'))
# Porter stemmer instance, applied to each remaining token in clean_text
stemming = PorterStemmer()


In [None]:

# Store the NLTK resources under the current user's home directory instead of a
# hard-coded Windows drive path, so the cell also works on Linux/macOS and on
# machines where C:\ is not writable.
import os

nltk_data_path = os.path.join(os.path.expanduser('~'), 'nltk_data')

# Make sure NLTK searches this directory; the guard keeps re-runs from appending
# the same entry repeatedly.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download the tokenizer models and the stop-word list into that directory
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, download_dir=nltk_data_path)

In [None]:
def clean_text(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps: lower-case, tokenise with ``word_tokenize``, drop punctuation-only
    tokens, drop English stop words, drop purely numeric tokens, then stem the
    remaining tokens with the module-level ``stemming`` object.

    Args:
        text: Raw review text. Non-string values (for example ``None`` or a float
            NaN standing in for a missing review) are treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing survives.
    """
    # Non-string input has no .lower(); treat it as an empty review instead of crashing.
    if not isinstance(text, str):
        return ''

    punctuation_chars = set(string.punctuation)
    cleaned = []
    for word in word_tokenize(text.lower()):
        # Drop tokens made up solely of punctuation characters. A substring test
        # against string.punctuation lets multi-character tokens such as "...",
        # "--", "``" or "''" slip through.
        if all(char in punctuation_chars for char in word):
            continue
        # Drop stop words and purely numeric tokens.
        if word in stop_words or word.isdigit():
            continue
        cleaned.append(stemming.stem(word))

    # Return the surviving tokens as one string again
    return ' '.join(cleaned)




In [None]:
# Run clean_text over every sampled review and store the result in a new column
new_df["cleaned_text"] = new_df["Text"].apply(clean_text)

In [None]:
# Summary of the sampled frame, now including the cleaned_text column
new_df.info()


In [None]:
!pip install WordCloud

In [None]:
from wordcloud import WordCloud


In [None]:

# Build one corpus string from every cleaned review
all_text = ' '.join(new_df['cleaned_text'])

# Word cloud limited to at most 1000 words, drawn on a white canvas
cloud_settings = dict(width=800, height=400, max_words=1000, background_color='white')
wordcloud = WordCloud(**cloud_settings).generate(all_text)

# Render the cloud as an image without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


In [None]:
# Vocabulary size: number of distinct tokens in the cleaned corpus.
# set(all_text) would iterate the string character by character and therefore only
# count distinct characters, so the corpus is split into words first.
len(set(all_text.split()))

In [None]:
# Length of the combined corpus string, measured in characters (not words)
len(all_text)

In [None]:
!pip install gensim

Vectorizing Text to numbers before Modeling

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report


In [None]:
!pip install scikit-learn

Splitting Data

In [None]:
# Collapse the star rating into a binary label: 3 and above -> 1, otherwise 0.
# NOTE: the column is overwritten, so running this cell a second time maps every
# row to 0.
new_df['Score'] = new_df['Score'].ge(3).astype('int64')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features: the cleaned review text
x = new_df['cleaned_text']
# Target: the Score column
y = new_df['Score']

In [None]:
# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

1. LOGISTIC REGRESSION

In [None]:
# Token counts -> TF-IDF weighting -> logistic regression classifier
logistic_steps = [
    ('vec', CountVectorizer(stop_words='english')),
    ('Tf_idf', TfidfTransformer()),
    ('log_rg', LogisticRegression()),
]
logistic_pipe = Pipeline(steps=logistic_steps)


In [None]:
# Fit the whole pipeline (vectoriser, TF-IDF, classifier) on the training split
log_fit = logistic_pipe.fit(x_train,y_train)

In [None]:
# Persist the fitted pipeline so it can be reloaded later without retraining.
import os

import joblib

# Create the target directory first, and build the path with os.path.join so it
# is valid on every OS; a literal backslash path only points into a "models"
# sub-directory on Windows.
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)
joblib.dump(log_fit, os.path.join(model_dir, "logistic_regression_model.pkl"))

In [None]:
# Predict labels for the held-out reviews
log_pred = logistic_pipe.predict(x_test)

In [None]:
# Score the fitted pipeline on the training data and on the held-out split
log_train_accuracy = log_fit.score(x_train, y_train)
log_test_accuracy = log_fit.score(x_test, y_test)
print("Training accuracy:", log_train_accuracy)
print("Test accuracy:", log_test_accuracy)

In [None]:
# Confusion matrix of true vs. predicted test labels, drawn as a heatmap annotated with integer counts
sns.heatmap(confusion_matrix(y_test,log_pred), annot= True, fmt='d')

Making Prediction using Logistic Regression

In [None]:
# A few hand-written reviews to sanity-check the fitted model
reviews = [
    'This is an amazing product,I will definetly buy it ',
    'very bad,I dont recommend it at all',
    'we received this coffee yesterday, and have to say its amazing',
    'experience was terrible',
    'I will buy again from this site,everything was perfect',
]

# The pipeline was fitted on text produced by clean_text (lower-cased, stop words
# removed, stemmed). New reviews must go through the same step, otherwise words such
# as "amazing" never match the stemmed vocabulary the vectoriser learned.
cleaned_reviews = [clean_text(review) for review in reviews]
prediction = logistic_pipe.predict(cleaned_reviews)

# Map the numeric labels back to readable names (1 -> Positive, otherwise Negative)
sentiment = ["Positive" if i == 1 else "Negative" for i in prediction]
print(sentiment)

2. NAIVE BAYES

In [None]:
import os

# Token counts -> TF-IDF weighting -> multinomial naive Bayes classifier
naive_bayes_pipeline = Pipeline([
    ('vec', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

# Train the model
nb_model = naive_bayes_pipeline.fit(x_train, y_train)

# Store the model for future use. The directory is created first and the path is
# built with os.path.join so it is valid on every OS; a literal backslash path
# only points into a "models" sub-directory on Windows.
nb_model_dir = "models"
os.makedirs(nb_model_dir, exist_ok=True)
joblib.dump(nb_model, os.path.join(nb_model_dir, "naive_bayes_model.pkl"))

# Evaluate on the held-out split: per-class report plus confusion-matrix heatmap
y_pred_nb = naive_bayes_pipeline.predict(x_test)
print(classification_report(y_test, y_pred_nb, digits=4))
sns.heatmap(confusion_matrix(y_test, y_pred_nb), annot=True, fmt='d')


In [None]:
# Score the fitted naive Bayes pipeline on the training data and on the held-out split
nb_train_accuracy = nb_model.score(x_train, y_train)
nb_test_accuracy = nb_model.score(x_test, y_test)
print("Training accuracy of Naive Bayes :", nb_train_accuracy)
print("Test accuracy of Naive Bayes:", nb_test_accuracy)

3. SVC

In [None]:
# Token counts -> TF-IDF weighting -> support vector classifier (default settings)
svm_steps = [
    ('vec', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC()),
]
svm_pipeline = Pipeline(steps=svm_steps)

# Fit on the training split, then report per-class metrics on the held-out split
svc = svm_pipeline.fit(x_train, y_train)
y_pred_svc = svm_pipeline.predict(x_test)
svc_report = classification_report(y_test, y_pred_svc, digits=4)
print(svc_report)



In [None]:
# Store the model for future use. The directory is created first and the path is
# built with os.path.join so it is valid on every OS; a literal backslash path
# only points into a "models" sub-directory on Windows.
import os

svc_model_dir = "models"
os.makedirs(svc_model_dir, exist_ok=True)
joblib.dump(svc, os.path.join(svc_model_dir, "support_vector_classifier.pkl"))

In [None]:
# Confusion matrix of true vs. predicted test labels for the SVC, drawn as a heatmap annotated with integer counts
sns.heatmap(confusion_matrix(y_test,y_pred_svc), annot = True, fmt = 'd')

In [None]:
# Pipeline.score(X, y, sample_weight) treats a third positional argument as
# per-sample weights. Passing y_train there gave every label-0 review a weight of
# zero, so the reported "training accuracy" only covered the positive class.
print('Training accuracy of SVC : ', svc.score(x_train, y_train))
print('Test accuracy of SVC : ', svc.score(x_test, y_test))

Making Prediction using SVC

In [None]:
# A few hand-written reviews to sanity-check the fitted SVC pipeline
reviews = [
    'This is an amazing product,I will definetly buy it ',
    'very bad,I dont recommend it at all',
    'we received this coffee yesterday, and have to say its amazing',
    'experience was terrible',
    'I will buy again from this site,everything was perfect',
]

# The pipeline was fitted on text produced by clean_text (lower-cased, stop words
# removed, stemmed). New reviews must go through the same step, otherwise words such
# as "terrible" never match the stemmed vocabulary the vectoriser learned.
cleaned_reviews = [clean_text(review) for review in reviews]
prediction = svm_pipeline.predict(cleaned_reviews)

# Map the numeric labels back to readable names (1 -> Positive, otherwise Negative)
sentiment = ["Positive" if i == 1 else "Negative" for i in prediction]

print(sentiment)