In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
df=pd.read_csv(r'D:\DSINTERNSHIP\Review_data\reviews_badminton\data.csv')
df.shape

(8518, 8)

In [4]:
df['Ratings'].value_counts(normalize=True)

5    0.596384
4    0.204978
1    0.090279
3    0.072200
2    0.036159
Name: Ratings, dtype: float64

In [5]:
df.columns

Index(['Reviewer Name', 'Review Title', 'Place of Review', 'Up Votes',
       'Down Votes', 'Month', 'Review text', 'Ratings'],
      dtype='object')

In [6]:
df=df.dropna()
df

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
5,Baji Sankar,Mind-blowing purchase,"Certified Buyer, Hyderabad",173.0,45.0,Oct 2018,Good quality product. Delivered on time.READ MORE,5
6,Flipkart Customer,Must buy!,"Certified Buyer, Doom Dooma",403.0,121.0,Jan 2020,BEST PURCHASE It is a good quality and is more...,5
...,...,...,...,...,...,...,...,...
8495,vishal kumar ashish,Nice,"Certified Buyer, Haridwar",0.0,0.0,Oct 2016,Thanks to the delivery boy ... Service is alwa...,5
8496,Nitya Nand Rai,Good choice,"Certified Buyer, Raebareli",0.0,0.0,Oct 2016,Over priced even after 50% discount price is m...,1
8497,Tarun Reddy,Awesome,Certified Buyer,0.0,0.0,Oct 2016,Too much priced. It was getting me for Rs. 640...,1
8499,Jayachandra,High cost,"Certified Buyer, Mangalore",0.0,0.0,Dec 2015,Hii flipkart customers care..why your delivery...,5


In [7]:
y = df['Ratings']
X = df[['Review text']]

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:

X_train.head()

Unnamed: 0,Review text
250,Product is good as like as bought in the open ...
7735,Love it...READ MORE
2805,GoodREAD MORE
4914,superREAD MORE
1539,excellentREAD MORE


In [10]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [11]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\k.udayasagar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [13]:
def preprocess(raw_text, flag):
    sentence = re.sub("[^a-zA-Z]", " ", raw_text)
    

    sentence = sentence.lower()

    # tokenize into words
    tokens = sentence.split()
    
    # remove stop words                
#     clean_tokens = [t for t in tokens if not t in stopwords.words("english")]
    stop_words = set(stopwords.words("english"))
    clean_tokens = []
    for t in tokens:
        if t.endswith('read'):
            clean_tokens.append(t[:-4])  # Remove 'read' from the end
        elif t not in stop_words:
            clean_tokens.append(t)
    
    
    # Stemming/Lemmatization
    if(flag == 'lemma'):
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]
    else:
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    
    return pd.Series([" ".join(clean_tokens), len(clean_tokens)])

In [14]:
# !pip install tqdm

In [15]:
from tqdm import tqdm, tqdm_notebook

In [16]:
temp_df = X_train['Review text'].apply(lambda x: preprocess(x, 'stem'))

In [17]:
temp_df.head()

Unnamed: 0,0,1
250,product good like bought open market price als...,17
7735,love,2
2805,good,1
4914,super,1
1539,excel,1


In [18]:
temp_df.columns = ['clean_text_stem', 'text_length_stem']

temp_df.head()

Unnamed: 0,clean_text_stem,text_length_stem
250,product good like bought open market price als...,17
7735,love,2
2805,good,1
4914,super,1
1539,excel,1


In [21]:
X_train = pd.concat([X_train, temp_df], axis=1)

X_train.head()
print(X_test)


                                            Review text
1880        Nice product and delivered ontime.READ MORE
2188  Good one... On problem for  20 to 30 days..REA...
7075                                     SuperREAD MORE
4967             genuine product.....go for itREAD MORE
3822                                 Very niceREAD MORE
...                                                 ...
6072                 excellent product... niceREAD MORE
4149                                     SuperREAD MORE
4332                                   ya goodREAD MORE
5421                                      goodREAD MORE
3420  They really are durable , I tried a lot of shu...

[1603 rows x 1 columns]


In [35]:
y_train.value_counts()

5    3836
4    1340
1     541
3     456
2     237
Name: Ratings, dtype: int64

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Assuming X_train['clean_text_stem'] is a list of preprocessed text documents

# Initialize a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Initialize a Multinomial Naive Bayes classifier
naive_bayes_classifier = MultinomialNB()

# Create a pipeline with TF-IDF vectorization followed by Multinomial Naive Bayes
pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', naive_bayes_classifier)
])

# Training the pipeline
pipeline.fit(X_train['clean_text_stem'], y_train)

# Evaluating the pipeline
y_pred = pipeline.predict(X_test['Review text'])
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.6088583905177791


In [39]:
print(pipeline.predict(['waste']))

[5]


In [62]:
import joblib
joblib.dump(pipeline, 'svc_model.pkl')

['svc_model.pkl']

In [63]:
pwd

'C:\\Users\\k.udayasagar'

In [179]:
from wordcloud import WordCloud

In [180]:
y_train

250     5
7735    5
2805    4
4914    3
1539    5
       ..
5686    5
5850    5
1320    5
8063    5
7730    5
Name: Ratings, Length: 6410, dtype: int64

In [181]:

from sklearn.feature_extraction.text import CountVectorizer

vocab = CountVectorizer()

X_train_bow = vocab.fit_transform(X_train['clean_text_stem'])


In [182]:

X_train_bow

<6410x1826 sparse matrix of type '<class 'numpy.int64'>'
	with 20924 stored elements in Compressed Sparse Row format>

In [183]:
print("Total unique words:", len(vocab.vocabulary_))

print("Type of train features:", type(X_train_bow))

print("Shape of input data:", X_train_bow.shape)

Total unique words: 1826
Type of train features: <class 'scipy.sparse._csr.csr_matrix'>
Shape of input data: (6410, 1826)


In [184]:
print(X_train_bow.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [185]:
from sys import getsizeof

print(type(X_train_bow))
print(getsizeof(X_train_bow), "Bytes")

<class 'scipy.sparse._csr.csr_matrix'>
48 Bytes


In [186]:
X_test.head()

Unnamed: 0,Review text
1880,Nice product and delivered ontime.READ MORE
2188,Good one... On problem for 20 to 30 days..REA...
7075,SuperREAD MORE
4967,genuine product.....go for itREAD MORE
3822,Very niceREAD MORE


In [187]:
temp_df = X_test['Review text'].apply(lambda x: preprocess(x, 'stem'))

temp_df.head()

Unnamed: 0,0,1
1880,nice product deliv ontim,5
2188,good one problem day,5
7075,super,1
4967,genuin product go it,4
3822,nice,1


In [188]:
temp_df.columns = ['clean_text_stem', 'text_length_stem']

temp_df.head()

Unnamed: 0,clean_text_stem,text_length_stem
1880,nice product deliv ontim,5
2188,good one problem day,5
7075,super,1
4967,genuin product go it,4
3822,nice,1


In [189]:
X_test = pd.concat([X_test, temp_df], axis=1)

X_test.head()

Unnamed: 0,Review text,clean_text_stem,text_length_stem
1880,Nice product and delivered ontime.READ MORE,nice product deliv ontim,5
2188,Good one... On problem for 20 to 30 days..REA...,good one problem day,5
7075,SuperREAD MORE,super,1
4967,genuine product.....go for itREAD MORE,genuin product go it,4
3822,Very niceREAD MORE,nice,1


In [190]:
X_test_bow = vocab.transform(X_test['clean_text_stem'])

In [191]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_bow, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [192]:
y_test_pred = classifier.predict(X_test_bow)

In [193]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.6550218340611353
              precision    recall  f1-score   support

           1       0.63      0.53      0.58       147
           2       0.19      0.10      0.13        51
           3       0.54      0.13      0.20       119
           4       0.33      0.06      0.10       319
           5       0.68      0.97      0.80       967

    accuracy                           0.66      1603
   macro avg       0.47      0.36      0.36      1603
weighted avg       0.58      0.66      0.57      1603



In [198]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train_bow, y_train)
y_test_pred = classifier.predict(X_test_bow)
from sklearn.metrics import precision_recall_fscore_support
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_test_pred, average='weighted')
print("F1 Score:", f1_score)

F1 Score: 0.5422907054269597


In [195]:
y_test_pred = classifier.predict(X_test_bow)

In [197]:
# from sklearn.metrics import accuracy_score, classification_report

# print(accuracy_score(y_test, y_test_pred))

# print(classification_report(y_test, y_test_pred))

from sklearn.metrics import precision_recall_fscore_support

# Assuming y_test and y_test_pred are your true labels and predicted labels respectively

# Calculate precision, recall, F1-score and support
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_test_pred, average='weighted')

# Print the F1-score
print("F1 Score:", f1_score)

F1 Score: 0.5341738488194795


In [199]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

# Initialize Random Forest classifier
classifier_rf = RandomForestClassifier()

# Train the Random Forest classifier
classifier_rf.fit(X_train_bow, y_train)

# Predict on the test set
y_test_pred_rf = classifier_rf.predict(X_test_bow)

# Calculate precision, recall, and F1 score
precision_rf, recall_rf, f1_score_rf, _ = precision_recall_fscore_support(y_test, y_test_pred_rf, average='weighted')

# Print the F1 score
print("Random Forest F1 Score:", f1_score_rf)


Random Forest F1 Score: 0.5501048956670367


In [200]:
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support

# Initialize Support Vector Machine classifier
classifier_svm = SVC()

# Train the Support Vector Machine classifier
classifier_svm.fit(X_train_bow, y_train)

# Predict on the test set
y_test_pred_svm = classifier_svm.predict(X_test_bow)

# Calculate precision, recall, and F1 score
precision_svm, recall_svm, f1_score_svm, _ = precision_recall_fscore_support(y_test, y_test_pred_svm, average='weighted')

# Print the F1 score
print("SVM F1 Score:", f1_score_svm)


SVM F1 Score: 0.5265677785516144


  _warn_prf(average, modifier, msg_start, len(result))


In [201]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support

# Initialize Multinomial Naive Bayes classifier
classifier_nb = MultinomialNB()

# Train the Multinomial Naive Bayes classifier
classifier_nb.fit(X_train_bow, y_train)

# Predict on the test set
y_test_pred_nb = classifier_nb.predict(X_test_bow)

# Calculate precision, recall, and F1 score
precision_nb, recall_nb, f1_score_nb, _ = precision_recall_fscore_support(y_test, y_test_pred_nb, average='weighted')

# Print the F1 score
print("Naive Bayes F1 Score:", f1_score_nb)


Naive Bayes F1 Score: 0.568376980427023
