In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [3]:
review1 = pd.read_csv("data.csv")
review1.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [4]:
review1.duplicated().sum()

4

In [5]:
review1.drop_duplicates(inplace=True)

In [6]:
review1.dropna(inplace=True)

In [7]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+','',text)
    text = re.sub(r'[^\w\s]','',text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    text = ' '.join(words).strip()
    return text

In [8]:
review1['cleaned_text']=review1['Review text'].apply(clean_text)

In [9]:
cleaned_df = review1

In [10]:
def assign_ratings(ratings):
    if ratings == 5 or ratings == 4:
        return "Positive"
    else:
        return "Negative"


cleaned_df["Target"]=cleaned_df["Ratings"].apply(assign_ratings)

In [11]:
cleaned_df['Target'] = cleaned_df['Target'].replace({'Positive':1,'Negative':0}, regex=True)

  cleaned_df['Target'] = cleaned_df['Target'].replace({'Positive':1,'Negative':0}, regex=True)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [13]:
x=tfidf.fit_transform(cleaned_df['cleaned_text']).toarray()
y=cleaned_df['Target']

In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
import numpy as np

In [49]:
pipe1 = Pipeline(
    [
        ('classifier', RandomForestClassifier())
    ]
)

In [50]:
param_grid1 = {
    'classifier__n_estimators': [50, 100, 200, 300],
    'classifier__max_depth': [None, 5, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2']
}

In [51]:
# try only 20 random combinations
clf = RandomizedSearchCV(
    estimator=pipe1,
    param_distributions=param_grid1,
    n_iter=20,             
    cv=5,
    verbose=2,
    n_jobs=-1
)

In [22]:
%time clf.fit(x_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: total: 2min 51s
Wall time: 14min 38s


In [24]:
print("Best estimator found on train set")
print(clf.best_estimator_)
print()
print(clf.score(x_test, y_test))

Best estimator found on train set
Pipeline(steps=[('classifier',
                 RandomForestClassifier(min_samples_split=5,
                                        n_estimators=300))])

0.885215221459763


In [24]:
import joblib
import os

In [32]:
with open("tfidf.pkl",'wb') as f:
    joblib.dump(tfidf,f)

In [33]:
with open("tfidf.pkl",'rb') as f:
    tfidf=joblib.load(f)

In [29]:
best_modl = clf.best_estimator_

joblib.dump(best_modl, 'demo_model.pkl')

['demo_model.pkl']

In [30]:
model=joblib.load('demo_model.pkl')

In [31]:
sample= "best product quality durable"
sample1="worst quality of the product"

In [34]:
sample=tfidf.transform([sample]).toarray()
sample1=tfidf.transform([sample1]).toarray()

In [36]:
print(sample)
print(sample1)

[[0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]]


In [38]:
label_map = {0: 'Negative', 1: 'Positive'}
pred = model.predict(sample)[0]
print(label_map[pred])

Positive


In [39]:
%time y_test_pred = model.predict(x_test)

CPU times: total: 1.08 s
Wall time: 1.18 s


In [42]:
print("Accuracy Score:", metrics.accuracy_score(y_test, y_test_pred))

Accuracy Score: 0.885215221459763


In [43]:
print("Model Size:", os.path.getsize('demo_model.pkl'), "Bytes")

Model Size: 44353023 Bytes


In [44]:
from sklearn.svm import SVC

In [53]:
pipe_2 = Pipeline(
    [
        ('classifier', SVC())
    ]
)

In [56]:
# Observe the Key Value Pair format

degree = [2, 3, 4, 5]
c = [0.1, 0.01, 1, 10, 100]

param_grid_2 = [
    {
        'classifier__kernel' : ['rbf'], 
        'classifier__C' : c
    }, 
    {
        'classifier__kernel' : ['poly'], 
        'classifier__degree' : degree, 
        'classifier__C' : c
    }, 
    {
        'classifier__kernel' : ['linear'], 
        'classifier__C' : c
    }
]

In [58]:
# try only 20 random combinations
clf_svc = RandomizedSearchCV(
    estimator=pipe_2,
    param_distributions=param_grid_2,
    n_iter=10,             
    cv=5,
    verbose=2,
    n_jobs=-1
)

In [59]:
%time clf_svc.fit(x_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: total: 1min 51s
Wall time: 1h 2min 57s


In [61]:
print("Best estimator found on train set")
print(clf_svc.best_estimator_)
print()
print(clf_svc.score(x_test, y_test))

Best estimator found on train set
Pipeline(steps=[('classifier', SVC(C=1, kernel='linear'))])

0.8814722395508422


In [62]:
best_modl = clf_svc.best_estimator_

joblib.dump(best_modl, 'demo_model_svc.pkl')

['demo_model_svc.pkl']

In [63]:
model=joblib.load('demo_model_svc.pkl')

In [64]:
sample= "best product quality durable"
sample1="worst quality of the product"

In [65]:
sample=tfidf.transform([sample]).toarray()
sample1=tfidf.transform([sample1]).toarray()

In [67]:
label_map = {0: 'Negative', 1: 'Positive'}
pred = model.predict(sample1)[0]
print(label_map[pred])

Negative


In [68]:
%time y_test_pred = model.predict(x_test)

CPU times: total: 11.3 s
Wall time: 11.5 s


In [69]:
print("Accuracy Score:", metrics.accuracy_score(y_test, y_test_pred))

Accuracy Score: 0.8814722395508422


In [70]:
print("Model Size:", os.path.getsize('demo_model_svc.pkl'), "Bytes")

Model Size: 63785569 Bytes


In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [18]:
pipelines = {
    'knn': Pipeline([
        ('classifier', KNeighborsClassifier())
    ]), 
    'svc': Pipeline([
        ('classifier', SVC())
    ]),
    'logistic_regression': Pipeline([
        ('classifier', LogisticRegression(max_iter=5000))
    ]),
    'random_forest': Pipeline([
        ('classifier', RandomForestClassifier())
    ]),
    'decision_tree': Pipeline([
        ('classifier', DecisionTreeClassifier())
    ]),
    'naive_bayes': Pipeline([
        ('classifier', GaussianNB())
    ])
}

In [19]:
# Parameter grids
parameter_grids = {
    'knn': [{
        'classifier__n_neighbors': [i for i in range(1, 20, 2)], 
        'classifier__p': [1, 2, 3]
    }],
    'svc': [
        {
            'classifier__kernel': ['rbf'], 
            'classifier__C': [0.01, 0.1, 1, 10, 100]
        }, 
        {
            'classifier__kernel': ['poly'], 
            'classifier__degree': [2, 3, 4, 5], 
            'classifier__C': [0.01, 0.1, 1, 10, 100]
        }, 
        {
            'classifier__kernel': ['linear'], 
            'classifier__C': [0.01, 0.1, 1, 10, 100]
        }
    ],
    'logistic_regression': [
        {
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['l2']
        }, 
        {
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['l1'], 
            'classifier__solver': ['liblinear']
        }, 
        {
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['elasticnet'], 
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga']
        }
    ],
    'random_forest': [{
        'classifier__n_estimators': [50, 100, 200]
    }],
    'decision_tree': [{
        'classifier__max_depth': [None, 5, 10]
    }],
    'naive_bayes': [{}]   # no hyperparameters for this
}

In [20]:
# Perform ramdomizedSearchCV for each algorithm
best_models_sentiment_prediction = {}

In [21]:
for algorithm in pipelines.keys():
    print("-" * 10, algorithm, "-" * 10)
    random_search = RandomizedSearchCV(
        estimator=pipelines[algorithm],
        param_distributions=parameter_grids[algorithm],
        n_iter=10,
        cv=5,
        verbose=2,
        n_jobs=-1
    )
    
    %time random_search.fit(x_train, y_train)
    
    best_models_sentiment_prediction[algorithm] = random_search.best_estimator_
    print('Score on Test Data: ', random_search.score(x_test, y_test))

---------- knn ----------
Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: total: 18.1 s
Wall time: 42min 4s
Score on Test Data:  0.8633811603243917
---------- svc ----------
Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: total: 50.3 s
Wall time: 34min 19s
Score on Test Data:  0.8814722395508422
---------- logistic_regression ----------
Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: total: 10.2 s
Wall time: 7min 40s
Score on Test Data:  0.8802245789145352
---------- random_forest ----------
Fitting 5 folds for each of 3 candidates, totalling 15 fits




CPU times: total: 30.7 s
Wall time: 4min 46s
Score on Test Data:  0.8833437305053026
---------- decision_tree ----------
Fitting 5 folds for each of 3 candidates, totalling 15 fits




CPU times: total: 3.52 s
Wall time: 58.8 s
Score on Test Data:  0.8665003119151591
---------- naive_bayes ----------
Fitting 5 folds for each of 1 candidates, totalling 5 fits




ValueError: Invalid parameter 'scaler' for estimator Pipeline(steps=[('classifier', GaussianNB())]). Valid parameters are: ['memory', 'steps', 'verbose'].

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'

In [22]:
for name, model in best_models_sentiment_prediction.items():
    print(f"{name}")
    print(f"{model}")
    print()

knn
Pipeline(steps=[('classifier', KNeighborsClassifier(n_neighbors=15))])

svc
Pipeline(steps=[('classifier', SVC(C=1, kernel='linear'))])

logistic_regression
Pipeline(steps=[('classifier', LogisticRegression(C=10))])

random_forest
Pipeline(steps=[('classifier', RandomForestClassifier())])

decision_tree
Pipeline(steps=[('classifier', DecisionTreeClassifier(max_depth=10))])



In [25]:
for name, model in best_models_sentiment_prediction.items():
    print("-"*10, name, "-"*10)
    joblib.dump(model, f'{name}.pkl')
    model = joblib.load(f'{name}.pkl')
    %time y_test_pred = model.predict(x_test)
    print("Accuracy Score", metrics.accuracy_score(y_test, y_test_pred))

    print("Model Size:", os.path.getsize(f'{name}.pkl'), "Bytes")

---------- knn ----------
CPU times: total: 3.09 s
Wall time: 1.76 s
Accuracy Score 0.8633811603243917
Model Size: 177532258 Bytes
---------- svc ----------
CPU times: total: 8.69 s
Wall time: 9.27 s
Accuracy Score 0.8814722395508422
Model Size: 63785569 Bytes
---------- logistic_regression ----------
CPU times: total: 15.6 ms
Wall time: 19.2 ms
Accuracy Score 0.8802245789145352
Model Size: 28645 Bytes
---------- random_forest ----------
CPU times: total: 312 ms
Wall time: 319 ms
Accuracy Score 0.8833437305053026
Model Size: 16077007 Bytes
---------- decision_tree ----------
CPU times: total: 15.6 ms
Wall time: 16.7 ms
Accuracy Score 0.8665003119151591
Model Size: 10887 Bytes
