# Importing the data 

In [None]:
# Core data-handling libraries used throughout the notebook.
import pandas as pd
import numpy as np
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides any scikit-learn warnings raised while fitting the models below —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Read the prepared review dataset and preview its first rows.
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs where this file exists.
csv_path = 'D:\\Final_data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Reviews,Lammatized,polarity,sentiment
0,Impeccable quality standards,impeccable quality standard,0.375,Positive
1,Very Good Product,good product,0.7,Positive
2,"touch is great, phone response time is superb ...",touch great phone response time superb due und...,0.558333,Positive
3,"Buy, if your Budget is below 13K.",buy budget 13k,0.0,Neutral
4,Poor performance,poor performance,-0.4,Negative


# Converting our sentiments in numerical form using LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode the textual sentiment classes as integer codes and store them in a new column.
labelencoder = LabelEncoder()
sentiment_codes = labelencoder.fit_transform(data['sentiment'])
data['sentiment_label'] = sentiment_codes

In [None]:
# Preview the frame again to confirm the new 'sentiment_label' column was added.
data.head()

Unnamed: 0,Reviews,Lammatized,polarity,sentiment,sentiment_label
0,Impeccable quality standards,impeccable quality standard,0.375,Positive,2
1,Very Good Product,good product,0.7,Positive,2
2,"touch is great, phone response time is superb ...",touch great phone response time superb due und...,0.558333,Positive,2
3,"Buy, if your Budget is below 13K.",buy budget 13k,0.0,Neutral,1
4,Poor performance,poor performance,-0.4,Negative,0


In [None]:
# Integer codes produced by the LabelEncoder for each sentiment class
# (matches the 'sentiment' / 'sentiment_label' pairs in the preview above):
# Negative = 0
# Neutral = 1
# Positive = 2

# Splitting the data into X & Y

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build a bag-of-words (token count) matrix from the lemmatized review text.
bow_vectorizer = CountVectorizer()
# Every entry is coerced to a string first; a missing (NaN) entry therefore becomes the text 'nan'.
lemmatized_text = data['Lammatized'].apply(lambda text: np.str_(text))
# NOTE(review): the vocabulary is fitted on the full dataset before the train/test split below,
# so held-out reviews contribute to the feature space — consider fitting on the training split only.
bow = bow_vectorizer.fit_transform(lemmatized_text)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target: the integer-encoded sentiment label.
y = data['sentiment_label']
# Features: the bag-of-words counts converted from a sparse matrix to a dense NumPy array.
x = bow.toarray()

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

# Model Building

## 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
# Fit a default LogisticRegression on the training split, then predict the held-out split.
model_1 = LogisticRegression()
model_1.fit(x_train, y_train)
y_pred_test_lr = model_1.predict(x_test)

# Compute the three metrics first so the print section below is purely presentational.
lr_test_matrix = confusion_matrix(y_test, y_pred_test_lr)
lr_test_report = classification_report(y_test, y_pred_test_lr)
lr_test_accuracy = accuracy_score(y_test, y_pred_test_lr)

print('*************Testing Accuracy*************')
print('confusion_matrix \n', lr_test_matrix)
print('*********************************************')
print('classification_report \n', lr_test_report)
print('*********************************************')
print("Accuracy score \n", lr_test_accuracy)

*************Testing Accuracy*************
confusion_matrix 
 [[ 314   57  108]
 [  14  633   47]
 [  36   70 2691]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       0.86      0.66      0.74       479
           1       0.83      0.91      0.87       694
           2       0.95      0.96      0.95      2797

    accuracy                           0.92      3970
   macro avg       0.88      0.84      0.86      3970
weighted avg       0.92      0.92      0.91      3970

*********************************************
Accuracy score 
 0.9163727959697733


In [None]:
# Score the fitted logistic regression on the data it was trained on
# (compare with the testing metrics to judge over-fitting).
y_pred_train_lr = model_1.predict(x_train)

lr_train_matrix = confusion_matrix(y_train, y_pred_train_lr)
lr_train_report = classification_report(y_train, y_pred_train_lr)
lr_train_accuracy = accuracy_score(y_train, y_pred_train_lr)

print('*************Training Accuracy*************')
print('confusion_matrix \n', lr_train_matrix)
print('*********************************************')
print('classification_report \n', lr_train_report)
print('*********************************************')
print("Accuracy score \n", lr_train_accuracy)

*************Training Accuracy*************
confusion_matrix 
 [[1389   25   16]
 [   0 1882   18]
 [   1   21 8558]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       1.00      0.97      0.99      1430
           1       0.98      0.99      0.98      1900
           2       1.00      1.00      1.00      8580

    accuracy                           0.99     11910
   macro avg       0.99      0.99      0.99     11910
weighted avg       0.99      0.99      0.99     11910

*********************************************
Accuracy score 
 0.993198992443325


## 2. Linear support vector classifier

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Linear support vector classifier.
# random_state is pinned (same seed as the train/test split) so repeated runs of this cell
# produce the same fitted model and the same reported metrics.
model_2 = LinearSVC(verbose=0, random_state=42)

# Training
model_2.fit(x_train, y_train)
# Testing: predict the held-out split
y_pred_test_lsvc = model_2.predict(x_test)

print('*************Testing Accuracy*************')
print('confusion_matrix \n', confusion_matrix(y_test, y_pred_test_lsvc))
print('*********************************************')
print('classification_report \n', classification_report(y_test, y_pred_test_lsvc))
print('*********************************************')
print("Accuracy score \n", accuracy_score(y_test, y_pred_test_lsvc))

*************Testing Accuracy*************
confusion_matrix 
 [[ 342   43   94]
 [  24  641   29]
 [  35   62 2700]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       0.85      0.71      0.78       479
           1       0.86      0.92      0.89       694
           2       0.96      0.97      0.96      2797

    accuracy                           0.93      3970
   macro avg       0.89      0.87      0.88      3970
weighted avg       0.93      0.93      0.93      3970

*********************************************
Accuracy score 
 0.9277078085642317


In [None]:
# Score the fitted Linear SVC on the data it was trained on
# (compare with the testing metrics to judge over-fitting).
y_pred_train_lsvc = model_2.predict(x_train)

lsvc_train_matrix = confusion_matrix(y_train, y_pred_train_lsvc)
lsvc_train_report = classification_report(y_train, y_pred_train_lsvc)
lsvc_train_accuracy = accuracy_score(y_train, y_pred_train_lsvc)

print('*************Training Accuracy*************')
print('confusion_matrix \n', lsvc_train_matrix)
print('*********************************************')
print('classification_report \n', lsvc_train_report)
print('*********************************************')
print("Accuracy score \n", lsvc_train_accuracy)

*************Training Accuracy*************
confusion_matrix 
 [[1430    0    0]
 [   0 1899    1]
 [   0    0 8580]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1430
           1       1.00      1.00      1.00      1900
           2       1.00      1.00      1.00      8580

    accuracy                           1.00     11910
   macro avg       1.00      1.00      1.00     11910
weighted avg       1.00      1.00      1.00     11910

*********************************************
Accuracy score 
 0.9999160369437448


## 3. Decision tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree using information gain (entropy) as the split criterion.
# random_state is pinned (same seed as the train/test split) so the tree — and therefore the
# reported metrics — are identical on every run.
model_3 = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Training
model_3.fit(x_train, y_train)
# Testing: predict the held-out split
y_pred_test_dtc = model_3.predict(x_test)

print('*************Testing Accuracy*************')
print('confusion_matrix \n', confusion_matrix(y_test, y_pred_test_dtc))
print('*********************************************')
print('classification_report \n', classification_report(y_test, y_pred_test_dtc))
print('*********************************************')
print("Accuracy score \n", accuracy_score(y_test, y_pred_test_dtc))

*************Testing Accuracy*************
confusion_matrix 
 [[ 264   48  167]
 [  30  621   43]
 [ 131   73 2593]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       0.62      0.55      0.58       479
           1       0.84      0.89      0.86       694
           2       0.93      0.93      0.93      2797

    accuracy                           0.88      3970
   macro avg       0.79      0.79      0.79      3970
weighted avg       0.87      0.88      0.87      3970

*********************************************
Accuracy score 
 0.8760705289672545


In [None]:
# Score the fitted decision tree on the data it was trained on
# (compare with the testing metrics to judge over-fitting).
y_pred_train_dtc = model_3.predict(x_train)

dtc_train_matrix = confusion_matrix(y_train, y_pred_train_dtc)
dtc_train_report = classification_report(y_train, y_pred_train_dtc)
dtc_train_accuracy = accuracy_score(y_train, y_pred_train_dtc)

print('*************Training Accuracy*************')
print('confusion_matrix \n', dtc_train_matrix)
print('*********************************************')
print('classification_report \n', dtc_train_report)
print('*********************************************')
print("Accuracy score \n", dtc_train_accuracy)

*************Training Accuracy*************
confusion_matrix 
 [[1430    0    0]
 [   0 1900    0]
 [   0    0 8580]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1430
           1       1.00      1.00      1.00      1900
           2       1.00      1.00      1.00      8580

    accuracy                           1.00     11910
   macro avg       1.00      1.00      1.00     11910
weighted avg       1.00      1.00      1.00     11910

*********************************************
Accuracy score 
 1.0


## 4. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyper-parameters.
# The forest is built from random bootstrap samples and random feature subsets, so
# random_state is pinned (same seed as the train/test split) to make the metrics reproducible.
model_4 = RandomForestClassifier(random_state=42)

# Training
model_4.fit(x_train, y_train)
# Testing: predict the held-out split
y_pred_test_rfc = model_4.predict(x_test)

print('*************Testing Accuracy*************')
print('confusion_matrix \n', confusion_matrix(y_test, y_pred_test_rfc))
print('*********************************************')
print('classification_report \n', classification_report(y_test, y_pred_test_rfc))
print('*********************************************')
print("Accuracy score \n", accuracy_score(y_test, y_pred_test_rfc))

*************Testing Accuracy*************
confusion_matrix 
 [[ 124   50  305]
 [   1  557  136]
 [   7   71 2719]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       0.94      0.26      0.41       479
           1       0.82      0.80      0.81       694
           2       0.86      0.97      0.91      2797

    accuracy                           0.86      3970
   macro avg       0.87      0.68      0.71      3970
weighted avg       0.86      0.86      0.83      3970

*********************************************
Accuracy score 
 0.8564231738035264


In [None]:
# Score the fitted random forest on the data it was trained on
# (compare with the testing metrics to judge over-fitting).
y_pred_train_rfc = model_4.predict(x_train)

rfc_train_matrix = confusion_matrix(y_train, y_pred_train_rfc)
rfc_train_report = classification_report(y_train, y_pred_train_rfc)
rfc_train_accuracy = accuracy_score(y_train, y_pred_train_rfc)

print('*************Training Accuracy*************')
print('confusion_matrix \n', rfc_train_matrix)
print('*********************************************')
print('classification_report \n', rfc_train_report)
print('*********************************************')
print("Accuracy score \n", rfc_train_accuracy)

*************Training Accuracy*************
confusion_matrix 
 [[1430    0    0]
 [   0 1900    0]
 [   0    0 8580]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1430
           1       1.00      1.00      1.00      1900
           2       1.00      1.00      1.00      8580

    accuracy                           1.00     11910
   macro avg       1.00      1.00      1.00     11910
weighted avg       1.00      1.00      1.00     11910

*********************************************
Accuracy score 
 1.0


## 5. KNN

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier that votes among the 3 closest training samples.
model_5 = KNeighborsClassifier(n_neighbors=3)
model_5.fit(x_train, y_train)
y_pred_test_knn = model_5.predict(x_test)

# Compute the three metrics first so the print section below is purely presentational.
knn_test_matrix = confusion_matrix(y_test, y_pred_test_knn)
knn_test_report = classification_report(y_test, y_pred_test_knn)
knn_test_accuracy = accuracy_score(y_test, y_pred_test_knn)

print('*************Testing Accuracy*************')
print('confusion_matrix \n', knn_test_matrix)
print('*********************************************')
print('classification_report \n', knn_test_report)
print('*********************************************')
print("Accuracy score \n", knn_test_accuracy)

*************Testing Accuracy*************
confusion_matrix 
 [[  94  259  126]
 [   4  635   55]
 [  79  649 2069]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       0.53      0.20      0.29       479
           1       0.41      0.91      0.57       694
           2       0.92      0.74      0.82      2797

    accuracy                           0.70      3970
   macro avg       0.62      0.62      0.56      3970
weighted avg       0.78      0.70      0.71      3970

*********************************************
Accuracy score 
 0.7047858942065491


In [None]:
# Score the fitted KNN model on the data it was trained on
# (compare with the testing metrics to judge over-fitting).
y_pred_train_knn = model_5.predict(x_train)

knn_train_matrix = confusion_matrix(y_train, y_pred_train_knn)
knn_train_report = classification_report(y_train, y_pred_train_knn)
knn_train_accuracy = accuracy_score(y_train, y_pred_train_knn)

print('*************Training Accuracy*************')
print('confusion_matrix \n', knn_train_matrix)
print('*********************************************')
print('classification_report \n', knn_train_report)
print('*********************************************')
print("Accuracy score \n", knn_train_accuracy)

*************Training Accuracy*************
confusion_matrix 
 [[ 754  469  207]
 [  18 1800   82]
 [ 185 1131 7264]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       0.79      0.53      0.63      1430
           1       0.53      0.95      0.68      1900
           2       0.96      0.85      0.90      8580

    accuracy                           0.82     11910
   macro avg       0.76      0.77      0.74     11910
weighted avg       0.87      0.82      0.83     11910

*********************************************
Accuracy score 
 0.8243492863140218


## Accuracy comparison

In [None]:
# Display names of the five classifiers, in the order they were trained above.
model = [
    'Logistic Regression',
    'Linear SVC',
    'Decision Tree',
    'Random Forest',
    'KNN',
]

# Training-set accuracy of each model as a percentage (same order as `model`).
training = [
    accuracy_score(y_train, train_predictions) * 100
    for train_predictions in (
        y_pred_train_lr,
        y_pred_train_lsvc,
        y_pred_train_dtc,
        y_pred_train_rfc,
        y_pred_train_knn,
    )
]

# Test-set accuracy of each model as a percentage (same order as `model`).
testing = [
    accuracy_score(y_test, test_predictions) * 100
    for test_predictions in (
        y_pred_test_lr,
        y_pred_test_lsvc,
        y_pred_test_dtc,
        y_pred_test_rfc,
        y_pred_test_knn,
    )
]

In [None]:
# Side-by-side table of training vs. testing accuracy (in percent), one row per model.
df = pd.DataFrame({
    'Model': model,
    'Training': training,
    'Testing': testing,
})
df

Unnamed: 0,Model,Training,Testing
0,Logistic Regression,99.319899,91.63728
1,Linear SVC,99.991604,92.770781
2,Decision Tree,100.0,87.607053
3,Random Forest,100.0,85.642317
4,KNN,82.434929,70.478589


### Linear SVC achieves the highest testing accuracy of the five models, so we use Linear SVC.

# Create the pickle file.

In [None]:
import pickle
from sklearn.pipeline import Pipeline

In [None]:
# Chain the already-fitted vectorizer and the already-fitted Linear SVC so raw text
# can be passed straight to the pipeline for prediction.
pipeline_steps = [
    ('CountVectorizer', bow_vectorizer),
    ('clf', model_2),
]
pipeline = Pipeline(pipeline_steps)

In [None]:
# Serialize the vectorizer + classifier pipeline to disk.
file_name = 'final_model_1.pkl'
# The context manager closes (and flushes) the file handle even if pickling fails;
# the previous bare open() call never closed it.
with open(file_name, 'wb') as model_file:
    pickle.dump(pipeline, model_file)

### Top five most negative reviews

In [None]:
# Show up to 100 characters per cell so the review text below is readable.
pd.set_option('display.max_colwidth', 100)

In [None]:
# Polarity scores ordered from lowest to highest (ascending is the default sort order).
polarity_low_to_high = data['polarity'].sort_values()
polarity_low_to_high

8342    -1.0
7776    -1.0
1153    -1.0
514     -1.0
14647   -1.0
        ... 
1709     1.0
15343    1.0
682      1.0
606      1.0
3401     1.0
Name: polarity, Length: 15880, dtype: float64

In [None]:
# Select the five reviews with the lowest polarity directly from the scores instead of
# hard-coded row positions, which did not correspond to the lowest-polarity rows.
# When several rows tie on polarity, nsmallest keeps the ones that appear first in the frame.
data.nsmallest(5, 'polarity')['Reviews']

14589                                                             Good product and amazing service by Amazon
6765                                                                                           It just cool.
7308     Anonymous, 01 Nov 2015\nhello moto 3 gen is having auto reboot issue meanwhile iphone is always ...
1462     Amazon india has successfully delivered a used product. The phone was activated on 9th april but...
10383    Terrible phone. It freezes a lot, very slow. I use this phone with Straight Talk. When you use a...
Name: Reviews, dtype: object

### Top five most positive reviews

In [None]:
# Polarity scores ordered from highest to lowest.
polarity_high_to_low = data['polarity'].sort_values(ascending=False)
polarity_high_to_low

2714     1.0
3457     1.0
2164     1.0
2169     1.0
2170     1.0
        ... 
4890    -1.0
7776    -1.0
48      -1.0
8386    -1.0
14647   -1.0
Name: polarity, Length: 15880, dtype: float64

In [None]:
# Select the five reviews with the highest polarity directly from the scores instead of
# hard-coded row positions, which did not correspond to the highest-polarity rows.
# When several rows tie on polarity, nlargest keeps the ones that appear first in the frame.
data.nlargest(5, 'polarity')['Reviews']

10781    Love my iPhone 4!  Would dream of upgrading to a 5 or 6. My iPhone has so much more memory than ...
14969                                                                     Awesome phone my dad loves it!! :)
5792                                                              This phone 16GB version or htc sensation? 
5793                                            [deleted post]\nIphone 4s CMDA don't have slot for sim card.
11028    Received my first with this product and I couldn't be happier. It has a ton of storage and easy ...
Name: Reviews, dtype: object