# Import data 

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the prepared review table (raw review, lemmatized text, polarity score,
# sentiment class) and preview its first rows.
data = pd.read_csv(filepath_or_buffer='Final_data.csv')
data.head(n=5)

Unnamed: 0,Reviews,Lammatized,polarity,sentiment
0,Bad,bad,-0.7,Negative
1,Charger not working...😫,charger working,0.0,Neutral
2,one of the speaker doesn't works,one speaker work,0.0,Neutral
3,The piece i got is Worst piece and it is refur...,piece got worst piece refurbished lot feature ...,-0.31,Negative
4,awesome phone with really great look and perfo...,awesome phone really great look performance lo...,0.58,Positive


# Convert sentiment labels to numeric form

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Learn the set of sentiment classes, then store their integer codes in a new
# column next to the original string labels.
labelencoder = LabelEncoder()
labelencoder.fit(data['sentiment'])
data['sentiment_label'] = labelencoder.transform(data['sentiment'])

In [5]:
# Preview the frame again to confirm the new sentiment_label column.
data.head(n=5)

Unnamed: 0,Reviews,Lammatized,polarity,sentiment,sentiment_label
0,Bad,bad,-0.7,Negative,0
1,Charger not working...😫,charger working,0.0,Neutral,1
2,one of the speaker doesn't works,one speaker work,0.0,Neutral,1
3,The piece i got is Worst piece and it is refur...,piece got worst piece refurbished lot feature ...,-0.31,Negative,0
4,awesome phone with really great look and perfo...,awesome phone really great look performance lo...,0.58,Positive,2


In [6]:
# Label mapping produced by the LabelEncoder above:
# Negative = 0
# Neutral = 1
# Positive = 2

# Vectorize and Split Data

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Build the bag-of-words (token count) matrix from the lemmatized review text.
# NOTE(review): the vocabulary is learned from every row, including rows that
# later end up in the test split.
bow_vectorizer = CountVectorizer()
# Missing lemmatized text (NaN) becomes an empty document. Casting NaN to a
# string would instead inject the literal token "nan" into the vocabulary.
bow = bow_vectorizer.fit_transform(data['Lammatized'].fillna('').astype(str))

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Target: the integer-encoded sentiment class.
y = data['sentiment_label']
# Features: the document-term counts, materialized as a dense array.
x = bow.toarray()

In [11]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

# Model Building

## 1. Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [13]:
# Logistic regression with scikit-learn's default hyperparameters.
model_1 = LogisticRegression()

# Fit on the training split, then predict the held-out test split.
model_1.fit(x_train, y_train)
y_pred_test_lr = model_1.predict(x_test)

# Compute the evaluation metrics up front, then print them.
cm_test_lr = confusion_matrix(y_test, y_pred_test_lr)
report_test_lr = classification_report(y_test, y_pred_test_lr)
acc_test_lr = accuracy_score(y_test, y_pred_test_lr)
divider = '*********************************************'

print('*************Testing Accuracy*************')
print('confusion_matrix \n', cm_test_lr)
print(divider)
print('classification_report \n', report_test_lr)
print(divider)
print("Accuracy score \n", acc_test_lr)

*************Testing Accuracy*************
confusion_matrix 
 [[ 426   84  105]
 [   5 1119   36]
 [  51   96 2921]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       0.88      0.69      0.78       615
           1       0.86      0.96      0.91      1160
           2       0.95      0.95      0.95      3068

    accuracy                           0.92      4843
   macro avg       0.90      0.87      0.88      4843
weighted avg       0.92      0.92      0.92      4843

*********************************************
Accuracy score 
 0.9221556886227545


In [14]:
# Score the logistic regression on the rows it was trained on, for comparison
# with the test metrics.
y_pred_train_lr = model_1.predict(x_train)

cm_train_lr = confusion_matrix(y_train, y_pred_train_lr)
report_train_lr = classification_report(y_train, y_pred_train_lr)
acc_train_lr = accuracy_score(y_train, y_pred_train_lr)
divider = '*********************************************'

print('*************Training Accuracy*************')
print('confusion_matrix \n', cm_train_lr)
print(divider)
print('classification_report \n', report_train_lr)
print(divider)
print("Accuracy score \n", acc_train_lr)

*************Training Accuracy*************
confusion_matrix 
 [[1690   45   30]
 [   3 3471   17]
 [   4   33 9235]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       1.00      0.96      0.98      1765
           1       0.98      0.99      0.99      3491
           2       0.99      1.00      1.00      9272

    accuracy                           0.99     14528
   macro avg       0.99      0.98      0.99     14528
weighted avg       0.99      0.99      0.99     14528

*********************************************
Accuracy score 
 0.9909140969162996


## 2. Linear support vector classifier

In [12]:
from sklearn.svm import LinearSVC

In [15]:
# Linear support vector classifier. The solver can shuffle the data with a
# pseudo-random generator, so a fixed seed keeps the fitted (and later exported)
# model reproducible across runs.
model_2 = LinearSVC(verbose=0, random_state=42)

# Training
model_2.fit(x_train, y_train)
# Testing
y_pred_test_lsvc = model_2.predict(x_test)

# Report the confusion matrix, per-class metrics and overall accuracy on the
# held-out split.
print('*************Testing Accuracy*************')
print('confusion_matrix \n', confusion_matrix(y_test, y_pred_test_lsvc))
print('*********************************************')
print('classification_report \n', classification_report(y_test, y_pred_test_lsvc))
print('*********************************************')
print("Accuracy score \n", accuracy_score(y_test, y_pred_test_lsvc))

*************Testing Accuracy*************
confusion_matrix 
 [[ 480   49   86]
 [  11 1125   24]
 [  50   81 2937]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       0.89      0.78      0.83       615
           1       0.90      0.97      0.93      1160
           2       0.96      0.96      0.96      3068

    accuracy                           0.94      4843
   macro avg       0.92      0.90      0.91      4843
weighted avg       0.94      0.94      0.94      4843

*********************************************
Accuracy score 
 0.9378484410489366


In [16]:
# Score the Linear SVC on the rows it was trained on, for comparison with the
# test metrics.
y_pred_train_lsvc = model_2.predict(x_train)

cm_train_lsvc = confusion_matrix(y_train, y_pred_train_lsvc)
report_train_lsvc = classification_report(y_train, y_pred_train_lsvc)
acc_train_lsvc = accuracy_score(y_train, y_pred_train_lsvc)
divider = '*********************************************'

print('*************Training Accuracy*************')
print('confusion_matrix \n', cm_train_lsvc)
print(divider)
print('classification_report \n', report_train_lsvc)
print(divider)
print("Accuracy score \n", acc_train_lsvc)

*************Training Accuracy*************
confusion_matrix 
 [[1763    0    2]
 [   0 3488    3]
 [   1    0 9271]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1765
           1       1.00      1.00      1.00      3491
           2       1.00      1.00      1.00      9272

    accuracy                           1.00     14528
   macro avg       1.00      1.00      1.00     14528
weighted avg       1.00      1.00      1.00     14528

*********************************************
Accuracy score 
 0.9995870044052864


## 3. Decision tree classifier

In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Decision tree that splits on information gain (entropy). Tree construction
# permutes candidate features randomly, so a fixed seed is required for the
# fitted tree and its scores to be reproducible across runs.
model_3 = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Training
model_3.fit(x_train, y_train)
# Testing
y_pred_test_dtc = model_3.predict(x_test)

# Report the confusion matrix, per-class metrics and overall accuracy on the
# held-out split.
print('*************Testing Accuracy*************')
print('confusion_matrix \n', confusion_matrix(y_test, y_pred_test_dtc))
print('*********************************************')
print('classification_report \n', classification_report(y_test, y_pred_test_dtc))
print('*********************************************')
print("Accuracy score \n", accuracy_score(y_test, y_pred_test_dtc))

*************Testing Accuracy*************
confusion_matrix 
 [[ 331   96  188]
 [  42 1063   55]
 [ 142  110 2816]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       0.64      0.54      0.59       615
           1       0.84      0.92      0.88      1160
           2       0.92      0.92      0.92      3068

    accuracy                           0.87      4843
   macro avg       0.80      0.79      0.79      4843
weighted avg       0.87      0.87      0.87      4843

*********************************************
Accuracy score 
 0.8692958909766674


In [20]:
# Score the decision tree on the rows it was trained on, for comparison with
# the test metrics.
y_pred_train_dtc = model_3.predict(x_train)

cm_train_dtc = confusion_matrix(y_train, y_pred_train_dtc)
report_train_dtc = classification_report(y_train, y_pred_train_dtc)
acc_train_dtc = accuracy_score(y_train, y_pred_train_dtc)
divider = '*********************************************'

print('*************Training Accuracy*************')
print('confusion_matrix \n', cm_train_dtc)
print(divider)
print('classification_report \n', report_train_dtc)
print(divider)
print("Accuracy score \n", acc_train_dtc)

*************Training Accuracy*************
confusion_matrix 
 [[1765    0    0]
 [   0 3491    0]
 [   0    0 9272]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1765
           1       1.00      1.00      1.00      3491
           2       1.00      1.00      1.00      9272

    accuracy                           1.00     14528
   macro avg       1.00      1.00      1.00     14528
weighted avg       1.00      1.00      1.00     14528

*********************************************
Accuracy score 
 1.0


## 4. XGBoost Classifier

In [21]:
from xgboost import XGBClassifier

In [22]:
# Gradient-boosted trees with a learning rate of 0.9; every other
# hyperparameter is left at the library default.
model_4 = XGBClassifier(learning_rate=0.9)

# Fit on the training split, then predict the held-out test split.
model_4.fit(x_train, y_train)
y_pred_test_xgbc = model_4.predict(x_test)

# Compute the evaluation metrics up front, then print them.
cm_test_xgbc = confusion_matrix(y_test, y_pred_test_xgbc)
report_test_xgbc = classification_report(y_test, y_pred_test_xgbc)
acc_test_xgbc = accuracy_score(y_test, y_pred_test_xgbc)
divider = '*********************************************'

print('*************Testing Accuracy*************')
print('confusion_matrix \n', cm_test_xgbc)
print(divider)
print('classification_report \n', report_test_xgbc)
print(divider)
print("Accuracy score \n", acc_test_xgbc)

*************Testing Accuracy*************
confusion_matrix 
 [[ 449   52  114]
 [   6 1138   16]
 [  51   78 2939]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       0.89      0.73      0.80       615
           1       0.90      0.98      0.94      1160
           2       0.96      0.96      0.96      3068

    accuracy                           0.93      4843
   macro avg       0.91      0.89      0.90      4843
weighted avg       0.93      0.93      0.93      4843

*********************************************
Accuracy score 
 0.9345447036960561


In [23]:
# Score the XGBoost model on the rows it was trained on, for comparison with
# the test metrics.
y_pred_train_xgbc = model_4.predict(x_train)

cm_train_xgbc = confusion_matrix(y_train, y_pred_train_xgbc)
report_train_xgbc = classification_report(y_train, y_pred_train_xgbc)
acc_train_xgbc = accuracy_score(y_train, y_pred_train_xgbc)
divider = '*********************************************'

print('*************Training Accuracy*************')
print('confusion_matrix \n', cm_train_xgbc)
print(divider)
print('classification_report \n', report_train_xgbc)
print(divider)
print("Accuracy score \n", acc_train_xgbc)

*************Training Accuracy*************
confusion_matrix 
 [[1640  103   22]
 [   6 3469   16]
 [  11  107 9154]]
*********************************************
classification_report 
               precision    recall  f1-score   support

           0       0.99      0.93      0.96      1765
           1       0.94      0.99      0.97      3491
           2       1.00      0.99      0.99      9272

    accuracy                           0.98     14528
   macro avg       0.98      0.97      0.97     14528
weighted avg       0.98      0.98      0.98     14528

*********************************************
Accuracy score 
 0.9817593612334802


## Accuracy comparison

In [24]:
# Display names for the four classifiers, in the order they were trained.
# model_1 is a LogisticRegression, so it is labelled "Logistic Regression"
# (it was previously mislabelled as "Linear Regression").
model = ['Logistic Regression','Linear SVC','Decision Tree','XGB Classifier']

# Accuracy (as a percentage) of each model on the training split, in the same
# order as `model`.
training = [
    accuracy_score(y_train, train_pred) * 100
    for train_pred in (
        y_pred_train_lr,
        y_pred_train_lsvc,
        y_pred_train_dtc,
        y_pred_train_xgbc,
    )
]

# Accuracy (as a percentage) of each model on the held-out test split.
testing = [
    accuracy_score(y_test, test_pred) * 100
    for test_pred in (
        y_pred_test_lr,
        y_pred_test_lsvc,
        y_pred_test_dtc,
        y_pred_test_xgbc,
    )
]

In [25]:
# Side-by-side table of training vs. testing accuracy for each model.
df = pd.DataFrame(
    {
        'Model': model,
        'Training': training,
        'Testing': testing,
    }
)
df

Unnamed: 0,Model,Training,Testing
0,Linear Regression,99.09141,92.215569
1,Linear SVC,99.9587,93.784844
2,Decision Tree,100.0,86.929589
3,XGB Classifier,98.175936,93.45447


### The Linear SVC model achieves the best test accuracy of the four models

# Export Model

In [17]:
import pickle
from sklearn.pipeline import Pipeline

In [18]:
# Chain the bag-of-words vectorizer and the Linear SVC model into a single
# estimator so both steps travel together.
pipeline = Pipeline(
    steps=[
        ('CountVectorizer', bow_vectorizer),
        ('clf', model_2),
    ]
)

In [19]:
# Serialize the vectorizer + classifier pipeline to disk.
file_name = 'final_model.pkl'
# The context manager closes (and flushes) the file even if pickling fails;
# the previous bare open() call left the handle open.
with open(file_name, 'wb') as model_file:
    pickle.dump(pipeline, model_file)

### Top five most negative reviews

In [53]:
# Show up to 100 characters per cell so review text is not cut off too early.
pd.set_option('display.max_colwidth', 100)

In [54]:
# Polarity scores ordered from most negative to most positive
# (sort_values sorts in ascending order by default).
data['polarity'].sort_values()

18589   -1.0
6765    -1.0
7308    -1.0
1462    -1.0
10383   -1.0
        ... 
7007     1.0
11851    1.0
11837    1.0
6170     1.0
9014     1.0
Name: polarity, Length: 19371, dtype: float64

In [55]:
# Select the five lowest-polarity reviews directly from the sorted scores
# rather than hard-coding row positions copied from a previous run's output,
# which would silently go stale if the data changed.
most_negative_idx = data['polarity'].sort_values(ascending=True).index[:5]
data.loc[most_negative_idx, 'Reviews']

18589                                                                                 worst... get a samsung
6765                                                                                            Worst mobile
7308                                                                            it has a horrible selfie cam
1462     Terrible battery life and camera. Use my phone as my camera but this was the reason I switched. ...
10383                               I did that with tooth paste but nothing improves in fact i becomes worst
Name: Reviews, dtype: object

### Top five most positive reviews

In [56]:
# Polarity scores ordered from most positive to most negative.
data.loc[:, 'polarity'].sort_values(ascending=False)

10781    1.0
14969    1.0
5792     1.0
5793     1.0
11028    1.0
        ... 
15694   -1.0
6461    -1.0
6765    -1.0
9180    -1.0
6498    -1.0
Name: polarity, Length: 19371, dtype: float64

In [57]:
# Select the five highest-polarity reviews directly from the sorted scores
# rather than hard-coding row positions copied from a previous run's output,
# which would silently go stale if the data changed.
most_positive_idx = data['polarity'].sort_values(ascending=False).index[:5]
data.loc[most_positive_idx, 'Reviews']

10781                                                                    definetly 4s... the best.
14969    So how would you describe other phones battery life since iphone is the BEST in its class
5792                                                                           An awesome. Product
5793                                                                             Excellent product
11028                                                                               best option 4s
Name: Reviews, dtype: object