In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from pickle import dump
import warnings
warnings.filterwarnings('ignore')

In [2]:
data=pd.read_csv('review_rating_1.csv',index_col=[0])
data

Unnamed: 0,Clean_Review,Rating,Number of Characters,Rating_Analysis
0,expensive park get deal anniversary arrive lat...,4,427,Positive
1,ok nothing special charge diamond member hilto...,2,1408,Negative
2,experience monaco seattle levelpositives large...,3,1147,Positive
3,unique wonderful time monaco excellent short s...,5,456,Positive
4,go seahawk game awesome downfall view build co...,5,1028,Positive
...,...,...,...,...
20486,best keep secret rd time stay charm star ca be...,5,635,Positive
20487,price view quick place sightsdirectly street s...,4,240,Positive
20488,ok look modern outside desk particularly frien...,2,356,Negative
20489,theft ruin vacation open sept guests week happ...,1,4517,Negative


In [3]:
data['Rating_Analysis'] = data['Rating_Analysis'].replace({'Negative': -1})
data['Rating_Analysis'] = data['Rating_Analysis'].replace({'Positive': 1})
data['Rating_Analysis'] = data['Rating_Analysis'].replace({'Neutral': 0})
data

Unnamed: 0,Clean_Review,Rating,Number of Characters,Rating_Analysis
0,expensive park get deal anniversary arrive lat...,4,427,1
1,ok nothing special charge diamond member hilto...,2,1408,-1
2,experience monaco seattle levelpositives large...,3,1147,1
3,unique wonderful time monaco excellent short s...,5,456,1
4,go seahawk game awesome downfall view build co...,5,1028,1
...,...,...,...,...
20486,best keep secret rd time stay charm star ca be...,5,635,1
20487,price view quick place sightsdirectly street s...,4,240,1
20488,ok look modern outside desk particularly frien...,2,356,-1
20489,theft ruin vacation open sept guests week happ...,1,4517,-1


## TF-ID Vectorizer

In [5]:
count=TfidfVectorizer()
x_train_df=count.fit(data['Clean_Review'])

In [6]:
x_train=x_train_df.transform(data['Clean_Review'])

In [7]:
X_train,X_test,y_train,y_test=train_test_split(x_train,data['Rating_Analysis'],shuffle=True,random_state=30,test_size=0.3)

In [8]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(14343, 69689) (6148, 69689) (14343,) (6148,)


In [9]:
def Accuracy(y_train,y_train_pred,y_test,y_test_pred):
    print('Train Accuracy\n')
    print(classification_report(y_train,y_train_pred))
    print('\n',confusion_matrix(y_train,y_train_pred))
    print('\n',accuracy_score(y_train,y_train_pred))
    print('*'*100)
    print('Test Accuracy\n')
    print(classification_report(y_test,y_test_pred))
    print('\n',confusion_matrix(y_test,y_test_pred))
    print('\n',accuracy_score(y_test,y_test_pred)) 

## Final model 

In [10]:
from lightgbm import LGBMClassifier

In [11]:
LGBM = LGBMClassifier()

In [12]:
LGBM.fit(X_train, y_train)

LGBMClassifier()

In [13]:
LGBM_train=LGBM.predict(X_train)
LGBM_test=LGBM.predict(X_test)

In [14]:
LGBM_model=Accuracy(LGBM_train,y_train,LGBM_test,y_test)
LGBM_model

Train Accuracy

              precision    recall  f1-score   support

          -1       0.92      0.98      0.95      2113
           1       1.00      0.98      0.99     12230

    accuracy                           0.98     14343
   macro avg       0.96      0.98      0.97     14343
weighted avg       0.98      0.98      0.98     14343


 [[ 2063    50]
 [  184 12046]]

 0.9836854214599456
****************************************************************************************************
Test Accuracy

              precision    recall  f1-score   support

          -1       0.58      0.82      0.68       693
           1       0.98      0.93      0.95      5455

    accuracy                           0.91      6148
   macro avg       0.78      0.87      0.82      6148
weighted avg       0.93      0.91      0.92      6148


 [[ 565  128]
 [ 402 5053]]

 0.9137931034482759


## deployment

In [15]:
# build the intelligence for tfid Vectorizer 
x=data['Clean_Review']
y=data['Rating_Analysis']

In [16]:
tfid=TfidfVectorizer()
tfid_deploy=tfid.fit(x)

In [17]:
# converting text into numeric for svm
x_train=tfid_deploy.transform(x)

In [18]:
LGBM_model_deploy=LGBMClassifier()
LGBM_model_deploy.fit(x_train,y)

LGBMClassifier()

In [19]:
#saving svm n tfid into pkl
dump(obj=LGBM_model_deploy,file=open('LGBM_model_deploy.pkl','wb'))
dump(obj=tfid_deploy,file=open('tfid_deploy.pkl','wb'))