In [1]:
# Importing basic libraries
import pandas as pd
import numpy as np
import spacy
import string
punct = string.punctuation
nlp = spacy.load('en_core_web_sm') #Loading spacy english

In [2]:
df = pd.read_csv('review.csv')  #reading reviews dataset
print(df.shape)
df.head()

(14675, 2)


Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [3]:
df['sentiment'].value_counts()

0    7712
1    6963
Name: sentiment, dtype: int64

In [4]:
df.isnull().sum()

sentiment    0
review       0
dtype: int64

In [5]:
# Text Processing 
def text_clean(text):
    text = text.lower()  #Convert text in lower case
    punc_removed = [char for char in text if char not in punct]  #Removing Punctuations
    punc_removed_join = ''.join(punc_removed) 
    
    text_out = []
    doc= nlp(punc_removed_join)
    for token in doc:
        if token.is_stop == False and token.is_alpha and len(token)>2:
            lemma = token.lemma_            #lemmatization of token word
            text_out.append(lemma)
            
    return text_out

In [6]:
# df['clean_doc'] = df['review'].apply(text_clean) #Apply Preprocessing of text
# df.head()

In [7]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer(tokenizer = text_clean,max_features=150)
# print(X.shape)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Vectorizer
cv = CountVectorizer(tokenizer = text_clean,max_features=150)

# Decleare model
classifier = LinearSVC()

In [9]:
X = df['review']
y = df['sentiment']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
X_train.shape, X_test.shape

((10272,), (4403,))

In [11]:
# classifier.fit(X_train, y_train)    #fitting training data in model
# y_pred = classifier.predict(X_test)  #getting predict value on test data

# score = classifier.score(X_test, y_test) #getting accuracy on predict and real data
# cm = confusion_matrix(y_test, y_pred)  #generating confussion metrics
# cr1 = classification_report(y_test, y_pred)  #generating classification report

# print("Logistic Regression Accuracy :   %0.3f" % score)
# print("Confussion Metrics : ","\n",cm)
# print("Classification Report : ","\n",cr1)

In [12]:
# Creating pipline
clf = Pipeline([('cv', cv), ('clf', classifier)])
clf.fit(X_train, y_train)



Pipeline(steps=[('cv',
                 CountVectorizer(max_features=150,
                                 tokenizer=<function text_clean at 0x000001BAB203E040>)),
                ('clf', LinearSVC())])

In [13]:
y_pred = clf.predict(X_test) # predicting test result

In [14]:
y_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [15]:
#  Getting Classification Report
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("Classification Report :","\n",classification_report(y_test, y_pred))

Classification Report : 
               precision    recall  f1-score   support

           0       0.85      0.80      0.82      2311
           1       0.79      0.84      0.82      2092

    accuracy                           0.82      4403
   macro avg       0.82      0.82      0.82      4403
weighted avg       0.82      0.82      0.82      4403



In [16]:
# Printing Confussion metrics
confusion_matrix(y_test, y_pred)

array([[1852,  459],
       [ 334, 1758]], dtype=int64)

In [17]:
print(clf.score(X_test,y_test))

0.8198955257778787


In [18]:
print(clf.score(X_train,y_train))

0.8275895638629284


In [32]:
print(clf.predict(['Worth of watching it. Please like it'])[0])
print(clf.predict(['Wow, this is amazing lesson'])[0])
print(clf.predict(['this sucks'])[0])
print(clf.predict(['Loved it. Amazing'])[0])
print(clf.predict(['bad .'])[0])

1
1
1
1
0


In [20]:
df['clean_doc'] = df['review'].apply(text_clean) #Cleaning text data for emotion

In [21]:
df.head()

Unnamed: 0,sentiment,review,clean_doc
0,1,Good but need updates and improvements,"[good, need, update, improvement]"
1,0,"Worst mobile i have bought ever, Battery is dr...","[bad, mobile, buy, battery, drain, like, hell,..."
2,1,when I will get my 10% cash back.... its alrea...,"[cash, january]"
3,1,Good,[good]
4,0,The worst phone everThey have changed the last...,"[bad, phone, everthey, change, phone, problem,..."


In [22]:
# Creating function for getting emotions 
def emotion(text):
    emotion_list = []
    with open('Emotions.txt', 'r') as file:
        for line in file:
            clear_line = line.replace("\n", '').replace(",", '').replace("'", '').strip()
            word, emotion = clear_line.split(':')
        
            if word in text:
                emotion_list.append(emotion)

    return emotion_list

In [23]:
df['emotion'] = df['clean_doc'].apply(emotion)

In [24]:
df.sample(50)

Unnamed: 0,sentiment,review,clean_doc,emotion
10266,1,Good Delevard,"[good, delevard]",[]
12281,1,Product is good battery turbo charge is really...,"[product, good, battery, turbo, charge, good, ...",[]
9173,1,Awesome,[awesome],[]
14454,0,"Phone is hanging after installing FB, messenge...","[phone, hang, instal, messenger, whattsappvery...",[]
11378,0,Nice phone but not very good phone,"[nice, phone, good, phone]",[]
2342,1,Nice nic charging,"[nice, nic, charge]",[]
12393,1,Very good product,"[good, product]",[]
3081,1,Good phone,"[good, phone]",[]
13062,1,Good Phone.Inbuilt Compas is not there.Schedul...,"[good, phoneinbuilt, compa, thereschedule, power]",[]
7486,1,Good looking mobile.... Battery draining fastl...,"[good, look, mobile, battery, drain, fastly, c...",[]


In [25]:
# df.to_csv('dataframe.csv')

In [26]:
import pickle

In [27]:
with open('model_sentiment.pkl','wb') as file:
    pickle.dump(clf,file)