In [1]:
#importing the packages
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
nlp = spacy.load('en_core_web_sm')

In [2]:
#reading the data
data_yelp = pd.read_csv('datasets/yelp_labelled.txt',sep="\t",header=None)
column_name=['Review','Sentiment']
data_yelp.columns = column_name
data_amazon = pd.read_csv('datasets/amazon_cells_labelled.txt',sep="\t",header=None)
data_amazon.columns = column_name
data_imdb = pd.read_csv('datasets/imdb_labelled.txt',sep='\t',header=None)
data_imdb.columns = column_name

In [3]:
#Merging all the data 
data = data_yelp.append([data_amazon,data_imdb])


In [4]:
#Getting stopwords and punctuation list
from spacy.lang.en.stop_words import STOP_WORDS
import string
stopwords = list(STOP_WORDS)
punctuation = string.punctuation

In [5]:
# function for text cleaning
def text_data_cleaning(sentence):
    doc = nlp(sentence)
    tokens=[]
    for token in doc:
        if token.lemma_ !="-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    cleared_token = []
    for token in tokens:
        if token not in stopwords and token not in punctuation:
            cleared_token.append(token)
    return cleared_token

In [6]:
#Inititalising tfidf and classifier variable
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
classifer = LinearSVC()

In [7]:
#Splitting the data into trainnig and testing data
X= data['Review']
y= data['Sentiment']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=4)
# X_train.shape,X_test.shape

In [8]:
# Applying pipelining
clf = Pipeline([('tfidf',tfidf),('clf',classifer)])

In [9]:
#Fitting the model
clf.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [10]:
#predicting the result for the testing dataset
y_pred = clf.predict(X_test)

In [11]:
#printing the classification report test data vs predictted data  
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.77      0.77       267
           1       0.78      0.78      0.78       283

   micro avg       0.78      0.78      0.78       550
   macro avg       0.78      0.78      0.78       550
weighted avg       0.78      0.78      0.78       550



In [12]:
text = input("Enter movie review:")
temp = clf.predict([text])
if temp[0] == 0:
    print("\nReview is negative.")
    
else:
    print("\nReview is positive.")

Enter movie review:I will give 100/100 rating to 'Dil Bechara' movie❤️ 'Dil Bechara' The best movie I have ever seen. I watched this movie only because of Sushant❤️. Sushant❤️ character in dil bechara is almost similar to himself. Superb story, Superior acting sushant acting is as like as he lived his own character ,climax is so emotional... Everyone should watch this movie.

Review is positive.
