In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS


import pandas as pd


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, consensus_score

#### DataSets

In [2]:
data_yelp = pd.read_csv("yelp_labelled.txt", sep='\t',  names= ["review", "sentiment"])
data_amazon = pd.read_csv("amazon_cells_labelled.txt", sep='\t', names= ["review", "sentiment"] )
data_imdb = pd.read_csv("imdb_labelled.txt", sep='\t', names= ["review", "sentiment"])

In [5]:
data = data_yelp.append([data_amazon, data_imdb], ignore_index=True)
print(data.sentiment.value_counts())
print("\n")
data.head()

1    1386
0    1362
Name: sentiment, dtype: int64




Unnamed: 0,review,sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [12]:
#Tokenization

import string
punct = string.punctuation

stopwords = list(STOP_WORDS)
nlp = spacy.load('en_core_web_sm')

In [13]:
def text_data_cleaning(sentencces):
    doc = nlp(sentencces)
    tokens = []
    
    
    for token in doc:
        if token.lemma_ !="-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
            
        tokens.append(temp)
        
         
        
    cleaned_tokens = []
    
    
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
            
    return cleaned_tokens

In [15]:
#example
text_data_cleaning("Hello! I'm Samir. What's your name?")

['hello', 'samir']

In [18]:
#Vectorization
from sklearn.svm import LinearSVC
classifier = LinearSVC()

tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)

In [20]:
X = data['review']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [21]:
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x000001DC8693ED30>)),
                ('clf', LinearSVC())])

In [26]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred,))

              precision    recall  f1-score   support

           0       0.80      0.74      0.76       349
           1       0.75      0.80      0.77       338

    accuracy                           0.77       687
   macro avg       0.77      0.77      0.77       687
weighted avg       0.77      0.77      0.77       687

[[257  92]
 [ 66 272]]


In [28]:
clf.predict(["Wow, This is horrible"])

array([0], dtype=int64)