<a href="https://colab.research.google.com/github/Tamaghnatech/NLP-based-review-system/blob/main/NLP_based_review_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMPORTING THE LIBRARIES

In [None]:
import spacy
from spacy import displacy

In [None]:
# Load the 'en_core_web_sm' spaCy pipeline and run it over a short sample text.
nlp = spacy.load('en_core_web_sm')
text = "Hi, myname is Tamaghna Nag, this is the first sentence where i introduce myself. this is the second sentence where i do dumb things. this is the third one yahooooo"
doc = nlp(text)
# Bare last expression so the notebook displays the processed Doc.
doc

Hi, myname is Tamaghna Nag, this is the first sentence where i introduce myself. this is the second sentence where i do dumb things. this is the third one yahooooo

NOW WE WILL SEE TOKENS IN DOC

In [None]:
# Iterating over the Doc yields its tokens in order; print one per line.
for tok in doc:
  print(tok)

Hi
,
myname
is
Tamaghna
Nag
,
this
is
the
first
sentence
where
i
introduce
myself.this
is
the
second
sentence
where
i
do
dumb
things.this
is
the
third
one
yahooooo


WE USE CREATE_PIPE() TO CREATE A PIPELINE COMPONENT

In [None]:
# Register spaCy's 'sentencizer' component ahead of the dependency parser.
# The membership guard keeps this cell re-runnable: adding a component whose
# name is already present in the pipeline raises an error.
# NOTE(review): create_pipe + add_pipe(component) is the spaCy v2 calling
# convention; spaCy v3 expects nlp.add_pipe('sentencizer', before='parser').
if 'sentencizer' not in nlp.pipe_names:
  sentencizer = nlp.create_pipe('sentencizer')
  nlp.add_pipe(sentencizer, before='parser')

# Re-process the sample text so the new component contributes sentence bounds.
doc = nlp(text)
# The loop variable gets its own name instead of overwriting the component.
for sentence in doc.sents:
  print(sentence)

Hi, myname is Tamaghna Nag, this is the first sentence where i introduce myself.
this is the second sentence where i do dumb things.
this is the third one yahooooo


WE FIND THE STOP WORDS

In [None]:
# Materialise spaCy's English stop-word collection as a list and print it.
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
print(stopwords)

['five', 'sixty', 'others', 'wherein', 'they', 'noone', 'in', 'none', 'thus', 'an', 'whatever', 'quite', 'seems', 'again', 'whoever', 'nevertheless', 'six', 'of', 'show', 'their', 'always', 'thereupon', 'might', 'third', 'whether', "'m", 'hereafter', 'his', 'else', 'he', 'side', 'these', "'s", 'otherwise', 'where', 'first', 'seem', 'put', 'should', '‘s', 'nothing', 'amongst', 'most', 'but', 'because', 'made', 'whenever', 'via', 'say', 'doing', 'before', 'never', 'us', 'therefore', 'i', 'seemed', '’re', 'beside', 'formerly', 'if', 'me', 'whence', 'your', 'every', '‘d', 'them', 'become', 'was', 'hundred', 'together', 'itself', 'whole', 'him', 'several', 'herself', 'sometime', 'everyone', 'moreover', 'anything', 'anywhere', 'around', 'our', 'same', 'really', 'some', 'such', 'someone', 'any', 'fifteen', 'thru', 'used', 'full', 'had', 'therein', 'yourselves', 'forty', 'became', 'this', 'whereupon', 'two', 'beforehand', 'nobody', 'being', 'please', 'part', 'upon', 'except', 'various', 'while

In [None]:
# Number of entries in the stop-word list.
len(stopwords)

326

PRINTING ALL TOKENS THAT ARE NOT STOP WORDS

In [None]:
# Print only the tokens that spaCy does not flag as stop words.
for tok in doc:
  if tok.is_stop:
    continue
  print(tok)
    

Hi
,
myname
Tamaghna
Nag
,
sentence
introduce
.
second
sentence
dumb
things
.
yahooooo


LEMMATIZATION

In [None]:
# Show each surface form next to the lemma spaCy assigns to it.
doc = nlp('run runs running runner')
for tok in doc:
  print(tok.text, tok.lemma_)

run run
runs run
running run
runner runner


PART-OF-SPEECH (POS) TAGGING

In [None]:
# Show each token next to its coarse part-of-speech tag.
doc = nlp('All is well at my end!')
for tok in doc:
  print(tok.text, tok.pos_)

All DET
is AUX
well ADV
at ADP
my DET
end NOUN
! PUNCT


In [None]:
# jupyter=True makes displaCy display the dependency diagram inline; without
# it this cell only got the raw SVG markup back as a string.
displacy.render(doc, style='dep', jupyter=True)

'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" id="66bfc41bf87b44bdb567f6ded847b60e-0" class="displacy" width="1100" height="312.0" direction="ltr" style="max-width: none; height: 312.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr">\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="222.0">\n    <tspan class="displacy-word" fill="currentColor" x="50">All</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">DET</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="222.0">\n    <tspan class="displacy-word" fill="currentColor" x="225">is</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">AUX</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="222.0">\n    <tspan class="displacy-word" fill="currentColor" x="400">well</tspan>\n    <tspan class="displacy-tag" d

ENTITY DETECTION

In [None]:
# Longer sample passage used for the entity-recognition demo.
doc = nlp("Hi, I am Tamaghna, I am a CS Engineer currently studying in 4th year at IEM, I love to code and occasionally spent my time jamming games or cracking weird lesser known puzzles which are worth every penny.Currently i earn a substantial amount of money to sustain myself for the forseeable future and I am not inclined to work so often so maybe that makes me lazy.")
# Bare last expression so the notebook displays the processed Doc.
doc

Hi, I am Tamaghna, I am a CS Engineer currently studying in 4th year at IEM, I love to code and occasionally spent my time jamming games or cracking weird lesser known puzzles which are worth every penny.Currently i earn a substantial amount of money to sustain myself for the forseeable future and I am not inclined to work so often so maybe that makes me lazy.

In [None]:
# jupyter=True makes displaCy display the highlighted entities inline; without
# it this cell only got the raw HTML markup back as a string.
displacy.render(doc, style='ent', jupyter=True)

'<div class="entities" style="line-height: 2.5; direction: ltr">Hi, I am \n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Tamaghna\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n, I am a \n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    CS Engineer\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n currently studying in \n<mark class="entity" style="background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    4th year\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border

In [None]:
import csv

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC



In [None]:
# Tab-separated file with no header row, so pandas assigns integer column labels.
data_yelp = pd.read_csv('yelp_labelled.txt', sep='\t', header = None)
data_yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
# Column labels applied to the Yelp frame here and reused for the other review frames.
columns_name = ['Review', 'Sentiment']
data_yelp.columns = columns_name
data_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
# (rows, columns) of the Yelp frame.
data_yelp.shape

(1000, 2)

In [None]:
# Amazon reviews: same tab-separated, header-less layout, relabelled with columns_name.
data_amazon = pd.read_csv('amazon_cells_labelled.txt', sep = '\t', header = None)
data_amazon.columns = columns_name
data_amazon.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [None]:
# (rows, columns) of the Amazon frame.
data_amazon.shape

(1000, 2)

In [None]:
# IMDb reviews: tab-separated, header-less, relabelled with columns_name.
# QUOTE_NONE stops the parser from treating double quotes inside reviews as
# field quoting. With default quoting an unbalanced quote swallows the
# following lines into one field, which is the likely reason this file loaded
# as 748 rows while the other two sources loaded as 1000.
data_imdb = pd.read_csv('imdb_labelled.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
data_imdb.columns = columns_name
data_imdb.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
# (rows, columns) of the IMDb frame.
data_imdb.shape

(748, 2)

In [None]:
# Stack the three review frames into one with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
data = pd.concat([data_yelp, data_amazon, data_imdb], ignore_index=True)
data.shape

(2748, 2)

In [None]:
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
# Class balance: how many reviews carry each Sentiment label.
data['Sentiment'].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [None]:
# Count of missing values per column.
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [None]:
# ASCII punctuation characters; text_data_cleaning consults this string when filtering tokens.
import string
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def text_data_cleaning(sentence):
  """Tokenize text into lowercase lemmas, dropping stop words and punctuation.

  Passed to TfidfVectorizer as its ``tokenizer`` callable.

  Args:
    sentence: Raw text handed to the module-level spaCy pipeline ``nlp``.

  Returns:
    A list of strings in document order: each token's lemma, lowercased and
    stripped, excluding entries found in ``stopwords`` and entries made up
    solely of characters from ``punct``.
  """
  cleaned_token = []
  for token in nlp(sentence):
    if token.lemma_ != "-PRON-":
      temp = token.lemma_.lower().strip()
    else:
      # "-PRON-" is a placeholder lemma (presumably spaCy v2's pronoun
      # marker), so fall back to the lowercased surface form.
      temp = token.lower_

    if temp in stopwords:
      continue
    # Test character by character. `temp not in punct` was a substring check
    # against one long string, so runs such as "..." or "!!" were kept.
    # all() is True for an empty string, so tokens that strip down to
    # nothing are still dropped, as before.
    if all(char in punct for char in temp):
      continue
    cleaned_token.append(temp)
  return cleaned_token


# Quick check of the cleaner on a sample sentence.
text_data_cleaning("Hello how are you doing today?")

['hello', 'today']

In [None]:
# TF-IDF vectorizer driven by the custom spaCy-based tokenizer, plus a linear SVM with default settings.
tfidf = TfidfVectorizer(tokenizer = text_data_cleaning)
classifier = LinearSVC()

In [None]:
# Features are the raw review texts; targets are the sentiment labels.
x, y = data['Review'], data['Sentiment']

In [None]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
(x_train.shape, x_test.shape)

((2198,), (550,))

In [None]:
# Chain vectorization and classification so raw text can be fitted and predicted directly.
pipeline_steps = [('tfidf', tfidf), ('clf', classifier)]
clf = Pipeline(steps=pipeline_steps)
clf.fit(x_train, y_train)


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function text_data_cleaning at 0x7f477fd87d08>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept

In [None]:
# Predicted labels for the held-out reviews.
y_pred = clf.predict(x_test)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.82      0.80       285
           1       0.79      0.74      0.77       265

    accuracy                           0.78       550
   macro avg       0.78      0.78      0.78       550
weighted avg       0.78      0.78      0.78       550



In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
confusion_matrix(y_test, y_pred)


array([[233,  52],
       [ 68, 197]])

In [None]:
# Spot-check the fitted pipeline on a new positive-sounding sentence.
clf.predict(['This lesson is amazing!'])


array([1])

In [None]:
# Spot-check the fitted pipeline on a new negative-sounding sentence.
clf.predict(['this container is so bad'])

array([0])