In [1]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
np.random.seed(500)

In [3]:
corpus=pd.read_csv(r"C:\Users\Dhwani\Desktop\Datasets\corpus.csv", encoding='latin-1')


In [4]:
corpus['text']

0        Stuning even for the non-gamer: This sound tr...
1        The best soundtrack ever to anything.: I'm re...
2        Amazing!: This soundtrack is my favorite musi...
3        Excellent Soundtrack: I truly like this sound...
4        Remember, Pull Your Jaw Off The Floor After H...
5        an absolute masterpiece: I am quite sure any ...
6        Buyer beware: This is a self-published book, ...
7        Glorious story: I loved Whisper of the wicked...
8        A FIVE STAR BOOK: I just finished reading Whi...
9        Whispers of the Wicked Saints: This was a eas...
10       The Worst!: A complete waste of time. Typogra...
11       Great book: This was a great book,I just coul...
12       Great Read: I thought this book was brilliant...
13       Oh please: I guess you have to be a romance n...
14       Awful beyond belief!: I feel I have to write ...
15       Don't try to fool us with fake reviews.: It's...
16       A romantic zen baseball comedy: When you hear...
17       Fashi

In [5]:
# preprocessing

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
corpus['text']=[entry.lower() for entry in corpus['text']]
corpus['text']

0        stuning even for the non-gamer: this sound tr...
1        the best soundtrack ever to anything.: i'm re...
2        amazing!: this soundtrack is my favorite musi...
3        excellent soundtrack: i truly like this sound...
4        remember, pull your jaw off the floor after h...
5        an absolute masterpiece: i am quite sure any ...
6        buyer beware: this is a self-published book, ...
7        glorious story: i loved whisper of the wicked...
8        a five star book: i just finished reading whi...
9        whispers of the wicked saints: this was a eas...
10       the worst!: a complete waste of time. typogra...
11       great book: this was a great book,i just coul...
12       great read: i thought this book was brilliant...
13       oh please: i guess you have to be a romance n...
14       awful beyond belief!: i feel i have to write ...
15       don't try to fool us with fake reviews.: it's...
16       a romantic zen baseball comedy: when you hear...
17       fashi

In [7]:
corpus['text']=[word_tokenize(entry) for entry in corpus['text']]

In [8]:
tag_map=defaultdict(lambda:wn.NOUN)
tag_map['J']=wn.ADJ
tag_map['V']=wn.VERB
tag_map['R']=wn.ADV

for index, entry in enumerate(corpus['text']):
    final_words=[]
    word_Lemmatized=WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_final=word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            final_words.append(word_final)
#             print('hello')
    corpus.loc[index,'text_final'] = repr(final_words)

In [26]:
corpus['text_final']

0       ['stun', 'even', 'sound', 'track', 'beautiful'...
1       ['best', 'soundtrack', 'ever', 'anything', 're...
2       ['amaze', 'soundtrack', 'favorite', 'music', '...
3       ['excellent', 'soundtrack', 'truly', 'like', '...
4       ['remember', 'pull', 'jaw', 'floor', 'hear', '...
5       ['absolute', 'masterpiece', 'quite', 'sure', '...
6       ['buyer', 'beware', 'book', 'want', 'know', 'r...
7       ['glorious', 'story', 'love', 'whisper', 'wick...
8       ['five', 'star', 'book', 'finish', 'read', 'wh...
9       ['whisper', 'wicked', 'saint', 'easy', 'read',...
10      ['bad', 'complete', 'waste', 'time', 'typograp...
11      ['great', 'book', 'great', 'book', 'could', 'p...
12      ['great', 'read', 'think', 'book', 'brilliant'...
13      ['oh', 'please', 'guess', 'romance', 'novel', ...
14      ['awful', 'beyond', 'belief', 'feel', 'write',...
15      ['try', 'fool', 'u', 'fake', 'review', 'glarin...
16      ['romantic', 'zen', 'baseball', 'comedy', 'hea...
17      ['fash

In [10]:
train_x, test_x, train_y, test_y=model_selection.train_test_split(corpus['text_final'], corpus['label'], test_size=0.3)

In [11]:
Encoder=LabelEncoder()
train_y=Encoder.fit_transform(train_y)
test_y=Encoder.fit_transform(test_y)

In [13]:
Tfidf_vect=TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(corpus['text_final'])

train_x_tfidf=Tfidf_vect.transform(train_x)
test_x_tfidf=Tfidf_vect.transform(test_x)

In [17]:
print(train_x_tfidf)

  (0, 4506)	0.3763188267807246
  (0, 4505)	0.15031494427382475
  (0, 3976)	0.35868777245753825
  (0, 3890)	0.2515140235472667
  (0, 3860)	0.2690675584422277
  (0, 3745)	0.34695623926050195
  (0, 3655)	0.2896999547088821
  (0, 3564)	0.29449641491430995
  (0, 2931)	0.229683025366997
  (0, 1935)	0.13406125327954532
  (0, 1529)	0.17761496997588844
  (0, 521)	0.321056290554803
  (0, 491)	0.12303572865008613
  (0, 240)	0.2448559358109696
  (1, 4687)	0.21384275526442909
  (1, 4060)	0.3566872275481094
  (1, 3434)	0.21279175847748263
  (1, 3313)	0.8157357261127677
  (1, 2592)	0.2173336717856602
  (1, 1260)	0.2074693534878867
  (1, 604)	0.1614401835472762
  (2, 4734)	0.18368024895699514
  (2, 4625)	0.15009324791624393
  (2, 4466)	0.10284196288022608
  (2, 4191)	0.1168175572462934
  :	:
  (6998, 2523)	0.11512525241790023
  (6998, 2125)	0.13650351320774795
  (6998, 1971)	0.07126979525506368
  (6998, 1778)	0.22013576217543665
  (6998, 1746)	0.19935227823153098
  (6998, 1710)	0.13509114757747032
  (

In [18]:
SVM=svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(train_x_tfidf, train_y)

SVM_predictions=SVM.predict(test_x_tfidf)

print("Accuracy score:", accuracy_score(SVM_predictions, test_y)*100)

Accuracy score: 84.66666666666667
