Importing important libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
review_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Task1/movie review.csv', names=['rating','tag','review'])
review_data.head()

Unnamed: 0,rating,tag,review
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [3]:
review_data.shape

(400000, 3)

Sampling of dataset

In [4]:
sampling_interval = 10

num_rows = len(review_data)

sampled_data = pd.DataFrame(columns=['rating','tag','review'])

for i in range(0, num_rows, sampling_interval):
    sampled_data.loc[i] = review_data.iloc[i]
print(sampled_data.shape)

(40000, 3)


Creating new column with name 'sentiment'

In [5]:
#create a new column named sentiment and for rating = 1 'negative' is assigned and for rating = 2 'positive' is assigned in sentiment column

sampled_data['sentiment'] = sampled_data['rating'].apply(lambda x: 'negative' if x == 1 else 'positive')
sampled_data.head()

Unnamed: 0,rating,tag,review,sentiment
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...,positive
10,2,Great book for travelling Europe,"I currently live in Europe, and this is the bo...",positive
20,1,Long and boring,"I've read this book with much expectation, it ...",negative
30,2,This is the all time best book!,This is the all time best book. She mentoins i...,positive
40,2,WeiB FOREVER!,This is a great CD!! I love WeiB Kreuz!! The a...,positive


Preprocessing text data

In [6]:
# tokenize review column

def tokenize_reviews(text):
  tokens = nltk.word_tokenize(text)
  return tokens

sampled_data['tokenized_review'] = sampled_data['review'].apply(tokenize_reviews)
sampled_data.head()

Unnamed: 0,rating,tag,review,sentiment,tokenized_review
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...,positive,"[My, lovely, Pat, has, one, of, the, GREAT, vo..."
10,2,Great book for travelling Europe,"I currently live in Europe, and this is the bo...",positive,"[I, currently, live, in, Europe, ,, and, this,..."
20,1,Long and boring,"I've read this book with much expectation, it ...",negative,"[I, 've, read, this, book, with, much, expecta..."
30,2,This is the all time best book!,This is the all time best book. She mentoins i...,positive,"[This, is, the, all, time, best, book, ., She,..."
40,2,WeiB FOREVER!,This is a great CD!! I love WeiB Kreuz!! The a...,positive,"[This, is, a, great, CD, !, !, I, love, WeiB, ..."


In [7]:
# remove stopwords from tokenized review column

stop_words = stopwords.words('english')

def remove_stopwords(tokenized_review):
  filtered_review = [word for word in tokenized_review if word not in stop_words]
  return filtered_review

sampled_data['filtered_review'] = sampled_data['tokenized_review'].apply(remove_stopwords)
sampled_data.head()

Unnamed: 0,rating,tag,review,sentiment,tokenized_review,filtered_review
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...,positive,"[My, lovely, Pat, has, one, of, the, GREAT, vo...","[My, lovely, Pat, one, GREAT, voices, generati..."
10,2,Great book for travelling Europe,"I currently live in Europe, and this is the bo...",positive,"[I, currently, live, in, Europe, ,, and, this,...","[I, currently, live, Europe, ,, book, I, recom..."
20,1,Long and boring,"I've read this book with much expectation, it ...",negative,"[I, 've, read, this, book, with, much, expecta...","[I, 've, read, book, much, expectation, ,, bor..."
30,2,This is the all time best book!,This is the all time best book. She mentoins i...,positive,"[This, is, the, all, time, best, book, ., She,...","[This, time, best, book, ., She, mentoins, boo..."
40,2,WeiB FOREVER!,This is a great CD!! I love WeiB Kreuz!! The a...,positive,"[This, is, a, great, CD, !, !, I, love, WeiB, ...","[This, great, CD, !, !, I, love, WeiB, Kreuz, ..."


Splitting training and testing data

In [8]:
x_train, x_test, y_train, y_test = train_test_split( sampled_data['filtered_review'], sampled_data['sentiment'],test_size=0.2)

# join the tokenized words back into strings
x_train_str = [' '.join(tokens) for tokens in x_train]
x_test_str = [' '.join(tokens) for tokens in x_test]

Feature extraction

In [9]:
cv = CountVectorizer()
x_train_cv = cv.fit_transform(x_train_str)
x_test_cv = cv.transform(x_test_str)

Taining SVM model

In [10]:
models=svm.SVC(C=1000, kernel='sigmoid', gamma='auto')
models.fit(x_train_cv,y_train)

In [11]:
models.score(x_train_cv,y_train)

0.90303125

Testing some of the text by trained model

In [12]:
text=['good','I dont like this','This is good','Better luck next time','boring']
vector = cv.transform(text)
models.predict(vector)

array(['positive', 'negative', 'positive', 'negative', 'negative'],
      dtype=object)