In [0]:
# --- standard library ---
import re
from typing import List

# --- third-party ---
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the NLTK stop-word corpus, then keep the English list as a set
# so that membership tests during preprocessing are cheap.
nltk.download('stopwords')
all_stopwords = set(stopwords.words('english'))

In [0]:
# Load the review data from the CSV file into a pandas DataFrame.
reviews_data=pd.read_csv('IMDB Dataset.csv')
# Last expression of the cell: display summary statistics of the loaded frame.
reviews_data.describe()

In [0]:
# Count how many reviews carry each sentiment label (class-balance check).
reviews_data['sentiment'].value_counts()

In [0]:
# The sentiments are either 'positive' or 'negative' and are evenly distributed. Let's preprocess the text using the simple tokenizer we built in the last class; here it is called preprocess_text.
def preprocess_text(text: str, stop_words=None) -> str:
    """Normalize a raw review into one space-separated string of content tokens.

    Steps:
      1. Break the text into segments at HTML line-break tags and full stops.
      2. Split every segment on whitespace.
      3. Lowercase each token and strip every non-word character from it.
      4. Drop empty tokens, single-character tokens and stop-words.

    Args:
        text: Raw review text; may contain ``<br />`` tags.
        stop_words: Optional collection of lowercase words to discard.
            When omitted, the module-level ``all_stopwords`` set is used.

    Returns:
        The surviving tokens joined by single spaces ("" when none survive).
    """
    if stop_words is None:
        stop_words = all_stopwords

    # Raw string so that `\.` reaches the regex engine as an escaped dot.
    # One or more <br>, <br/> or <br /> tags count as a single break, so a
    # lone tag does not leak into the output as a "br" token.
    segment_breaks = re.compile(r"(?:<br\s*/?>)+|\.", flags=re.IGNORECASE)

    kept = []
    for segment in segment_breaks.split(text):
        # split() without an argument also breaks on tabs and newlines, so
        # words on adjacent lines are not glued together by the \W+ strip.
        for raw_token in segment.split():
            token = re.sub(r"\W+", "", raw_token.lower())
            # The length test is per token (not on the whole token list):
            # one-word inputs are kept and one-character leftovers dropped.
            if len(token) > 1 and token not in stop_words:
                kept.append(token)
    return " ".join(kept)
    

  
# Quick sanity check of preprocess_text on a hand-written review.
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."
# Despite the name, this holds a single space-joined string: preprocess_text
# returns " ".join(...), not a list of tokens.
custom_review_tokens = preprocess_text(custom_review)
print(custom_review_tokens)

In [0]:
# Run preprocess_text on every review, overwriting the 'review' column in place.
# NOTE(review): the raw text is replaced, so re-running this cell feeds
# already-processed text back through preprocess_text.
reviews_data['review'] = reviews_data['review'].apply(preprocess_text)

In [0]:
# Partition the rows by position: the first 40,000 for training, the next
# 5,000 for testing, and the remainder as a held-out "blind" set.
train_reviews = reviews_data['review'].iloc[:40000]
train_sentiments = reviews_data['sentiment'].iloc[:40000]

test_reviews = reviews_data['review'].iloc[40000:45000]
test_sentiments = reviews_data['sentiment'].iloc[40000:45000]

blind_reviews = reviews_data['review'].iloc[45000:]
blind_sentiments = reviews_data['sentiment'].iloc[45000:]

# Report the size of each (reviews, sentiments) pair.
print(train_reviews.shape, train_sentiments.shape)
print(test_reviews.shape, test_sentiments.shape)
print(blind_reviews.shape, blind_sentiments.shape)

In [0]:
# TfidfVectorizer converts a collection of raw documents to a matrix of TF-IDF
# features; it is equivalent to CountVectorizer (tokenization + occurrence
# counting) followed by TfidfTransformer. Read more here:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# The from-scratch code from the previous class could be reused instead.
from sklearn.feature_extraction.text import TfidfVectorizer

# Document-frequency cut-offs applied when building the vocabulary.
lower_count_thr = 100   # below this: rare words/tokens
upper_count_thr = 5000  # above this: frequent/common tokens

tv = TfidfVectorizer(
    min_df=lower_count_thr,
    max_df=upper_count_thr,
    binary=False,
    ngram_range=(1, 1),
)

# Fit on the training reviews only; the test and blind reviews are projected
# with the already-fitted vectorizer.
tv_train_reviews = tv.fit_transform(train_reviews)
tv_test_reviews = tv.transform(test_reviews)
tv_blind_reviews = tv.transform(blind_reviews)

# Feature-matrix shapes for the three splits.
print('BOW_cv_train:', tv_train_reviews.shape)
print('BOW_cv_test:', tv_test_reviews.shape)
print('BOW_cv_blind:', tv_blind_reviews.shape)

In [0]:
# Turn the textual sentiment values into binary labels:
# positive maps to 1, negative maps to 0.
from sklearn.preprocessing import LabelBinarizer

# Learn the label set first, then encode the same column with it.
lb = LabelBinarizer()
lb.fit(reviews_data['sentiment'])
sentiment_data = lb.transform(reviews_data['sentiment'])

print(sentiment_data.shape)

In [0]:
# Cut the encoded labels at rows 40000 and 45000, giving the train, test and
# blind label arrays in that order.
train_sentiments, test_sentiments, blind_sentiments = np.split(
    sentiment_data, [40000, 45000]
)

print(train_sentiments.shape)
print(test_sentiments.shape)
print(blind_sentiments.shape)

In [0]:
# Train a support vector classifier with a linear kernel on the TF-IDF
# features of the training reviews.
# `svm` is imported here so the cell runs on a fresh kernel (it was never
# imported anywhere above).
from sklearn import svm

# NOTE(review): degree and gamma are kept from the original call; per the
# scikit-learn SVC docs they are not used by the linear kernel.
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')

# np.ravel flattens the labels to 1-D, the shape fit() expects for y; a
# single-column 2-D label array makes scikit-learn emit a DataConversionWarning.
SVM.fit(tv_train_reviews, np.ravel(train_sentiments))

In [0]:
# Predict labels for the TF-IDF features of the test split (tv_test_reviews).
predictions_SVM = SVM.predict(tv_test_reviews)

In [0]:
# Report the accuracy of the SVM predictions on the test split, in percent.
# `accuracy_score` is imported here so the cell runs on a fresh kernel (it was
# never imported anywhere above).
from sklearn.metrics import accuracy_score

# accuracy_score takes (y_true, y_pred); the true labels are flattened to 1-D
# so both arguments have the same shape.
print("SVM Accuracy Score -> ", accuracy_score(np.ravel(test_sentiments), predictions_SVM)*100)