In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("./Data/IMDB_Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
# creating a smaller csv file for better working

d3 = df.head(4000).copy()
d3.to_csv('short_reviews.csv', index = False)
data = pd.read_csv('./Data/short_reviews.csv')

In [7]:
# Step 1: Sentence Segmentation

import spacy
nlp = spacy.load("en_core_web_sm")

#seperating sentences using spacy (punctuation marks work like delimitters. Sentences just maintains them as seperate sentences
def process_text(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return sentences

data['text'] = data['review'].apply(process_text)

In [8]:
# step 2: Tokenization
def tokenize_text(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    return tokens

data['text'] = data['review'].apply(tokenize_text)

In [9]:
# Step 3: removing stop words according to Buckely list of stop words

with open("./Data/Buckley-Salton-stopword-list.txt", "r") as words:
	lines = words.readlines()

stop_words =[]
for words in lines:
    as_list = words.split(",")
    stop_words.append(as_list[0].replace("\n", ""))

print(stop_words)

['a', "a's", 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'b', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'c', "c'mon", "c's", 'came', 'can', "can't", 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'conta

In [10]:
def remove_stop_words(text):
    if text and isinstance(text, str):
        doc = nlp(text)
        tokens_without_stopwords = [token.text for token in doc if token.text.lower() not in stop_words]
        return " ".join(tokens_without_stopwords)
    else:
        return ""


data['cleaned_text'] = data['text'].apply(remove_stop_words)

In [11]:
data.head()

Unnamed: 0,review,sentiment,text,cleaned_text
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione...",
1,A wonderful little production. <br /><br />The...,positive,"[A, wonderful, little, production, ., <, br, /...",
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,...",
3,Basically there's a family where a little boy ...,negative,"[Basically, there, 's, a, family, where, a, li...",
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Mattei, 's, "", Love, in, the, Time, o...",
