In [1]:
import re                    
import numpy as np           
import pandas as pd          
import nltk                  
import spacy

from nltk.corpus import stopwords        
from nltk.tokenize import word_tokenize  

from sklearn.preprocessing import LabelEncoder      
from sklearn.model_selection import train_test_split  

import torch                                 
from torch.nn.utils.rnn import pad_sequence  
from torch.utils.data import Dataset, DataLoader  



In [3]:
df = pd.read_csv("D:/Intern/DataSets/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    return text

In [8]:
df['clean_review'] = df['review'].apply(clean_text)

In [9]:
df.head()

Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is...


In [15]:
stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [20]:
def tokenize_lemmatize(text):
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    
    # lemmatize with spacy
    doc = nlp(" ".join(tokens))
    lemma = [token.lemma_ for token in doc if not token.is_punct]
    return lemma

In [21]:
df["tokens"] = df["clean_review"].apply(tokenize_lemmatize)

In [22]:
df.head()

Unnamed: 0,review,sentiment,clean_review,tokens
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, reviewer, mention, watch, 1, oz, episode..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the...,"[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[think, wonderful, way, spend, time, hot, summ..."
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, family, little, boy, jake, think, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is...,"[petter, mattei, love, time, money, visually, ..."


In [23]:
all_tokens = [word for tokens in df['tokens'] for word in tokens]
vocab = sorted(set(all_tokens))
word2idx = {word: idx+2 for idx, word in enumerate(vocab)}
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1

In [24]:
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(word2idx)
print(f"Vocab size: {vocab_size}")

Vocab size: 87703


In [25]:
def token_to_seq(text):
    return [word2idx.get(word, word2idx["<UNK>"]) for word in text]

In [26]:
df["seq"] = df["tokens"].apply(token_to_seq)

In [27]:
df.head()

Unnamed: 0,review,sentiment,clean_review,tokens,seq
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, reviewer, mention, watch, 1, oz, episode...","[55472, 64787, 49599, 84441, 57, 56614, 25491,..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the...,"[wonderful, little, production, br, br, filmin...","[85956, 45600, 61059, 10449, 10449, 28137, 765..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[think, wonderful, way, spend, time, hot, summ...","[77404, 85956, 84551, 72488, 77848, 36726, 747..."
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, family, little, boy, jake, think, ...","[7340, 27108, 45600, 10408, 40179, 77404, 8756..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is...,"[petter, mattei, love, time, money, visually, ...","[58439, 48529, 46221, 77848, 51257, 83655, 742..."


In [28]:
MAX_LEN = 200

In [34]:
def pad_sequences(seq, max_len=MAX_LEN):
    if len(seq) < max_len:
        seq = seq + [word2idx["<PAD>"]] * (max_len - len(seq))
    else:
        seq = seq[:max_len]
        
    return seq

In [35]:
df["padded_seq"] = df["seq"].apply(pad_sequences)

In [36]:
le = LabelEncoder()

df["label"] = le.fit_transform(df["sentiment"]) # positive = 1, negative = 0

In [38]:
X_train, X_test, y_train, y_test = train_test_split(df["padded_seq"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42)

In [39]:
X_train = torch.tensor(X_train)
X_test = torch.tensor(X_test)
y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)

In [40]:
embedding_dim = 100
embedding_index = {}