In [1]:
## LIBRARIES

import os
import numpy as np
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import regex as re
from string import punctuation
import math

import nltk
#nltk.download("omw-1.4")
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

[nltk_data] Error loading omw-1.4: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


In [3]:
# Load the IMDB movie-review CSV into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like a Kaggle input mount — adjust when running elsewhere.
data = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [9]:
## EDA

# Preview the first rows to check the available columns.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Count how many reviews carry each sentiment label (class balance check).
data['sentiment'].value_counts()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [None]:
# Remove rows whose review text is an exact duplicate, keeping the first occurrence.
# The frame is modified in place, so the row index keeps gaps where duplicates were dropped.
data.drop_duplicates(subset='review', inplace=True)

In [None]:
# Summary statistics for the current state of `data`.
data.describe()

In [None]:
# Lower-case every review so later matching (stop words, vocabulary) is case-insensitive.
data['review'] = data['review'].str.lower()

In [None]:
## CLEANING

def punctuation_remove(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` containing no character from the standard
        library's ``string.punctuation`` set.
    """
    # The notebook imports the constant directly (``from string import
    # punctuation``); the bare module name ``string`` is never bound, so the
    # previous ``string.punctuation`` lookup raised NameError.
    # str.translate deletes all listed characters in a single pass instead of
    # running one ``replace`` scan per punctuation character.
    return text.translate(str.maketrans('', '', punctuation))

# Strip punctuation from every review.
# NOTE(review): this runs before the tag_remove and url_remove cells below. Punctuation
# includes '<', '>', '/', ':' and '.', so after this step '<br />' has already become 'br'
# and neither the tag pattern nor the URL pattern can match any more. Consider running
# tag/URL removal before punctuation removal.
data['review'] = data['review'].apply(punctuation_remove)

In [None]:
def tag_remove(text):
    """Delete HTML-style tags from ``text``.

    A tag is anything from ``<`` up to the nearest following ``>`` on the
    same line (non-greedy match); the text between tags is kept.
    """
    return re.sub('<.*?>', r'', text)

# Strip HTML tags (e.g. '<br />') from every review.
# NOTE(review): only effective if '<' and '>' are still present when this cell runs.
data['review']=data['review'].apply(tag_remove)

In [None]:
def url_remove(text):
    """Delete URLs from ``text``.

    Matches ``http://...`` / ``https://...`` links and bare ``www....`` hosts,
    each running up to the next whitespace character.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matched URL removed (surrounding whitespace is
        left untouched).
    """
    # The previous pattern contained literal spaces ('https ? ://') and used
    # \s+ (whitespace) where \S+ (non-whitespace) was intended, so it could
    # never match a real URL.
    pattern=re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)
# Strip URLs from every review.
data['review']=data['review'].apply(url_remove)

In [None]:
def stopwords_remove(text):
    """Drop stop words from ``text``.

    The text is split on single spaces; each piece is lower-cased and kept
    only when it is not in ``STOPWORDS``. The surviving pieces are joined
    back together with single spaces.
    """
    kept = []
    for piece in text.split(' '):
        lowered = piece.lower()
        if lowered not in STOPWORDS:
            kept.append(lowered)
    return ' '.join(kept)

# Remove stop words from every review. The helper defined above is named
# stopwords_remove; the previous call referenced the undefined name
# remove_stop and raised NameError on a fresh kernel.
data['review'] = data['review'].apply(stopwords_remove)

In [None]:
# Display the cleaned review text as a one-column DataFrame.
data[['review']]

In [None]:
def tokenize(series):
    """Split one review string into word tokens with NLTK's ``word_tokenize``.

    Despite the parameter name, ``series`` is a single text value: the
    function is applied element-wise to the review column.
    """
    tokens = word_tokenize(series)
    return tokens

# Tokenize every cleaned review; each cell of 'tokens' holds that review's token list.
data['tokens'] = data['review'].apply(tokenize)

In [None]:
def get_len(series):
    """Return the number of items in ``series`` (here: one review's token list)."""
    size = len(series)
    return size

# Record the token count of every review; used below to choose the padding length.
data['token_len'] = data['tokens'].apply(get_len)

In [None]:
# Spot-check the token lists next to their lengths.
data[['tokens','token_len']].head()

In [None]:
# Summary statistics, now including the numeric token_len column.
data.describe()

In [None]:
# PADDING

# Target sequence length for padding/truncation: the mean token count, rounded up.
# (Despite the name, this is the average review length, not the maximum.)
# Reading the mean straight from the column avoids relying on the row/column layout of
# describe() and avoids passing a 1-element array to math.ceil, which recent NumPy
# versions deprecate.
MAX_LEN = math.ceil(data['token_len'].mean())
print(MAX_LEN)

In [None]:
def pad_token(series, max_len=None):
    """Return a copy of a token list padded or truncated to a fixed length.

    Args:
        series: One review's list of tokens (not modified).
        max_len: Target length. Defaults to the notebook-level ``MAX_LEN``.

    Returns:
        A new list of exactly ``max_len`` tokens: the first ``max_len``
        tokens of ``series``, followed by as many ``'<END>'`` markers as
        needed to fill the remainder.
    """
    if max_len is None:
        max_len = MAX_LEN
    # Build a fresh list. The previous version called ``series.extend`` and so
    # padded the caller's list in place, which silently added '<END>' markers
    # to the source 'tokens' column that later steps also read.
    padded = list(series[:max_len])
    padded.extend(['<END>'] * (max_len - len(padded)))
    return padded

# Pad or truncate every token list to MAX_LEN tokens.
data['paded_tokens'] = data['tokens'].apply(pad_token)

In [None]:
# Inspect one padded example (the row at position 10).
print(data['paded_tokens'].values[10])

In [None]:
# Compare the token lists with their padded/truncated versions.
data[['tokens','paded_tokens']]

In [None]:
# NORMALIZATION (LEMMATIZATION)

# Single shared WordNet lemmatizer instance, reused by lemma() below.
lemmatizer = WordNetLemmatizer()

In [None]:
def lemma(series):
    """Lemmatize each token of one token list with the shared ``lemmatizer``.

    Returns a new list of the same length, in the same order.
    """
    lemmas = []
    for token in series:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize the padded token lists (so '<END>' markers are passed through the lemmatizer too).
data['lemma_tokens'] = data['paded_tokens'].apply(lemma)

In [None]:
# Compare the token lists with their lemmatized versions.
data[['tokens','lemma_tokens']]

In [None]:
# NORMALIZATION (STEMMING)

# Single shared Porter stemmer instance, reused by stem() below.
stemmer = PorterStemmer()

In [None]:
def stem(series):
    """Stem each token of one token list with the shared Porter ``stemmer``.

    Returns a new list of the same length, in the same order.
    """
    stems = []
    for token in series:
        stems.append(stemmer.stem(token))
    return stems

# Stem the token lists (shown for comparison with lemmatization; not used by the model below).
data['stem_tokens'] = data['tokens'].apply(stem)

In [None]:
# Compare the token lists with their stemmed versions.
data[['tokens','stem_tokens']]

In [None]:
# POS TAGGING

def pos_t(series):
    """Part-of-speech tag one token list using NLTK's universal tagset.

    Returns whatever ``nltk.pos_tag`` produces for the tokens.
    """
    tagged = nltk.pos_tag(series, tagset='universal')
    return tagged

# POS-tag every token list (illustrative; not used by the model below).
data['pos_tag_tokens'] = data['tokens'].apply(pos_t)

In [None]:
# Compare the token lists with their POS-tagged versions.
data[['tokens','pos_tag_tokens']]

In [None]:
# WORD EMBEDDINGS

# Vocabulary: every distinct token that appears in any lemmatized review.
unique_words = {
    token
    for token_list in data['lemma_tokens'].values
    for token in token_list
}

In [None]:
# Report the vocabulary size.
print('Count of Unique words:', len(unique_words))

In [None]:
# Map every vocabulary word to a row index of the embedding matrix.
# Sorting makes the word -> index assignment reproducible: iteration order of a set of
# strings can differ between interpreter runs.
word2idx = {word: idx for idx, word in enumerate(sorted(unique_words))}
# Guarantee the padding marker has an index. It is normally already in the vocabulary
# (the lemmatized lists are built from the padded tokens), in which case its existing
# index is kept. Unconditionally re-assigning it to len(word2idx) pointed it one past the
# last valid row and left its original row unused.
word2idx.setdefault('<END>', len(word2idx))

In [None]:
# Start from uniform-random vectors so that words absent from GloVe still get an
# embedding; the generator is seeded so the matrix is reproducible across runs.
word_embeddings = np.random.default_rng(42).random((len(word2idx), 200))

# Stream the GloVe file line by line instead of reading it all into memory.
# The previous `split('\n')[:-2]` also discarded at least one real embedding line in
# addition to the trailing empty string.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt', 'r', encoding='utf-8') as embeds:
    for token_embed in embeds:
        fields = token_embed.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        token = fields[0]
        if token in word2idx:
            # Overwrite the random row with the pretrained vector for this word.
            word_embeddings[word2idx[token]] = [float(val) for val in fields[1:]]

In [None]:
# Show one example: the word at insertion position 300 of word2idx and row 300 of the matrix.
# NOTE(review): this pairs the right word and vector only if that word's index equals its
# insertion position — confirm, or look the row up via word2idx[word] instead.
print(f'Word embeddings for word {list(word2idx.keys())[300]}:',word_embeddings[300])

In [None]:
# COUNT VECTORIZATION

# Bag-of-words vectorizer with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [None]:
# Build the document-term count matrix from the lemmatized tokens.
# '<END>' padding markers are dropped first: CountVectorizer's default tokenizer would
# otherwise turn each one into the ordinary word 'end', inflating that feature for short
# reviews, while raw sentences vectorized at prediction time carry no padding at all.
vector_cl = vectorizer.fit_transform(
    [' '.join(tok for tok in tokens if tok != '<END>') for tokens in data['lemma_tokens'].values]
)

In [None]:
# (number of reviews, vocabulary size) of the count matrix.
vector_cl.shape

In [None]:
# VECTORIZING THE LABELS

def label_vec(series):
    """Encode one sentiment label as an integer.

    Returns 1 when the value equals the string ``'positive'`` and 0 for
    anything else.
    """
    return 1 if series == 'positive' else 0

# Replace the string labels with 1 (positive) / 0 (anything else).
# NOTE(review): this overwrites the column, so the cell is not idempotent — running it a
# second time compares the integers against 'positive' and turns every label into 0.
data['sentiment'] = data['sentiment'].apply(label_vec)

In [None]:
# Spot-check the model inputs next to their encoded labels.
data[['lemma_tokens', 'sentiment']].head()

In [None]:
# TRAINING MODEL

# Hold out 30% of the reviews for evaluation; random_state fixes the split for reproducibility.
vector_cl_train_x, vector_cl_test_x, vector_cl_train_y, vector_cl_test_y = train_test_split(vector_cl, data['sentiment'].values, test_size=0.3, random_state=42)
# Sanity-check that feature and label shapes line up for both splits.
print(vector_cl_train_x.shape,vector_cl_train_y.shape,vector_cl_test_x.shape,vector_cl_test_y.shape)

In [None]:
# FITTING MODEL

# Fit a logistic-regression classifier with scikit-learn's default hyperparameters.
# NOTE(review): the default iteration limit may be too low for this many sparse features —
# watch for a ConvergenceWarning and raise max_iter if it appears.
log_reg = LogisticRegression().fit(vector_cl_train_x, vector_cl_train_y)

In [None]:
# Predicted labels for the held-out reviews.
pred = log_reg.predict(vector_cl_test_x)

In [None]:
# Held-out accuracy (score() re-runs prediction internally) and F1 computed from `pred`.
print('Mean Accuracy:', log_reg.score(vector_cl_test_x, vector_cl_test_y))
print('F1 Score:', f1_score(vector_cl_test_y, pred))

In [None]:
## TESTING

# Try the classifier on three hand-written reviews, one prediction per line.
for sample_review in (
    'This movie was one of the best i watched in recent times',
    'This movie was not bad and i really liked it.',
    'The cinematics of this movie made my eyes bleed',
):
    print(log_reg.predict(vectorizer.transform([sample_review])))

Classical bag-of-words models such as the logistic regression above cannot capture the context in which words appear (for example, the negation in "not bad"). This is why recurrent neural networks and transformer-based models, which do take context into account, are the most widely used approaches for tasks like sentiment analysis.

Below, we fine-tune a Hugging Face model on the same task.

In [None]:
# A HUGGING FACE MODEL APPROACH

from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification, TrainingArguments, Trainer
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
imdb = load_dataset("imdb")

# Work on a fixed, seeded 10,000-example subset of each split to keep training time manageable.
train_dataset = imdb["train"].shuffle(seed=42).select(range(10000))
test_dataset = imdb["test"].shuffle(seed=42).select(range(10000))

# Pretrained checkpoint and its matching tokenizer.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences are truncated to at most 132 tokens and padded within the
    batch; the tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding=True, max_length=132)

# Tokenize both subsets in batches.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Collator that pads each batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Convert the tokenized subsets into batched tf.data pipelines.
# Only "attention_mask" and "input_ids" are requested: the DistilBERT tokenizer does not
# produce "token_type_ids", so asking for that column fails on a column that does not exist.
# NOTE: the names are rebound from Hugging Face datasets to tf.data datasets, so this cell
# cannot be re-run without re-running the tokenization cell first.
tokenized_train = tokenized_train.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=32,
)

# Evaluation data does not need shuffling; a fixed order keeps validation runs comparable.
tokenized_test = tokenized_test.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32,
)

# Load the same checkpoint as the tokenizer, with a fresh 2-class classification head.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item sequence ``(logits, labels)``; the predicted
            class is the argmax of ``logits`` over the last axis.

    Returns:
        Dict with float entries ``"accuracy"`` and ``"f1"``.

    NOTE(review): nothing in the visible notebook calls this function (the
    model is trained with Keras ``fit``) — confirm whether it is still needed.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    # Computed locally with numpy and the f1_score already imported at the top
    # of the notebook, rather than re-loading two metric scripts through the
    # deprecated datasets.load_metric on every call.
    accuracy = float(np.mean(predictions == labels))
    f1 = float(f1_score(labels, predictions))
    return {"accuracy": accuracy, "f1": f1}

# Compile for fine-tuning: Adam optimizer, and a sparse categorical cross-entropy loss that
# takes the model's raw logits (from_logits=True) and integer class labels.
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
# Fine-tune for three epochs.
# NOTE(review): the test subset doubles as validation data here, so there is no separate
# held-out set for a final unbiased score.
model.fit(
    tokenized_train,
    validation_data=tokenized_test,
    epochs=3
)

In [None]:
from transformers import TextClassificationPipeline
# Wrap the fine-tuned model and its tokenizer in a pipeline that classifies raw strings.
sentiment = TextClassificationPipeline(model=model, tokenizer=tokenizer, framework='tf')

In [None]:
# Classify the same three hand-written reviews that were given to the logistic-regression model.
sentiment(['This movie was one of the best i watched in recent times','This movie was not bad and i really liked it.','The cinematics of this movie made my eyes bleed'])