# IMDb Movie Ratings Sentiment Analysis

In [5]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


## Working with data

In [6]:
!wget -O archive.zip https://www.kaggle.com/api/v1/datasets/download/yasserh/imdb-movie-ratings-sentiment-analysis

--2024-12-06 18:27:21--  https://www.kaggle.com/api/v1/datasets/download/yasserh/imdb-movie-ratings-sentiment-analysis
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://storage.googleapis.com:443/kaggle-data-sets/1875905/3063858/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241206%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241206T182721Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=6abecbabc16fab5be9bc973499b1dfbbd817e86270d1a5fd4b4c75cbba420136429989835586287b43f6810d595345be371c1fe9a007399912965072fb06221af2819a606069a405bf9592437b0fd4560e41d261f6b00bd37c01949eeffb73ab74c6669a18a8970ad342824af79daf05a939723a0d76bcddc4c4d5dba61ad9ed0dc1e4e2e1436b3056204bf1d0f7b70e47dffa290127591a1baab16a76dbd417fe38fbea6012e098b7281fda3742a2d

In [7]:
movie = pd.read_csv('./archive.zip', header=0, names=['texts', 'label'])
movie.texts = [x.strip() for x in movie.texts]
movie.head()

Unnamed: 0,texts,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [8]:
# getting raw texts and labels
X_train_raw = pd.DataFrame(movie, columns=['texts'])
y_train_raw = pd.DataFrame(movie, columns=['label'])

## Processing our data

In [9]:
# stopwords and punctuation
nltk.download('stopwords')
SW = stopwords.words("english")
SW += [x.capitalize() for x in SW] + ['<', '/', '>', ',', '.', '+', '-', '=', "'", ':', ';', '"', '`']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# function for tokenization and filtration
def processing(sent, SWP):
    tokenizer = RegexpTokenizer(r'[A-z]\w+')
    word_tokens = tokenizer.tokenize(sent)
    filtered = []
    for w in word_tokens:
        if w not in SWP:
            filtered.append(w)
    return filtered

In [11]:
X_len = len(X_train_raw)
X_train_filtered = X_train_raw.copy()

# tokenization and filtration
for i in range(X_len):
    X_train_filtered.loc[i, 'texts'] = processing(X_train_filtered.texts[i], SW)

# stemming
X_train_stemmed = X_train_filtered.copy()
for i in range(X_len):
    ps = PorterStemmer()
    X_train_stemmed.loc[i, 'texts'] = set(list(map(ps.stem, X_train_stemmed.texts[i])))

# joining
X_train_strs = X_train_stemmed.copy()
for i in range(X_len):
    X_train_strs.loc[i, 'texts'] = ' '.join(X_train_strs.texts[i]).lower()

In [12]:
X_train_strs.head()

Unnamed: 0,texts
0,judgment utter creat get art version thunderbi...
1,scene atlanti know also except anim breakfast ...
2,yeah know parent kid defin see dian civil audi...
3,horror get though bad great death abraham valu...
4,certain captain instead im cash davi numer bra...


In [13]:
X_train, X_val, y_train, y_val = train_test_split(X_train_strs, y_train_raw, test_size=0.25, random_state=42)

In [None]:
# TF-IDF vectorization of texts
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train.texts)
X_val_tfidf = tfidf_vectorizer.transform(X_val.texts)

## Random Forest Classifier

In [None]:
rf_clf = RandomForestClassifier()
search_rf = RandomizedSearchCV(rf_clf,
                              {'n_estimators': range(1, 11)}, random_state=42)
search_rf.fit(X_train_tfidf, y_train.label)

# getting the best model
best_rf = search_rf.best_estimator_
print('Best number of estimators:', best_rf.n_estimators)

# prediction
y_pred = best_rf.predict(X_val_tfidf)

Best number of estimators: 9


In [None]:
print('AUROC of Random Forest on train:', roc_auc_score(np.array(y_train), best_rf.predict_proba(X_train_tfidf)[:, 1]))
print('AUROC of Random Forest on validation:', roc_auc_score(np.array(y_val), best_rf.predict_proba(X_val_tfidf)[:, 1]))
print('Accuracy of Random Forest on train:', accuracy_score(np.array(y_train), best_rf.predict(X_train_tfidf)))
print('Accuracy score of Random Forest on validation:', accuracy_score(np.array(y_val), best_rf.predict(X_val_tfidf)))

AUROC of Random Forest on train: 0.9996579610101018
AUROC of Random Forest on validation: 0.8258397312514371
Accuracy of Random Forest on train: 0.9939
Accuracy score of Random Forest on validation: 0.7512


## Support Vector Classifier

The code below runs for two hours

In [None]:
svc_clf = SVC(C=2.0, kernel='linear', probability=True, random_state=42)
svc_clf.fit(X_train_tfidf, y_train.label)

In [None]:
print('AUROC of SVC on train:', roc_auc_score(np.array(y_train), svc_clf.predict_proba(X_train_tfidf)[:, 1]))
print('AUROC of SVC on validation:', roc_auc_score(np.array(y_val), svc_clf.predict_proba(X_val_tfidf)[:, 1]))
print('Accuracy of SVC on train:', accuracy_score(np.array(y_train), svc_clf.predict(X_train_tfidf)))
print('Accuracy score of SVC on validation:', accuracy_score(np.array(y_val), svc_clf.predict(X_val_tfidf)))

AUROC of SVC on train: 0.9938002463587409
AUROC of SVC on validation: 0.9505421702555332
Accuracy of SVC on train: 0.9725666666666667
Accuracy score of SVC on validation: 0.8812


## DistilBERT

In [1]:
# !pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
from datasets import Dataset, DatasetDict
import transformers
from transformers import (DistilBertTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)

In [3]:
# loading pre-trained model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
# function for tokenization and truncation
def preprocess_function(examples):
  return tokenizer(examples['texts'], truncation=True, max_length=128)

dataset = DatasetDict({
    "train": Dataset.from_pandas(pd.concat([X_train, y_train], axis=1)),
    "val": Dataset.from_pandas(pd.concat([X_val, y_val], axis=1))
    })

tokenized_dataset = dataset.map(preprocess_function, batched=True)
print(tokenized_dataset)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['texts', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 30000
    })
    val: Dataset({
        features: ['texts', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})


In [15]:
# checking for correctness
print(tokenized_dataset['train'][0])

{'texts': 'grade student know art put pretti read whole laugh mimick bad make bunch agre anoth kid film valu love loud would afterward adapt neighborhood still not languag venn time fun line fifth unfortun diagram comedi book teacher educ notic better hope act show movi felt compar way watch differ play aw like', 'label': 0, '__index_level_0__': 26898, 'input_ids': [101, 3694, 3076, 2113, 2396, 2404, 3653, 6916, 3191, 2878, 4756, 23150, 2243, 2919, 2191, 9129, 12943, 2890, 2019, 14573, 4845, 2143, 11748, 2226, 2293, 5189, 2052, 9707, 15581, 5101, 2145, 2025, 11374, 6692, 2290, 2310, 10695, 2051, 4569, 2240, 3587, 4895, 13028, 4609, 16403, 2272, 4305, 2338, 3836, 3968, 14194, 2025, 2594, 2488, 3246, 2552, 2265, 9587, 5737, 2371, 4012, 19362, 2126, 3422, 11234, 2377, 22091, 2066, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [16]:
import os
os.environ["WANDB_DISABLED"] = 'True'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [17]:
# defining metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    auroc = roc_auc_score(labels, predictions[:, 1])
    prediction = np.argmax(predictions, axis=1)
    return {'accuracy': (prediction==labels).sum() / len(labels), 'AUROC': auroc}

# set training hyperparameters to achieve better quality
training_args = TrainingArguments(
    output_dir="test_run",
    learning_rate=1e-05,
    optim='adamw_torch',
    weight_decay=0.0,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Auroc
1,0.5002,0.375403,0.835,0.913532
2,0.3587,0.344999,0.8512,0.927851
3,0.3183,0.342219,0.8538,0.932454
4,0.2974,0.346075,0.8531,0.934473


Epoch,Training Loss,Validation Loss,Accuracy,Auroc
1,0.5002,0.375403,0.835,0.913532
2,0.3587,0.344999,0.8512,0.927851
3,0.3183,0.342219,0.8538,0.932454
4,0.2974,0.346075,0.8531,0.934473
5,0.2737,0.357902,0.8547,0.9365
6,0.254,0.335599,0.8643,0.937209
7,0.2375,0.345001,0.8627,0.937007
8,0.2227,0.349893,0.8631,0.93744
9,0.2133,0.349099,0.8631,0.937343
10,0.2067,0.354573,0.8632,0.937334


TrainOutput(global_step=2350, training_loss=0.28826138922508726, metrics={'train_runtime': 3462.0746, 'train_samples_per_second': 86.653, 'train_steps_per_second': 0.679, 'total_flos': 9935054899200000.0, 'train_loss': 0.28826138922508726, 'epoch': 10.0})

AUROC of Random Forest on validation: 0.825840

AUROC of SVC on validation: 0.950542

Max AUROC of DistilBERT on validation: 0.937440