## Dataset Preparation

In [1]:
import os

if os.path.exists("dev.tsv"):
    print("File exists")
else:
    !wget https://raw.githubusercontent.com/sberbank-ai/ru-gpts/master/data/ru-mokoron/dev.tsv

if os.path.exists("IMDB-Dataset.csv"):
    print("File exists")
else:
    !wget https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv


File exists
File exists


In [2]:
# Load the dataset (dev.tsv)
import pandas as pd
import numpy as np

df = pd.read_csv("dev.tsv", sep='\t', header=None)
df.columns = ['text', 'label']

# Extracting text
df['text'] = df['text'].astype(str).apply(lambda x: x.replace('\t', ''))

# Extracting labels
df['label'] = df['label'].astype(str).apply(lambda x: x.replace('\n', ''))
df['label'] = df['label'].astype(int)

# Printing counts to ensure no imbalance
print("Counts of each label:")
print(df['label'].value_counts())

df.head(10)

Counts of each label:
label
1    444
0    428
Name: count, dtype: int64


Unnamed: 0,text,label
0,one long string of cliches,0
1,if you 've ever entertained the notion of doin...,0
2,k 19 exploits our substantial collective fear ...,0
3,it 's played in the most straight faced fashio...,0
4,"there is a fabric of complex ideas here , and ...",1
5,although laced with humor and a few fanciful t...,1
6,it all feels like a monty python sketch gone h...,0
7,it 's a stunning lyrical work of considerable ...,1
8,however it may please those who love movies th...,0
9,broomfield turns his distinctive ` blundering ...,1


In [3]:
# Split the dataset into train and validation
from sklearn.model_selection import train_test_split

train, val = train_test_split(df, test_size=0.2, random_state=42)
print(f"Train shape: {train.shape}, Validation shape: {val.shape}")

train.head()

Train shape: (697, 2), Validation shape: (175, 2)


Unnamed: 0,text,label
544,verbinski implements every hack artist trick t...,0
398,the band 's courage in the face of official re...,1
764,it 's hampered by a lifetime channel kind of p...,0
312,"a warm , funny , engaging film",1
326,"in a way , the film feels like a breath of fre...",1


## Model Construction

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(10000, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 2)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        x = self.dropout(F.relu(self.fc4(x)))
        x = self.dropout(F.relu(self.fc5(x)))
        return x



----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 512]       5,120,512
           Dropout-2               [-1, 1, 512]               0
            Linear-3               [-1, 1, 256]         131,328
           Dropout-4               [-1, 1, 256]               0
            Linear-5               [-1, 1, 128]          32,896
           Dropout-6               [-1, 1, 128]               0
            Linear-7                [-1, 1, 64]           8,256
           Dropout-8                [-1, 1, 64]               0
            Linear-9                 [-1, 1, 2]             130
          Dropout-10                 [-1, 1, 2]               0
Total params: 5,293,122
Trainable params: 5,293,122
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.04
Forward/backward pass size (MB): 0.01
Params size (MB): 20.19
Estima

In [35]:
# Instantiate model
model = NeuralNetwork()

# Print summary
from torchsummary import summary
summary(model, (1, 10000))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 512]       5,120,512
           Dropout-2               [-1, 1, 512]               0
            Linear-3               [-1, 1, 256]         131,328
           Dropout-4               [-1, 1, 256]               0
            Linear-5               [-1, 1, 128]          32,896
           Dropout-6               [-1, 1, 128]               0
            Linear-7                [-1, 1, 64]           8,256
           Dropout-8                [-1, 1, 64]               0
            Linear-9                 [-1, 1, 2]             130
          Dropout-10                 [-1, 1, 2]               0
Total params: 5,293,122
Trainable params: 5,293,122
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.04
Forward/backward pass size (MB): 0.01
Params size (MB): 20.19
Estima

In [39]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {total_params}")


Number of parameters: 5293122


In [None]:
#implementing Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=10000)
train_features = vectorizer.fit_transform(train['text'])
val_features = vectorizer.transform(val['text'])


In [53]:
vectorizer.get_feature_names_out()

array(['007', '10', '100', ..., 'zhang', 'zigzag', 'zombies'],
      dtype=object)

In [48]:
train_features.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [51]:
vectorizer.inverse_transform(train_features)

[array(['verbinski', 'implements', 'every', 'hack', 'artist', 'trick',
        'to', 'give', 'us', 'the', 'ooky', 'spookies'], dtype='<U18'),
 array(['the', 'band', 'courage', 'in', 'face', 'of', 'official',
        'repression', 'is', 'inspiring', 'especially', 'for', 'aging',
        'hippies', 'lrb', 'this', 'one', 'included', 'rrb'], dtype='<U18'),
 array(['of', 'is', 'it', 'hampered', 'by', 'lifetime', 'channel', 'kind',
        'plot', 'and', 'lead', 'actress', 'who', 'out', 'her', 'depth'],
       dtype='<U18'),
 array(['warm', 'funny', 'engaging', 'film'], dtype='<U18'),
 array(['to', 'the', 'in', 'of', 'it', 'film', 'way', 'feels', 'like',
        'breath', 'fresh', 'air', 'but', 'only', 'those', 'that', 'allow'],
       dtype='<U18'),
 array(['to', 'in', 'of', 'and', 'but', 'that', 'mafia', 'rap', 'stars',
        'hood', 'rats', 'butt', 'their', 'ugly', 'heads', 'regurgitation',
        'cinematic', 'violence', 'gives', 'brutal', 'birth', 'an',
        'unlikely', 'likable',