In [1]:
#!pip show torch
#!pip show torchtext

In [2]:
# Import necessary libraries
import pandas as pd


In [3]:
df = pd.read_csv("data/movies_2015_2023_preprocessed_genre.csv")
df.head()

Unnamed: 0,title,release_year,language,genre,overview,vote_average,vote_count,popularity,cleaned_overview
0,Pad Man,2018,Indisch,Humor,upon realizing extent woman affected menses se...,7.42,200.0,7.036,upon realizing extent woman affected menses se...
1,Tamasha,2015,Indisch,Humor,meeting vacation ved tara sense connection vow...,6.72,141.0,8.77,meeting vacation ved tara sense connection vow...
2,Tu Jhoothi Main Makkaar,2023,Indisch,Humor,earn extra cash mickey help couple break life ...,6.253,144.0,10.045,earn extra cash mickey help couple break life ...
3,Hindi Medium,2017,Indisch,Humor,mita raj batra affluent couple delhi chandni c...,7.3,166.0,7.001,mita raj batra affluent couple delhi chandni c...
4,Dilwale,2015,Indisch,Humor,raj mafia member one day meet girl meera chasi...,6.648,301.0,11.501,raj mafia member one day meet girl meera chasi...


In [4]:
# Check for missing values and remove them
df_cleaned = df.dropna(subset=['cleaned_overview', 'language', 'genre'])
df_cleaned.head()

Unnamed: 0,title,release_year,language,genre,overview,vote_average,vote_count,popularity,cleaned_overview
0,Pad Man,2018,Indisch,Humor,upon realizing extent woman affected menses se...,7.42,200.0,7.036,upon realizing extent woman affected menses se...
1,Tamasha,2015,Indisch,Humor,meeting vacation ved tara sense connection vow...,6.72,141.0,8.77,meeting vacation ved tara sense connection vow...
2,Tu Jhoothi Main Makkaar,2023,Indisch,Humor,earn extra cash mickey help couple break life ...,6.253,144.0,10.045,earn extra cash mickey help couple break life ...
3,Hindi Medium,2017,Indisch,Humor,mita raj batra affluent couple delhi chandni c...,7.3,166.0,7.001,mita raj batra affluent couple delhi chandni c...
4,Dilwale,2015,Indisch,Humor,raj mafia member one day meet girl meera chasi...,6.648,301.0,11.501,raj mafia member one day meet girl meera chasi...


In [5]:
# Combine the 'cleaned_overview' and 'genre' columns properly to avoid the warning
df_cleaned = df_cleaned.copy()
df_cleaned.loc[:, 'combined_text'] = df_cleaned['cleaned_overview'].fillna('') + ' ' + df_cleaned['genre'].fillna('')


In [6]:
df_cleaned['cleaned_overview'] = df_cleaned['cleaned_overview'].fillna('')

In [7]:
# Define features (X) and target (y)
X = df_cleaned['combined_text']
y = df_cleaned['language']

In [8]:
from sklearn.preprocessing import LabelEncoder
# Convert the 'language' column to numeric labels using LabelEncoder
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['language'])

In [9]:
# Create a new column 'label' to store the encoded language labels
df_cleaned['label'] = label_encoder.fit_transform(df_cleaned['language'])

In [10]:
# Create value tuples (label, cleaned_overview)
value_tuples = []
for _, row in df_cleaned.iterrows():
    value_tuples.append((row['label'], row['cleaned_overview']))

# Print the length and a few examples
print(f"Total number of value tuples: {len(value_tuples)}")
print("First tuple:", value_tuples[0])
print("Last tuple:", value_tuples[-1])

Total number of value tuples: 15499
First tuple: (3, 'upon realizing extent woman affected menses set create sanitary pad machine provide inexpensive sanitary pad woman rural india')
Last tuple: (4, 'several european underground director turn ancient rule mankind inside shocking goresoaked interpretation god law bible world ending commandment dying')


##### using a scikit-learn-based model

In [11]:
from sklearn.model_selection import train_test_split
# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##### building a custom PyTorch classifier

In [12]:
from sklearn.model_selection import train_test_split

# Split the data into 80% training and 20% testing
#train_value_tuples, test_value_tuples = train_test_split(value_tuples, test_size=0.2, random_state=42)

# Print the size of the training and testing sets
#print(f"Training set size: {len(train_value_tuples)}")
#print(f"Testing set size: {len(test_value_tuples)}")


## Preparing data processing pipelines

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=30000)  # Limiting to top 5000 features we can use 30000 like knn to see the difference

#custom PyTorch

#X_train_tfidf = vectorizer.fit_transform([text for label, text in train_value_tuples])
#X_test_tfidf = vectorizer.transform([text for label, text in test_value_tuples])

# using a scikit-learn-based model

# Fit the vectorizer on the training data and transform it
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data using the fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)

In [14]:
import torch
# Converting TF-IDF matrices to dense tensors
X_train_tensor = torch.tensor(X_train_tfidf.toarray(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_tfidf.toarray(), dtype=torch.float32)

### Create a Custom Dataset Class

In [15]:
from torch.utils.data import Dataset, DataLoader

class MovieDataset(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data

    def __len__(self):
        return len(self.X_data)

    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]

# Convert y_train and y_test to tensors
y_train_tensor = torch.tensor(label_encoder.transform(y_train.values), dtype=torch.long)
y_test_tensor = torch.tensor(label_encoder.transform(y_test.values), dtype=torch.long)

# Create Dataset objects
train_dataset = MovieDataset(X_train_tensor, y_train_tensor)
test_dataset = MovieDataset(X_test_tensor, y_test_tensor)

# Create DataLoader objects
BATCH_SIZE = 64
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Define the PyTorch Model
define a simple feed-forward neural network model

In [16]:
#import torch.nn as nn
#import torch.nn.functional as F

#class TextClassificationModel(nn.Module):
#    def __init__(self, input_size, num_classes):
#        super(TextClassificationModel, self).__init__()
#        self.fc1 = nn.Linear(input_size, 128)
#        self.fc2 = nn.Linear(128, 64)
#        self.fc3 = nn.Linear(64, num_classes)
#
#    def forward(self, x):
#        x = F.relu(self.fc1(x))
#        x = F.relu(self.fc2(x))
#        x = self.fc3(x)
#        return x

#### Reduce Model Complexity
We will start by simplifying the model. A smaller network reduces the risk of overfitting, particularly when the dataset size is relatively small for a complex model

In [17]:
import torch.nn as nn
import torch.nn.functional as F

class TextClassificationModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(TextClassificationModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)  # Reduced the number of neurons
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [18]:
# Instantiate the model
input_size = X_train_tfidf.shape[1]
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(input_size, num_classes)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))


TextClassificationModel(
  (fc1): Linear(in_features=28191, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=6, bias=True)
)

## Define the Loss Function and Optimizer
We’ll use cross-entropy loss for this multi-class classification problem and an optimizer like Adam for optimization.

In [19]:
import torch.optim as optim

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)
#optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)  # Adding weight decay



## Training the Model
we feed batches of data to the model and optimize it using backpropagation

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(model, train_loader, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

        epoch_loss = total_loss / len(train_loader)
        epoch_acc = correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')

# Train the model for 10 epochs
train_model(model, train_loader, criterion, optimizer, num_epochs=10)


Epoch [1/10], Loss: 1.6445, Accuracy: 0.3147
Epoch [2/10], Loss: 1.0544, Accuracy: 0.6284
Epoch [3/10], Loss: 0.4400, Accuracy: 0.8906
Epoch [4/10], Loss: 0.1525, Accuracy: 0.9717
Epoch [5/10], Loss: 0.0562, Accuracy: 0.9937
Epoch [6/10], Loss: 0.0239, Accuracy: 0.9986
Epoch [7/10], Loss: 0.0124, Accuracy: 0.9996
Epoch [8/10], Loss: 0.0076, Accuracy: 0.9998
Epoch [9/10], Loss: 0.0050, Accuracy: 0.9998
Epoch [10/10], Loss: 0.0036, Accuracy: 0.9998


### Evaluate the Model
After training, we can evaluate our model’s performance on the test set.

In [21]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.4f}')

# Evaluate the model
evaluate_model(model, test_loader)


Test Accuracy: 0.7129


In [22]:
## without Reduce Model Complexity Test Accuracy: 0.7232
## with Reduce Model Complexity and L2 Regularization (Weight Decay) Test Accuracy: 0.6900
## with  Reduce Model Complexity Test Accuracy: 0.7300

In [23]:
import torch.optim as optim

# List of batch sizes to try
batch_sizes = [32, 64, 128, 256]

# Loop over each batch size
for batch_size in batch_sizes:
    print(f"Training with batch size: {batch_size}")
    
    # Create new DataLoader with the current batch size
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

    # Reinitialize the model
    model = TextClassificationModel(input_size, num_classes)
    model.to(device)

    # Reinitialize the optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train the model for 10 epochs with the current batch size
    train_model(model, train_loader, criterion, optimizer, num_epochs=10)

    # Evaluate the model on the test set
    evaluate_model(model, test_loader)


Training with batch size: 32
Epoch [1/10], Loss: 1.5310, Accuracy: 0.3797
Epoch [2/10], Loss: 0.8077, Accuracy: 0.7384
Epoch [3/10], Loss: 0.3014, Accuracy: 0.9237
Epoch [4/10], Loss: 0.0952, Accuracy: 0.9823
Epoch [5/10], Loss: 0.0302, Accuracy: 0.9974
Epoch [6/10], Loss: 0.0116, Accuracy: 0.9995
Epoch [7/10], Loss: 0.0061, Accuracy: 0.9997
Epoch [8/10], Loss: 0.0036, Accuracy: 0.9998
Epoch [9/10], Loss: 0.0027, Accuracy: 0.9998
Epoch [10/10], Loss: 0.0018, Accuracy: 0.9999
Test Accuracy: 0.6977
Training with batch size: 64
Epoch [1/10], Loss: 1.6111, Accuracy: 0.3293
Epoch [2/10], Loss: 0.8806, Accuracy: 0.6930
Epoch [3/10], Loss: 0.3889, Accuracy: 0.9006
Epoch [4/10], Loss: 0.1607, Accuracy: 0.9698
Epoch [5/10], Loss: 0.0641, Accuracy: 0.9922
Epoch [6/10], Loss: 0.0276, Accuracy: 0.9983
Epoch [7/10], Loss: 0.0140, Accuracy: 0.9995
Epoch [8/10], Loss: 0.0085, Accuracy: 0.9997
Epoch [9/10], Loss: 0.0057, Accuracy: 0.9998
Epoch [10/10], Loss: 0.0042, Accuracy: 0.9998
Test Accuracy: 0.6

KeyboardInterrupt: 

In [25]:
import torch.optim as optim

# Define different weight decay values to experiment with
weight_decay_values = [0, 1e-4, 1e-3, 1e-2]

# Loop over the different weight decay values
for weight_decay in weight_decay_values:
    print(f"Training with weight decay: {weight_decay}")
    
    # Reinitialize the model
    model = TextClassificationModel(input_size, num_classes)
    model.to(device)

    # Reinitialize the optimizer with the current weight decay
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=weight_decay)

    # Train the model for 10 epochs with the current weight decay
    train_model(model, train_loader, criterion, optimizer, num_epochs=10)

    # Evaluate the model on the test set
    evaluate_model(model, test_loader)


Training with weight decay: 0


Epoch [1/10], Loss: 1.7589, Accuracy: 0.2510
Epoch [2/10], Loss: 1.5761, Accuracy: 0.4048
Epoch [3/10], Loss: 1.1870, Accuracy: 0.6119
Epoch [4/10], Loss: 0.7679, Accuracy: 0.7751
Epoch [5/10], Loss: 0.4931, Accuracy: 0.8498
Epoch [6/10], Loss: 0.3107, Accuracy: 0.9343
Epoch [7/10], Loss: 0.1852, Accuracy: 0.9749
Epoch [8/10], Loss: 0.1095, Accuracy: 0.9891
Epoch [9/10], Loss: 0.0681, Accuracy: 0.9948
Epoch [10/10], Loss: 0.0443, Accuracy: 0.9981
Test Accuracy: 0.7187
Training with weight decay: 0.0001
Epoch [1/10], Loss: 1.7630, Accuracy: 0.2534
Epoch [2/10], Loss: 1.6261, Accuracy: 0.4382
Epoch [3/10], Loss: 1.3084, Accuracy: 0.5308
Epoch [4/10], Loss: 0.9315, Accuracy: 0.6915
Epoch [5/10], Loss: 0.6084, Accuracy: 0.8312
Epoch [6/10], Loss: 0.3856, Accuracy: 0.9295
Epoch [7/10], Loss: 0.2336, Accuracy: 0.9683
Epoch [8/10], Loss: 0.1455, Accuracy: 0.9848
Epoch [9/10], Loss: 0.0943, Accuracy: 0.9929
Epoch [10/10], Loss: 0.0650, Accuracy: 0.9975
Test Accuracy: 0.7258
Training with weigh

In [26]:
learning_rates = [1e-4, 1e-3, 1e-2]

for lr in learning_rates:
    print(f"Training with learning rate: {lr}")
    
    # Reinitialize the model
    model = TextClassificationModel(input_size, num_classes)
    model.to(device)

    # Reinitialize the optimizer with the current learning rate
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Train the model with the current learning rate
    train_model(model, train_loader, criterion, optimizer, num_epochs=10)

    # Evaluate the model on the test set
    evaluate_model(model, test_loader)


Training with learning rate: 0.0001
Epoch [1/10], Loss: 1.7870, Accuracy: 0.2071
Epoch [2/10], Loss: 1.7790, Accuracy: 0.2071
Epoch [3/10], Loss: 1.7680, Accuracy: 0.2071
Epoch [4/10], Loss: 1.7534, Accuracy: 0.2822
Epoch [5/10], Loss: 1.7353, Accuracy: 0.4414
Epoch [6/10], Loss: 1.7118, Accuracy: 0.3534
Epoch [7/10], Loss: 1.6853, Accuracy: 0.3132
Epoch [8/10], Loss: 1.6548, Accuracy: 0.3032
Epoch [9/10], Loss: 1.6214, Accuracy: 0.3085
Epoch [10/10], Loss: 1.5851, Accuracy: 0.3189
Test Accuracy: 0.3242
Training with learning rate: 0.001
Epoch [1/10], Loss: 1.8130, Accuracy: 0.1712
Epoch [2/10], Loss: 1.6552, Accuracy: 0.3138
Epoch [3/10], Loss: 1.3906, Accuracy: 0.4531
Epoch [4/10], Loss: 1.0261, Accuracy: 0.6544
Epoch [5/10], Loss: 0.6626, Accuracy: 0.8012
Epoch [6/10], Loss: 0.3842, Accuracy: 0.9308
Epoch [7/10], Loss: 0.2089, Accuracy: 0.9705
Epoch [8/10], Loss: 0.1186, Accuracy: 0.9866
Epoch [9/10], Loss: 0.0710, Accuracy: 0.9939
Epoch [10/10], Loss: 0.0454, Accuracy: 0.9981
Test 

: 