In [23]:
#!pip show torch
#!pip show torchtext

In [24]:
# Import necessary libraries
import pandas as pd


In [None]:
# Load the preprocessed movie table and preview its first rows
MOVIES_CSV = "data/movies_2015_2023_preprocessed_genre.csv"
df = pd.read_csv(MOVIES_CSV)
df.head()

In [None]:
# Keep only the rows in which every column used downstream is present
required_columns = ['cleaned_overview', 'language', 'genre']
df_cleaned = df.dropna(subset=required_columns)
df_cleaned.head()

In [27]:
# Take an explicit copy first so the new column is written to an independent
# frame rather than to a slice of df (this is what avoids the pandas warning).
df_cleaned = df_cleaned.copy()
overview_text = df_cleaned['cleaned_overview'].fillna('')
genre_text = df_cleaned['genre'].fillna('')
# combined_text is "<overview> <genre>", joined by a single space
df_cleaned.loc[:, 'combined_text'] = overview_text + ' ' + genre_text


In [28]:
# Replace any missing overview with an empty string. df_cleaned was built by a
# dropna over 'cleaned_overview', so this is expected to be a safety net only.
overview_filled = df_cleaned['cleaned_overview'].fillna('')
df_cleaned['cleaned_overview'] = overview_filled

In [29]:
# Features (X) are the combined overview + genre text; the target (y) is the language
X, y = df_cleaned['combined_text'], df_cleaned['language']

In [30]:
from sklearn.preprocessing import LabelEncoder
# Build the encoder that maps each language string to an integer class id.
# It is fitted on df_cleaned rather than on the raw df: only df_cleaned has had
# rows with a missing 'language' dropped, and a NaN in the fitted column either
# raises or becomes a spurious extra class depending on the scikit-learn
# version. Fitting here also leaves the raw frame unmodified; the 'label'
# column itself is added to df_cleaned in the next cell.
label_encoder = LabelEncoder()
label_encoder.fit(df_cleaned['language'])

In [31]:
# Encode every row's language as an integer id and store it in a new 'label' column
encoded_languages = label_encoder.fit_transform(df_cleaned['language'])
df_cleaned['label'] = encoded_languages

In [None]:
# Create value tuples (label, cleaned_overview), one per row of df_cleaned.
# Zipping the two columns yields the same pairs as a row-by-row iterrows() loop
# without constructing a Series object for every row.
value_tuples = list(zip(df_cleaned['label'], df_cleaned['cleaned_overview']))

# Print the length and a few examples
print(f"Total number of value tuples: {len(value_tuples)}")
print("First tuple:", value_tuples[0])
print("Last tuple:", value_tuples[-1])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(value_tuples, test_size=0.2, random_state=42)
train_value_tuples, test_value_tuples = split_parts

# Report how many pairs ended up on each side of the split
print(f"Training set size: {len(train_value_tuples)}")
print(f"Testing set size: {len(test_value_tuples)}")


## Preparing data processing pipelines

In [34]:
# No earlier cell creates X_train / X_test / y_train / y_test, so on a fresh
# kernel this cell would stop with NameError. Build them from X and y using the
# same test_size and random_state as the value_tuples split; X, y and
# value_tuples all come from df_cleaned (same length and row order), so both
# splits select the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values by replacing NaN with an empty string
X_train = X_train.fillna('')
X_test = X_test.fillna('')


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorization with the vocabulary capped at 30000 terms.
# The vocabulary and IDF weights are learned from the training texts only and
# then applied unchanged to the test texts.
vectorizer = TfidfVectorizer(max_features=30000)
train_texts = [pair[1] for pair in train_value_tuples]
test_texts = [pair[1] for pair in test_value_tuples]
X_train_tfidf = vectorizer.fit_transform(train_texts)
X_test_tfidf = vectorizer.transform(test_texts)

In [36]:
import torch
# Convert the sparse TF-IDF matrices to dense float32 tensors.
# The cast to float32 happens while the matrix is still sparse, so no dense
# float64 copy of the (documents x features) matrix is ever materialised;
# the resulting tensor values are the same as casting after densifying.
X_train_tensor = torch.from_numpy(X_train_tfidf.astype('float32').toarray())
X_test_tensor = torch.from_numpy(X_test_tfidf.astype('float32').toarray())

### Create a Custom Dataset Class

In [37]:
from torch.utils.data import Dataset, DataLoader

class MovieDataset(Dataset):
    """Pair feature rows with their labels so a DataLoader can batch them.

    Args:
        X_data: Indexable, sized collection of feature rows.
        y_data: Indexable, sized collection of labels, one per row of X_data.

    Raises:
        ValueError: If X_data and y_data do not have the same length.
    """

    def __init__(self, X_data, y_data):
        # A length mismatch means features and labels came from different
        # splits; fail here instead of training on misaligned pairs.
        if len(X_data) != len(y_data):
            raise ValueError(
                f"X_data and y_data must have the same length, "
                f"got {len(X_data)} and {len(y_data)}"
            )
        self.X_data = X_data
        self.y_data = y_data

    def __len__(self):
        """Return the number of (features, label) pairs."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the (features, label) pair stored at ``index``."""
        return self.X_data[index], self.y_data[index]

# Build the label tensors from the same (label, text) tuples that produced the
# TF-IDF features, so row i of X_*_tensor always matches entry i of y_*_tensor.
# (y_train / y_test are not defined by any earlier cell; the tuples already
# carry the integer labels produced by label_encoder.)
y_train_tensor = torch.tensor([label for label, _ in train_value_tuples], dtype=torch.long)
y_test_tensor = torch.tensor([label for label, _ in test_value_tuples], dtype=torch.long)

# Create Dataset objects
train_dataset = MovieDataset(X_train_tensor, y_train_tensor)
test_dataset = MovieDataset(X_test_tensor, y_test_tensor)

# Create DataLoader objects; only the training data is reshuffled each epoch
BATCH_SIZE = 64
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Define the PyTorch Model
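Define a simple feed-forward neural network: two hidden layers (128 and 64 units) with ReLU activations, followed by a linear output layer with one score per language class.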

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class TextClassificationModel(nn.Module):
    """Feed-forward classifier: input_size -> 128 -> 64 -> num_classes.

    ReLU is applied after the first two linear layers; the last layer returns
    raw, un-normalised class scores.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        hidden_wide, hidden_narrow = 128, 64
        self.fc1 = nn.Linear(input_size, hidden_wide)
        self.fc2 = nn.Linear(hidden_wide, hidden_narrow)
        self.fc3 = nn.Linear(hidden_narrow, num_classes)

    def forward(self, x):
        """Return the class scores for the batch ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# Size the network from the data: one input per TF-IDF feature and one output
# per class known to the label encoder.
input_size = X_train_tfidf.shape[1]
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(input_size=input_size, num_classes=num_classes)
# Run on the GPU when one is available, otherwise stay on the CPU
model.to("cuda" if torch.cuda.is_available() else "cpu")


## Define the Loss Function and Optimizer
We use cross-entropy loss for this multi-class classification problem and the Adam optimizer (learning rate 0.001) to update the model's weights.

In [39]:
import torch.optim as optim

# Cross-entropy loss computed on the raw class scores returned by the model
criterion = nn.CrossEntropyLoss()

# Adam updates every model parameter with a learning rate of 1e-3
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


## Training the Model
We feed batches of data to the model and optimize its weights using backpropagation, reporting the average loss and the training accuracy after every epoch.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(model, train_loader, criterion, optimizer, num_epochs, device=None):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    After every epoch one line is printed with the mean batch loss and the
    accuracy measured on the training batches of that epoch.

    Args:
        model: The network to optimise; it is switched to training mode.
        train_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        criterion: Callable mapping ``(outputs, y_batch)`` to a scalar loss.
        optimizer: Optimizer that holds the parameters of ``model``.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a parameterless
            model), so batches always land where the model lives.

    Returns:
        None. An epoch that saw no batches reports ``nan`` for both metrics
        instead of raising ZeroDivisionError.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            # The predicted class is the index of the largest score in each row
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()
            num_batches += 1

        # Guard the averages so an empty loader does not divide by zero
        epoch_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_acc = correct / total if total else float("nan")
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')

# Run ten full passes over the training data
NUM_EPOCHS = 10
train_model(model, train_loader, criterion, optimizer, num_epochs=NUM_EPOCHS)


### Evaluate the Model
After training, we can evaluate our model’s performance on the test set.

In [None]:
def evaluate_model(model, test_loader, device=None):
    """Print the classification accuracy of ``model`` over ``test_loader``.

    Args:
        model: The network to evaluate; it is switched to evaluation mode.
        test_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a parameterless
            model), so the function does not depend on a ``device`` variable
            created by another cell.

    Returns:
        None. The accuracy is printed; an empty loader reports ``nan``
        instead of raising ZeroDivisionError.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0
    # Gradients are not needed for scoring, so skip tracking them
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            # The predicted class is the index of the largest score in each row
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    accuracy = correct / total if total else float("nan")
    print(f'Test Accuracy: {accuracy:.4f}')

# Report the accuracy on the held-out test batches
evaluate_model(model=model, test_loader=test_loader)
