<a href="https://colab.research.google.com/github/Shayshu-NR/APS360-Final-Project/blob/main/APS360_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Team 1



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import torchtext
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms, utils
from textblob import TextBlob

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
 ! pip install -q kaggle

In [None]:
! pip install -U textblob

In [None]:
! python -m textblob.download_corpora

In [None]:
from google.colab import files

files.upload()

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! ls ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle datasets download -d stefanoleone992/imdb-extensive-dataset 

In [None]:
! unzip /content/imdb-extensive-dataset.zip -d '/root/datasets'

# Baseline Model


Extract the necessary info, clean up the data:

In [None]:
movies = pd.read_csv('/root/datasets/IMDb movies.csv', index_col=False)

In [None]:
movies

In [None]:
catcols = ['genre', 'budget', 'country', 'duration', 'year', 'avg_vote']
df = movies[catcols]
not_missing = df.dropna()

In [None]:
not_missing = not_missing[pd.to_numeric(not_missing['year']) > 1980]

In [None]:
# Keep only the rows whose budget string starts with '$'.
# `.copy()` gives an independent frame, so the column assignments below do not
# write into a slice of `not_missing` (pandas SettingWithCopyWarning).
no_foreign = not_missing[not_missing['budget'].str[0] == '$'].copy()

# `regex=False`: '$' must be removed as a literal character, not interpreted as
# the end-of-string anchor of a regular expression.
no_foreign['budget'] = no_foreign['budget'].str.replace('$', '', regex=False)

# Cast every continuous column (and the target) to float.
for col in ['budget', 'duration', 'year', 'avg_vote']:
    no_foreign[col] = no_foreign[col].astype('float')

In [None]:
no_foreign

Normalize the continuous data:

In [None]:
# Min-max scale the continuous columns (and the target) to the range [0, 1].
cont_features = no_foreign[['budget', 'duration', 'year', 'avg_vote']]

# Work on a copy: a plain `normalized = no_foreign` only creates a second name
# for the same frame, so the raw values in `no_foreign` would be overwritten.
normalized = no_foreign.copy()

for col in ['budget', 'duration', 'year', 'avg_vote']:
    col_min = normalized[col].min()
    col_max = normalized[col].max()
    normalized[col] = (normalized[col] - col_min) / (col_max - col_min)

In [None]:
normalized_labels  = normalized[['avg_vote']]
normalized = normalized[['genre', 'budget', 'country', 'duration', 'year']]

In [None]:
normalized


In [None]:
data = pd.get_dummies(normalized)

In [None]:
datanp = data.values.astype(np.float32)
labelnp = normalized_labels.values.astype(np.float32)

Create training and testing data sets:

In [None]:
np.random.seed(1000)

# Shuffle features and labels with ONE shared permutation. Two separate
# `np.random.shuffle` calls draw two different orderings, which pairs every
# feature row with some other movie's rating.
perm = np.random.permutation(len(datanp))
datanp = datanp[perm]
labelnp = labelnp[perm]

# 90% of the rows for training, the remaining 10% for testing.
train_index = int(len(datanp) * 0.9)

train_set = datanp[:train_index]
test_set = datanp[train_index:]

train_label = labelnp[:train_index]
test_label = labelnp[train_index:]

In [None]:
test_set.shape

In [None]:
test_label.shape

Use a premade scikit-learn model (`SGDRegressor`) to make baseline predictions:

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot learning-curve diagnostics for ``estimator`` on ``(X, y)``.

    Three panels are drawn on ``axes[0]``, ``axes[1]`` and ``axes[2]``:

    1. training score and cross-validation score vs. number of training
       examples,
    2. fit time vs. number of training examples,
    3. cross-validation score vs. fit time.

    Lines show the mean along axis 1 of the arrays returned by
    ``learning_curve``; the translucent bands span one standard deviation
    (also along axis 1) either side of that mean.

    Parameters
    ----------
    estimator : estimator instance
        Forwarded to ``learning_curve``.

    title : str
        Title of the first panel.

    X : array-like
        Feature matrix forwarded to ``learning_curve``.

    y : array-like
        Targets forwarded to ``learning_curve``.

    axes : indexable holding three matplotlib axes, default=None
        Axes to draw on. When None, a new 1x3 figure of size (20, 5) is
        created.

    ylim : tuple (ymin, ymax), default=None
        Y-limits for the first panel only.

    cv : default=None
        Cross-validation strategy, forwarded unchanged to ``learning_curve``.

    n_jobs : int or None, default=None
        Forwarded unchanged to ``learning_curve``.

    train_sizes : array-like
        Training-set sizes, forwarded unchanged to ``learning_curve``.
        (default: np.linspace(0.1, 1.0, 5))

    Returns
    -------
    The ``matplotlib.pyplot`` module (``plt``).
    """
    def shade(ax, x, centre, spread, **style):
        # Translucent +/- one-standard-deviation band around a mean curve.
        ax.fill_between(x, centre - spread, centre + spread, alpha=0.1,
                        **style)

    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    # ---- panel 1 set-up ---------------------------------------------------
    curve_ax = axes[0]
    curve_ax.set_title(title)
    if ylim is not None:
        curve_ax.set_ylim(*ylim)
    curve_ax.set_xlabel("Training examples")
    curve_ax.set_ylabel("Score")

    # ---- run the cross-validated fits -------------------------------------
    curve = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                           train_sizes=train_sizes,
                           return_times=True)
    train_sizes, train_scores, test_scores, fit_times, _ = curve

    train_scores_mean = train_scores.mean(axis=1)
    train_scores_std = train_scores.std(axis=1)
    test_scores_mean = test_scores.mean(axis=1)
    test_scores_std = test_scores.std(axis=1)
    fit_times_mean = fit_times.mean(axis=1)
    fit_times_std = fit_times.std(axis=1)

    # ---- panel 1: learning curve ------------------------------------------
    curve_ax.grid()
    shade(curve_ax, train_sizes, train_scores_mean, train_scores_std,
          color="r")
    shade(curve_ax, train_sizes, test_scores_mean, test_scores_std,
          color="g")
    curve_ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
                  label="Training score")
    curve_ax.plot(train_sizes, test_scores_mean, 'o-', color="g",
                  label="Cross-validation score")
    curve_ax.legend(loc="best")

    # ---- panel 2: n_samples vs fit time -----------------------------------
    scale_ax = axes[1]
    scale_ax.grid()
    scale_ax.plot(train_sizes, fit_times_mean, 'o-')
    shade(scale_ax, train_sizes, fit_times_mean, fit_times_std)
    scale_ax.set_xlabel("Training examples")
    scale_ax.set_ylabel("fit_times")
    scale_ax.set_title("Scalability of the model")

    # ---- panel 3: fit time vs score ---------------------------------------
    perf_ax = axes[2]
    perf_ax.grid()
    perf_ax.plot(fit_times_mean, test_scores_mean, 'o-')
    shade(perf_ax, fit_times_mean, test_scores_mean, test_scores_std)
    perf_ax.set_xlabel("fit_times")
    perf_ax.set_ylabel("Score")
    perf_ax.set_title("Performance of the model")

    return plt

In [None]:
from sklearn import linear_model
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

# Baseline: a linear model fitted with stochastic gradient descent.
model = linear_model.SGDRegressor()

# The labels are stored as an (n, 1) column; flatten them so the estimator gets
# a 1-D target and the error arithmetic below stays element-wise.
y_train = train_label.ravel()
y_test = test_label.ravel()
model.fit(train_set, y_train)

result = model.predict(test_set)

# A prediction counts as correct when it is within 0.1 of the normalized rating.
how_close = np.abs(result - y_test)
accuracy = int(np.sum(how_close < 0.1))
# Sum of squared errors; divided by the sample count when printed.
loss = float(np.sum(how_close ** 2.0))

# NOTE(review): the learning curve is cross-validated on the test split only.
plot_learning_curve(model, "Test", test_set, y_test)
print('Testing data performance', 100 * (accuracy / len(result)), '% correctly predicted')
print('Testing data performance', (loss / len(result)), 'Loss')

The testing accuracy of this model (the share of predictions within 0.1 of the normalized rating) was 48.13%, with a mean squared error of 0.024.

# Primary Model

In [None]:
class MovieDataset(torch.utils.data.Dataset):
    """Pair an indexable collection of records with an indexable collection of labels.

    ``dataset[i]`` yields ``(data[i], labels[i])`` and ``len(dataset)`` is the
    length of ``data``.
    """

    def __init__(self, data, labels):
        # Both containers are stored as given; nothing is copied or converted.
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Return the record and its label at the same position.
        return self.data[index], self.labels[index]

In [None]:
movies = pd.read_csv('/root/datasets/IMDb movies.csv', index_col=False)

In [None]:
catcols = ['genre', 'budget', 'country', 'duration', 'year', 'actors', 'director', 'description', 'avg_vote']
df = movies[catcols]
not_missing = df.dropna()

In [None]:
# Count the movies released in each of the 114 years starting at 1906, and
# build a flat list holding one entry per movie for the histogram.
FIRST_YEAR = 1906
N_YEARS = 114

# Every year that is inspected.
its = [FIRST_YEAR + offset for offset in range(N_YEARS)]

# Number of rows whose `year` equals each inspected year.
year_dis = [not_missing[not_missing.year == release_year].shape[0]
            for release_year in its]

# Each year repeated once per movie released in it.
years = [release_year
         for release_year, n_movies in zip(its, year_dis)
         for _ in range(n_movies)]

j = len(year_dis)

In [None]:
test = np.array(years)

print(len(np.where(test > 1980)[0]) / len(test))

In [None]:
plt.hist(years, bins=114)
plt.xlabel("Year")
plt.ylabel("Number of movies")
plt.show()

In [None]:
# Despite its name, `unq_genres` holds the sorted distinct `avg_vote` values.
unq_genres = np.sort(not_missing['avg_vote'].unique())
nor_gen = []

# Number of movies at each distinct rating.
gen_breakd = [not_missing[not_missing.avg_vote == rating].shape[0]
              for rating in unq_genres]

# Each rating repeated once per movie that received it (histogram input).
votes = [rating
         for rating, n_movies in zip(unq_genres, gen_breakd)
         for _ in range(n_movies)]

j = len(gen_breakd)

# total = np.sum(gen_breakd)

# for i in gen_breakd:
#   nor_gen.append(100.0 * i / total)

In [None]:
plt.hist(votes, bins=len(unq_genres))
plt.xlabel("Rating")
plt.ylabel("Number of movies")
plt.show()

In [None]:
not_missing = not_missing[pd.to_numeric(not_missing['year']) > 1980]

In [None]:
# Keep only the rows whose budget string starts with '$'.
# `.copy()` gives an independent frame, so the column assignments below do not
# write into a slice of `not_missing` (pandas SettingWithCopyWarning).
no_foreign = not_missing[not_missing['budget'].str[0] == '$'].copy()

# `regex=False`: '$' must be removed as a literal character, not interpreted as
# the end-of-string anchor of a regular expression.
no_foreign['budget'] = no_foreign['budget'].str.replace('$', '', regex=False)

# Cast every continuous column (and the target) to float.
for col in ['budget', 'duration', 'year', 'avg_vote']:
    no_foreign[col] = no_foreign[col].astype('float')

In [None]:
# Min-max scale the continuous columns (and the target) to [0, 1] and add two
# sentiment features derived from the plot description.
cont_features = no_foreign[['budget', 'duration', 'year', 'avg_vote']]

# Work on a copy. With a plain `normalized = no_foreign` both names refer to the
# same frame, so `no_foreign` would end up holding scaled values and its
# min()/max() (read later to scale new inputs and to un-normalize predictions)
# would collapse to 0 and 1.
normalized = no_foreign.copy()

for col in ['budget', 'duration', 'year', 'avg_vote']:
    col_min = normalized[col].min()
    col_max = normalized[col].max()
    normalized[col] = (normalized[col] - col_min) / (col_max - col_min)

# Run TextBlob once per description and reuse the result for both features:
# element 0 of `sentiment` is stored as polarity, element 1 as subjectivity.
sentiments = [TextBlob(text).sentiment for text in normalized['description']]
normalized['polarity'] = [sentiment[0] for sentiment in sentiments]
normalized['subjectivity'] = [sentiment[1] for sentiment in sentiments]


In [None]:
def _actor_at(cast, position):
    """Return the ``position``-th comma-separated entry of ``cast``, or "" if there is none."""
    names = cast.split(",")
    return names[position] if len(names) >= position + 1 else ""


# Split the comma-separated `actors` string into its first three entries.
# Entries are taken as-is (no whitespace stripping).
for column, position in [('lead_actor', 0),
                         ('supporting_actor_1', 1),
                         ('supporting_actor_2', 2)]:
    normalized[column] = normalized['actors'].apply(_actor_at, args=(position,))

In [None]:
normalized = normalized[['avg_vote','genre', 'budget', 'country', 'duration', 'year', 'polarity', 'subjectivity']]

In [None]:
normalized

In [None]:
data = pd.get_dummies(normalized)

In [None]:
data

In [None]:
datanp = data.values.astype(np.float32)

In [None]:
datanp.shape

In [None]:
# set the numpy seed for reproducibility
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.seed.html
np.random.seed(50)

# Shuffle whole rows in place; the rating lives in column 0 of every row, so
# features and target stay aligned.
np.random.shuffle(datanp)

# 70% train / 15% validation / 15% test.
train_index = int(len(datanp) * 0.70)
val_index = int(len(datanp) * 0.85)

train_set = datanp[:train_index]
val_set = datanp[train_index:val_index]
test_set = datanp[val_index:]

# The target is the rating column (column 0) of each split, not the full
# feature matrix.
train_label = train_set[:, 0]
val_label = val_set[:, 0]
test_label = test_set[:, 0]


In [None]:
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True) 
val_loader = torch.utils.data.DataLoader(val_set, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=32, shuffle=True)

In [None]:
from torch import nn

class AutoEncoder(nn.Module):
    """Fully-connected autoencoder with one hidden layer on each side.

    The encoder maps ``input_dim -> hidden_dim -> latent_dim`` and the decoder
    mirrors it back to ``input_dim``, ending in a sigmoid so every output lies
    in (0, 1).

    Args:
        input_dim: width of one input record (default 1958, the value the
            layers were originally hard-coded to).
        hidden_dim: width of the intermediate layers (default 1000).
        latent_dim: width of the bottleneck (default 500).
    """

    def __init__(self, input_dim=1958, hidden_dim=1000, latent_dim=500):
        # Initialise nn.Module before attaching any attribute to the instance.
        super(AutoEncoder, self).__init__()
        # Used by callers to build checkpoint file names.
        self.name = "AutoEncoder"
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid()  # get to the range (0, 1)
        )

    def forward(self, x):
        """Encode ``x`` to the bottleneck and decode it back to input width."""
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [None]:
def zero_out_rating(records):
    """Set column 0 of ``records`` to zero in place and return ``records``.

    The caller's object is modified; pass a copy/clone to keep the original.
    """
    rating_col = 0
    records[:, rating_col] = 0
    return records

In [None]:
# For the autoencoder
def get_accuracy(model, data_loader):
    """Return the fraction of records whose predicted rating is within 0.10 of the truth.

    For every mini-batch the rating (column 0) is zeroed on a clone, the model
    reconstructs the batch, and column 0 of the output is compared with
    column 0 of the untouched input.

    Args:
        model: callable mapping a 2-D tensor batch to a tensor of the same
            layout.
        data_loader: iterable of 2-D tensor mini-batches.

    Returns:
        ``correct / total`` as a float, or 0.0 when the loader yields no
        records (instead of raising ZeroDivisionError).
    """
    correct = 0
    total = 0
    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
        for item in data_loader:  # minibatches
            predicted = model(zero_out_rating(item.clone()))[:, 0]
            actual = item[:, 0]
            # Count the records whose absolute error is at most 0.10.
            correct += int(((predicted - actual).abs() <= 0.10).sum())
            total += predicted.shape[0]
    if total == 0:
        return 0.0
    return correct / total

In [None]:
def get_model_name(name, batch_size, learning_rate, epoch):
    """Build the checkpoint file name for one model / hyper-parameter / epoch combination."""
    return f"model_{name}_bs{batch_size}_lr{learning_rate}_epoch{epoch}"

In [None]:
def train(model, train_loader, valid_loader, batch_size=32, num_epochs=5, learning_rate=1e-4):
    """Train ``model`` to predict the rating column from the remaining features.

    Each batch is cloned, column 0 (the rating) of the clone is zeroed, and the
    model reconstructs the record. Only the reconstructed column 0 is scored
    against the true rating with mean-squared error. After every epoch the
    weights are saved under ``get_model_name(...)`` and the loss and accuracy
    curves are plotted.

    Args:
        model: network to optimise; its ``name`` attribute is used in the
            checkpoint file name.
        train_loader: iterable of 2-D tensor batches used for gradient updates.
        valid_loader: iterable of 2-D tensor batches used for validation only.
        batch_size: only used to build the checkpoint file name.
        num_epochs: number of passes over ``train_loader``.
        learning_rate: step size for the Adam optimizer.

    Returns:
        None. Results are reported through plots, prints and saved checkpoints.
    """
    torch.manual_seed(42)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    iters, losses = [], []
    val_iters, val_losses = [], []
    acc_iters, train_acc, val_acc = [], [], []

    for epoch in range(num_epochs):

        for data in train_loader:
            datam = zero_out_rating(data.clone())  # hide the rating from the model
            recon = model(datam)
            loss = criterion(recon[:, 0], data[:, 0])
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # MSELoss already averages over the batch, so the value is recorded
            # as-is rather than divided by batch_size a second time.
            iters.append(len(iters))
            losses.append(float(loss))

        # Validation loss at the end of each epoch; no gradients are needed.
        with torch.no_grad():
            for data in valid_loader:
                datam = zero_out_rating(data.clone())
                recon = model(datam)
                loss = criterion(recon[:, 0], data[:, 0])
                val_iters.append(len(val_iters))
                val_losses.append(float(loss))

        # Checkpoint the weights reached after this epoch.
        model_path = get_model_name(model.name, batch_size, learning_rate, epoch)
        torch.save(model.state_dict(), model_path)

        # Training loss per iteration, accumulated over all epochs so far.
        plt.title("Training Losses")
        plt.plot(iters, losses, label="Train")
        plt.xlabel("Iterations")
        plt.ylabel("Loss")
        plt.show()

        # Validation loss per validation batch, accumulated over all epochs so far.
        plt.title("Validation Losses")
        plt.plot(val_iters, val_losses, label="Validation")
        plt.xlabel("Validation Iterations")
        plt.ylabel("Loss")
        plt.show()

        # Training and validation accuracy, one point per epoch.
        acc_iters.append(len(acc_iters))
        val_acc.append(get_accuracy(model, valid_loader))
        train_acc.append(get_accuracy(model, train_loader))
        plt.title("Training Curve")
        plt.plot(acc_iters, train_acc, label="Training")
        plt.plot(acc_iters, val_acc, label="Validation")
        plt.xlabel("Epoch")
        plt.ylabel("Accuracy")
        plt.legend(loc='best')
        plt.show()

    # With num_epochs == 0 there is nothing to report (and nothing to index).
    if train_acc:
        print("Final Training Accuracy: {}".format(train_acc[-1]))
        print("Final Validation Accuracy: {}".format(val_acc[-1]))

In [None]:
MyModel = AutoEncoder()
train(MyModel, train_loader, val_loader,32 ,10, 1e-5)

In [None]:
# Reload the checkpoint saved after the last epoch (index 9) of the
# batch-size-32, lr=1e-5 training run and score it on the held-out test split.
model = AutoEncoder()
# Build the file name with the same helper that training used to save it.
model_path = get_model_name(model.name, 32, 1e-5, 9)
state = torch.load(model_path)
model.load_state_dict(state)
# Accuracy is a per-record count, so the batch size does not change the result;
# 32 simply avoids one forward pass per record.
test_loader = torch.utils.data.DataLoader(test_set, batch_size=32)
test_accuracy = get_accuracy(model, test_loader)
print("Test Accuracy: ", test_accuracy)

In [None]:
# Column statistics used to scale the hand-entered demo movies.
# These look-ups are only meaningful while `no_foreign` still holds the raw
# (un-normalized) values.
year_min = no_foreign['year'].min()
dur_min = no_foreign['duration'].min()
dur_span = no_foreign['duration'].max() - dur_min
bud_min = no_foreign['budget'].min()
bud_span = no_foreign['budget'].max() - bud_min

# Godzilla Vs Kong movie from IMDb
# https://www.imdb.com/title/tt5034838/
godzilla_desc = 'The epic next chapter in the cinematic Monsterverse pits two of the greatest icons in motion picture history against one another - the fearsome Godzilla and the mighty Kong - with humanity caught in the balance.'
# Years are scaled with 2021 as the upper bound.
godzilla_year = (2021 - year_min) / (2021 - year_min)
godzilla_dur = (113 - dur_min) / dur_span
godzilla_bud = (168000000 - bud_min) / bud_span
# `sentiment` unpacks to the two values the original read as [0] and [1].
godzilla_pol, godzilla_sub = TextBlob(godzilla_desc).sentiment

# AI Adventures
ai_desc = 'A wild ride filled with triumph and dispair. Four students, one project, will they be able to pass this course?'
ai_year = (1981 - year_min) / (2021 - year_min)
ai_dur = (8 - dur_min) / dur_span
ai_bud = (16000 - bud_min) / bud_span
ai_pol, ai_sub = TextBlob(ai_desc).sentiment

# One row per demo movie, as wide as the model input.
demo_tensor = torch.zeros(3, 1958)

In [None]:
# Justice League demo record, scaled with the min/max of the raw columns of
# `no_foreign` (only meaningful while that frame holds un-normalized values).
jl_desc = "Determined to ensure Superman's ultimate sacrifice was not in vain, Bruce Wayne aligns forces with Diana Prince with plans to recruit a team of metahumans to protect the world from an approaching threat of catastrophic proportions."
jl_year = 1  # top of the normalized year range

jl_dur_min = no_foreign['duration'].min()
jl_dur = (242 - jl_dur_min) / (no_foreign['duration'].max() - jl_dur_min)

jl_bud_min = no_foreign['budget'].min()
jl_bud = (300000000 - jl_bud_min) / (no_foreign['budget'].max() - jl_bud_min)

# `sentiment` unpacks to the two values the original read as [0] and [1].
jl_pol, jl_sub = TextBlob(jl_desc).sentiment

In [None]:
# Fill in tensor with cleaned data.
#
# Look the continuous columns up by name in `data` (the one-hot encoded frame
# the model was trained on) instead of hard-coding positions. Column 0 of that
# frame is `avg_vote`, the rating the model predicts, so writing budget at 0,
# duration at 1, ... shifts every feature one slot to the left.
continuous_cols = ['budget', 'duration', 'year', 'polarity', 'subjectivity']
col_index = {name: data.columns.get_loc(name) for name in continuous_cols}

# NOTE(review): the one-hot positions (presumably a genre column and a country
# column) are kept exactly as they were; verify them against `data.columns`.
demo_movies = [
    # Godzilla vs. Kong
    ({'budget': godzilla_bud, 'duration': godzilla_dur, 'year': godzilla_year,
      'polarity': godzilla_pol, 'subjectivity': godzilla_sub}, (95, 1588)),
    # AI Adventures
    ({'budget': ai_bud, 'duration': ai_dur, 'year': ai_year,
      'polarity': ai_pol, 'subjectivity': ai_sub}, (307, 1588)),
    # Justice League
    ({'budget': jl_bud, 'duration': jl_dur, 'year': jl_year,
      'polarity': jl_pol, 'subjectivity': jl_sub}, (95, 1588)),
]

# Start from all zeros so the cell is re-runnable and the rating slot stays 0,
# matching how the rating is hidden from the model during training.
demo_tensor.zero_()
for row, (features, one_hot_positions) in enumerate(demo_movies):
    for name, value in features.items():
        demo_tensor[row, col_index[name]] = value
    for position in one_hot_positions:
        demo_tensor[row, position] = 1

In [None]:
# Run the trained model on the demo records and keep element 0 of each output,
# i.e. the predicted (normalized) avg_vote.
model = AutoEncoder()
# Build the checkpoint name with the same helper that training used to save it.
model_path = get_model_name(model.name, 32, 1e-5, 9)
state = torch.load(model_path)
model.load_state_dict(state)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    godzilla_result = model(demo_tensor[0])[0]
    ai_result = model(demo_tensor[1])[0]
    jl_result = model(demo_tensor[2])[0]

In [None]:
# Undo the min-max scaling of avg_vote: rating = prediction * (max - min) + min.
# Only meaningful while `no_foreign['avg_vote']` still holds the raw ratings.
vote_min = no_foreign['avg_vote'].min()
vote_span = no_foreign['avg_vote'].max() - vote_min

godzilla_rating = (godzilla_result * vote_span) + vote_min
ai_rating = (ai_result * vote_span) + vote_min
jl_rating = (jl_result * vote_span) + vote_min

In [None]:
# Final results
print("Godzilla Vs. Kong predicted rating:", round(float(godzilla_rating),1))
print("Justice league predicted rating:", round(float(jl_rating), 1))
print("AI Adventures predicted rating:", round(float(ai_rating),1))