In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import numpy as np
import xml.etree.ElementTree as ET
import math
import torch
from torch.autograd import Variable
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# dataset location
dataset_location = '/content/drive/MyDrive/Statistical Learning 20Fall/Final project'\
                    '/metacritic+starpower+holiday+revenue+screens+reviews'
location_split = '/content/drive/MyDrive/Statistical Learning 20Fall/Final project/'\
                    '/traindevtest_splits'
                  


In [None]:
train = set([])
dev = set([])
test = set([])
for file in os.listdir(location_split):
    file_path = os.path.join(location_split, file)
    for row in open(file_path):
        if file == 'train':
            train.add(row.strip())
        elif file == 'dev':
            dev.add(row.strip())
        else:
            test.add(row.strip())
print(len(train))
print(len(dev))
print(len(test))

1147
317
254


In [None]:
def find(collec, name, tree):
    '''
    :param collec:
    :param name:
    :param tree:
    :return:
    '''
    for elem in tree.iter():
        if elem.tag == name:
            item = elem.text
            if item not in collec:
                collec.append(item)

In [None]:
tags = set([])
for file in os.listdir(dataset_location):
    if file not in train:
      continue
    file_path = os.path.join(dataset_location, file)
    tree = ET.parse(file_path)
    for elem in tree.iter():
        tag = elem.tag.strip()
        if tag not in tags:
            tags.add(tag)

In [None]:
print(len(tags))
print(sorted(list(tags)))

39
['US_Gross', 'actor', 'actors', 'author', 'authors', 'christmas_release', 'company', 'critic', 'director', 'directors', 'genre', 'genres', 'highest_grossing_actor', 'highest_grossing_actors_present', 'independence_release', 'labor_release', 'memorial_release', 'movie', 'name', 'num_highest_grossing_actors', 'num_oscar_winning_actors', 'num_oscar_winning_directors', 'number_of_screens', 'origin', 'origins', 'oscar_winning_actor', 'oscar_winning_actors_present', 'oscar_winning_director', 'oscar_winning_directors_present', 'production_budget', 'rating', 'release_date', 'review', 'reviews', 'running_time', 'snippet', 'summer_release', 'url', 'weekend_gross']


In [None]:
genres = []
origins = []
ratings = []
actors = []
directors = []
names = []
companies = []
oscar_actors = []
oscar_directors = []
for file in os.listdir(dataset_location):
    if file not in train:
      continue
    file_path = os.path.join(dataset_location, file)
    tree = ET.parse(file_path)
    find(names, 'name', tree)
    find(genres, 'genre', tree)
    find(origins, 'origin', tree)
    find(ratings, 'rating', tree)
    find(actors, 'actor', tree)
    find(directors, 'director', tree)
    find(companies, 'company', tree)
    find(oscar_actors, 'oscar_winning_actor', tree)
    find(oscar_directors, 'oscar_winning_director', tree)

In [None]:
print(len(genres))
print(len(origins))
print(len(ratings))
print(len(actors))
print(len(directors))
print(len(names))
print(len(companies))
print(oscar_actors)
print(oscar_directors)


22
170
9
5308
1116
1147
263
['adrien brody', 'william hurt', 'gwyneth paltrow', 'sissy spacek', 'nicole kidman', 'ben kingsley', 'maggie smith', 'charlize theron', 'sean penn', 'louise fletcher', 'jamie foxx', 'michael douglas', 'jeremy irons', 'meryl streep', 'nicolas cage', 'jane fonda', 'dustin hoffman', 'robert duvall', 'geoffrey rush', 'liza minnelli', 'emma thompson', 'al pacino', 'holly hunter', 'anthony hopkins', 'reese witherspoon', 'halle berry', 'richard dreyfuss', 'hilary swank', 'jon voight', 'julie andrews', 'julie christie', 'kevin spacey', 'sally field', 'daniel day-lewis', 'ellen burstyn', 'denzel washington', 'russell crowe', 'julia roberts', 'diane keaton', 'jodie foster', 'helen mirren', 'shirley maclaine', 'jessica lange', 'kathy bates', 'f. murray abraham', 'jack nicholson', 'tom hanks', 'paul newman', 'forest whitaker', 'helen hunt']
['clint eastwood', 'sydney pollack', 'peter jackson', 'sam mendes', 'jonathan demme', 'robert redford', 'woody allen', 'barry levin

In [None]:
new_origins = {'CzechRepublic'}
while len(origins) > 0:
    new_list = []
    origins.sort(key=lambda x: len(x))
    curr = origins[0]
    origins.pop(0)
    if len(origins) == 0:
        break
    new_origins.add(curr)
    for item in origins:
        if curr in item:
            item.replace(curr, '')
            if len(item) != 0:
                continue
        new_list.append(item)
    origins = new_list
print(len(new_origins))

32


In [None]:
meta_feature_names = ['origin', 'running_time', 'production_budget', 'number_of_screens', 'summer_release',
                  'Memorial_release', 'Christmas_release', 'independence_release', 'labor_release', 
                 'num_highest_grossing_actors', 'num_oscar_winning_directors', 'num_oscar_winning_actors'] + genres + ratings + oscar_actors + oscar_directors
meta_feature_dic = {}
for i, name in enumerate(meta_feature_names):
    meta_feature_dic[meta_feature_names[i]] = i
# print(feature_dic)
numericals = [False for i in range(len(meta_feature_names))]
numericals[meta_feature_dic['running_time']] = True
numericals[meta_feature_dic['production_budget']] = True
numericals[meta_feature_dic['number_of_screens']] = True

In [None]:
def extract(tree, feature_names):
    features = [0 for i in range(len(feature_names))]
    value = [0]
    for elem in tree.iter():
        item = elem.text
        if item is None:
            continue
        item = item.replace('$', '')
        item = item.replace(',', '')
        if elem.tag == 'weekend_gross' and int(item) != 0:
            value[0] = float(item)/1000000
        else:
            for pos, name in enumerate(feature_names):
                if elem.tag == name:
                    if item.isnumeric():
                        if name == 'production_budget':
                            features[pos] = math.log(int(item), 2)
                        else:
                            features[pos] = int(item)
                    else:
                        if name == 'origin':
                            word = 'USA'
                        else:
                            word = 'true'
                        if word in item:
                            features[pos] = 1
                        else:
                            features[pos] = 0
                elif elem.tag == 'genre' and item == name:
                    features[pos] = 1
                elif elem.tag == 'rating' and item == name:
                    features[pos] = 1
    return features, value

In [None]:
import sys
import os
import numpy as np
import nltk
from nltk.util import ngrams  
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams  
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from scipy.stats import pearsonr

In [None]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def create_meta(path, train):
  train_meta = ""
  for file_name in os.listdir(path):
    if file_name not in train:
      continue
    # print(file_name)
    file_path = os.path.join(path, file_name)
    tree = ET.parse(file_path)
    for elem in tree.iter():
      if elem.tag == 'snippet':
        item = elem.text
        item += ' '
        train_meta += item
  return train_meta

In [None]:
def normalize(text):
    raw_words = text.split()
    wordnet_lematizer = WordNetLemmatizer()
    lematizer_words = [wordnet_lematizer.lemmatize(raw_word, pos='v') for raw_word in raw_words]
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '``', '"']
    filtered_words = [word for word in lematizer_words if word not in stopwords.words('english') and word not in english_punctuations]
    return filtered_words

In [None]:
def build_vocabulary(Path, train):
    raw = create_meta(Path, train)
    filtered_words = normalize(raw)
    unigramfdist = nltk.FreqDist()
    bigramfdist = nltk.FreqDist()
    trigramfdist = nltk.FreqDist()
    vocabulary = {}
    tokens = filtered_words
    unigrams = ngrams(tokens, 1)
    bigrams = ngrams(tokens, 2)
    trigrams = ngrams(tokens, 3)
    unigramfdist.update(unigrams)
    bigramfdist.update(bigrams)
    trigramfdist.update(trigrams)
    i = 0
    look_up = {}
    for fredist in [unigramfdist, bigramfdist, trigramfdist]:
        for key in fredist:
            if fredist[key] >= 5:
                vocabulary[key] = i
                look_up[i] = key
                i += 1
    return vocabulary, look_up

In [None]:
vocabulary, look_up = build_vocabulary(dataset_location, train)
print(len(vocabulary))

16483


In [None]:
print(vocabulary)

{('Since',): 0, ('premier',): 1, ('festival',): 2, ('small',): 3, ('masterpiece',): 4, ('one',): 5, ('best',): 6, ('film',): 7, ('around',): 8, ('secure',): 9, ('proper',): 10, ('theatrical',): 11, ('release,',): 12, ('week',): 13, ('single',): 14, ('L.A.',): 15, ('screen',): 16, ('height',): 17, ('crowd',): 18, ('holiday',): 19, ('season',): 20, ('may',): 21, ('exactly',): 22, ('qualify',): 23, ('nevertheless',): 24, ('joyous',): 25, ('happening.',): 26, ('This',): 27, ('laid-back',): 28, ('noir',): 29, ('steep',): 30, ('City',): 31, ('atmosphere',): 32, ('music',): 33, ('culminate',): 34, ('collide',): 35, ('worlds',): 36, ('genuine',): 37, ('virtual',): 38, ('reality.',): 39, ("Isn't",): 40, ('everyone.',): 41, ('It',): 42, ('seem',): 43, ('certain',): 44, ('confound',): 45, ('many',): 46, ('viewers',): 47, ('But',): 48, ('pic',): 49, ('core',): 50, ('critical',): 51, ('find',): 52, ('transfix',): 53, ('and,',): 54, ('ultimately,',): 55, ('deeply',): 56, ('move',): 57, ("film's",): 

In [None]:
def transfer(file_path, vocabulary):
    text = ""
    file_path = os.path.join(file_path)
    tree = ET.parse(file_path)
    for elem in tree.iter():
      if elem.tag == 'snippet':
        item = elem.text
        if item is not None:
          text += item
    unigramfdist = nltk.FreqDist()
    bigramfdist = nltk.FreqDist()
    trigramfdist = nltk.FreqDist()
    filtered_words = normalize(text)
    BOWDj = [0 for index in range(len(vocabulary))]
    tokens = filtered_words
    unigrams = ngrams(tokens, 1)
    bigrams = ngrams(tokens, 2)
    trigrams = ngrams(tokens, 3)
    unigramfdist.update(unigrams)
    bigramfdist.update(bigrams)
    trigramfdist.update(trigrams)
    for fredist in [unigramfdist, bigramfdist, trigramfdist]:
      for key in fredist:
        if key in vocabulary:
            position = vocabulary[key]
            BOWDj[position] = BOWDj[position] + 1
    return BOWDj

In [None]:
def build_matrix(dataset_location, feature_names, vobabulary):
    feature_matrix_train = []
    values_train = []
    feature_matrix_dev = []
    values_dev = []
    feature_matrix_test = []
    values_test = []
    name_encoder = {}
    for file in os.listdir(dataset_location):
        file_path = os.path.join(dataset_location, file)
        tree = ET.parse(file_path)
        name_encoder[tree.getroot().find('name').text] = len(name_encoder)
        features, value = extract(tree, feature_names)
        features.append(1)
        text_feature = transfer(file_path, vocabulary)
        features += text_feature
        if file in train:
            feature_matrix_train.append(features)
            values_train.append(value)
        elif file in dev:
            feature_matrix_dev.append(features)
            values_dev.append(value)
        elif file in test:
            feature_matrix_test.append(features)
            values_test.append(value)
    feature_matrix_train = np.array(feature_matrix_train)
    values_train = np.array(values_train)
    feature_matrix_dev = np.array(feature_matrix_dev)
    values_dev = np.array(values_dev)
    feature_matrix_test = np.array(feature_matrix_test)
    values_test = np.array(values_test)
    print(feature_matrix_train.shape)
    print(values_train.shape)
    return feature_matrix_train, feature_matrix_dev, feature_matrix_test, values_train, values_dev, values_test

In [None]:
feature_matrix_train, feature_matrix_dev, feature_matrix_test, values_train, values_dev, values_test = build_matrix(dataset_location, meta_feature_names, vocabulary)

(1147, 16600)
(1147, 1)


In [None]:
print(feature_matrix_train.shape)
print(feature_matrix_dev.shape)
print(feature_matrix_test.shape)
print(values_train.shape)
print(values_dev.shape)
print(values_test.shape)

(1147, 16600)
(317, 16600)
(254, 16600)
(1147, 1)
(317, 1)
(254, 1)


In [None]:
meta_dimension = len(meta_feature_names) + 1
print(meta_dimension)
all_dimension = feature_matrix_train.shape[1]

meta_only_train = feature_matrix_train[:, 0 : meta_dimension]
print(meta_only_train.shape)
meta_only_dev = feature_matrix_dev[:, 0 : meta_dimension]
print(meta_only_dev.shape)
meta_only_test = feature_matrix_test[:, 0 : meta_dimension]
print(meta_only_test.shape)
text_only_train = feature_matrix_train[:, meta_dimension :]
print(text_only_train.shape)
text_only_dev = feature_matrix_dev[:, meta_dimension :]
print(text_only_dev.shape)
text_only_test = feature_matrix_test[:, meta_dimension :]
print(text_only_test.shape)


117
(1147, 117)
(317, 117)
(254, 117)
(1147, 16483)
(317, 16483)
(254, 16483)


Scipy experiment

In [None]:
print('For meta only:')
regr = linear_model.LinearRegression()
regr.fit(meta_only_train, values_train)

pred = regr.predict(meta_only_test)
print('Mean absolute error: %.2f'
      % mean_absolute_error(values_test, pred))

For meta only:
Mean absolute error: 7.37


In [None]:
print('For text only:')
regr = linear_model.LinearRegression()
regr.fit(text_only_train, values_train)

pred = regr.predict(text_only_test)
print('Mean absolute error: %.2f'
      % mean_absolute_error(values_test, pred))

For text only:
Mean absolute error: 8.86


In [None]:
print('All features only:')
regr = linear_model.LinearRegression()
regr.fit(feature_matrix_train, values_train)

pred = regr.predict(feature_matrix_test)
print('Mean absolute error: %.2f'
      % mean_absolute_error(values_test, pred))

All features only:
Mean absolute error: 6.78


In [None]:
def my_loss(output, target, params, alpha, lambd):
    loss = torch.mean((output - target) ** 2)
    l1_regularization = alpha * torch.norm(params, 1)
    l2_regularization = (1 - alpha) * torch.norm(params, 2) / 2
    loss = loss / 2 + (l1_regularization + l2_regularization) * lambd
    return loss

In [None]:
def MAE(output, target):
    loss = torch.mean(torch.abs(output - target))
    return loss

In [None]:
class linearRegression(torch.nn.Module):
    def __init__(self, inputSize, outputSize):
        super(linearRegression, self).__init__()
        self.linear = torch.nn.Linear(inputSize, outputSize)

    def forward(self, x):
        out = self.linear(x)
        return out

In [None]:
def train_process(model, epochs, alpha, lambd, optimizer, criterion, best_curr_loss, kind):
    if kind == 'meta_only':
        inputs = meta_only_train
        dev_inputs = meta_only_dev
    if kind == 'text_only':
        inputs = text_only_train
        dev_inputs = text_only_dev
    if kind == 'all':
        inputs = feature_matrix_train
        dev_inputs = feature_matrix_dev
    for epoch in range(epochs):
        # Converting inputs and labels to Variable
        if torch.cuda.is_available():
            inputs = Variable(torch.as_tensor(inputs).double().cuda())
            labels = Variable(torch.as_tensor(values_train).cuda())
        else:
            inputs = Variable(torch.as_tensor(inputs)).double()
            labels = Variable(torch.as_tensor(values_train))

        # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
        optimizer.zero_grad()

        # get output from the model, given the inputs
        outputs = model(inputs)

        # get loss for the predicted output
        params = torch.cat([x.view(-1) for x in model.linear.parameters()])

        # loss = my_loss(outputs, labels.double())
        loss = criterion(outputs, labels.double(), params, alpha, lambd)
        
        # get gradients w.r.t to parameters
        loss.backward()

        # update parameters
        optimizer.step()

        # print('epoch {}, loss {}'.format(epoch, loss.item()))

    model.eval()
    prediction = model(torch.from_numpy(dev_inputs).double().cuda())
    final_loss = MAE(prediction, Variable(torch.from_numpy(values_dev)).cuda())
    if final_loss < best_curr_loss or best_curr_loss == -1:
        if kind == 'meta_only':
            torch.save(model.state_dict(), 'best_model_meta_only.pth') 
        if kind == 'text_only':
            torch.save(model.state_dict(), 'best_model_text_only.pth') 
        if kind == 'all':
            torch.save(model.state_dict(), 'best_model_all_features.pth') 
    print('Alpha: ', alpha, '  Lambda: ', lambd, '  Loss: ', final_loss.item())
    return alpha, lambd, final_loss.item()

In [None]:
alphas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
lambdas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

In [None]:
def param_tuning(model, epochs, alphas, lambdas, optimizer, criterion, kind):
    print('-------------------- ' + kind)
    best_alpha = -1
    best_lambda = -1
    best_curr_loss = -1
    for alpha in alphas:
        for lambd in lambdas:
            alpha, lambd, error = train_process(model, epochs, alpha, lambd, optimizer, criterion, best_curr_loss, kind)
            if best_curr_loss == -1 or error < best_curr_loss:
                best_alpha = alpha
                best_lambda = lambd
                best_curr_loss = error
    print('best alpha: ', best_alpha)
    print('best lambda: ', best_lambda)
    if kind == 'meta_only':
        inputs = meta_only_test
        model_path = '/content/best_model_meta_only.pth'
    if kind == 'text_only':
        inputs = text_only_test
        model_path = '/content/best_model_text_only.pth'
    if kind == 'all':
        inputs = feature_matrix_test
        model_path = '/content/best_model_all_features.pth'
    final_model = linearRegression(inputDim, outputDim)
    final_model.double()
    final_model.load_state_dict(torch.load(model_path))
    final_model.cuda()
    prediction = final_model(torch.from_numpy(inputs).double().cuda())
    best_loss = MAE(prediction, Variable(torch.from_numpy(values_test)).cuda())
    corr, _ = pearsonr(prediction.detach().cpu().numpy().reshape(1, -1)[0], values_test.reshape(1, -1)[0])   
    print('Pearson correlation is: ', corr) 
    print('best loss: ', best_loss)

In [None]:
inputDim = len(meta_feature_names) + 1  # takes variable 'x'
outputDim = 1  # takes variable 'y'
learningRate = 0.1
epochs = 2000
model_meta_only = linearRegression(inputDim, outputDim)
model_meta_only.double()
model_meta_only.cuda()
# criterion = torch.nn.MSELoss()
criterion = my_loss
optimizer = torch.optim.Adam(model_meta_only.parameters(), lr=learningRate)
param_tuning(model_meta_only, epochs, alphas, lambdas, optimizer, my_loss, 'meta_only')

-------------------- meta_only
Alpha:  0   Lambda:  0   Loss:  7.048156518111739
Alpha:  0   Lambda:  0.1   Loss:  6.95618289773637
Alpha:  0   Lambda:  0.2   Loss:  6.876083701523413
Alpha:  0   Lambda:  0.3   Loss:  6.809093308560757
Alpha:  0   Lambda:  0.4   Loss:  6.749939481172394
Alpha:  0   Lambda:  0.5   Loss:  6.6975434489867975
Alpha:  0   Lambda:  0.6   Loss:  6.646796871678679
Alpha:  0   Lambda:  0.7   Loss:  6.60325528801285
Alpha:  0   Lambda:  0.8   Loss:  6.553176738658515
Alpha:  0   Lambda:  0.9   Loss:  6.509139195969585
Alpha:  0   Lambda:  1   Loss:  6.469492927494678
Alpha:  0.1   Lambda:  0   Loss:  7.064289848342124
Alpha:  0.1   Lambda:  0.1   Loss:  6.809253649904943
Alpha:  0.1   Lambda:  0.2   Loss:  6.756788358741614
Alpha:  0.1   Lambda:  0.3   Loss:  6.668686150092337
Alpha:  0.1   Lambda:  0.4   Loss:  6.568727735390903
Alpha:  0.1   Lambda:  0.5   Loss:  6.483995513346431
Alpha:  0.1   Lambda:  0.6   Loss:  8.183578478253548
Alpha:  0.1   Lambda:  0.7

In [None]:
inputDim = text_only_train.shape[1]  # takes variable 'x'
outputDim = 1  # takes variable 'y'
learningRate = 0.1
epochs = 2000
model_text_only = linearRegression(inputDim, outputDim)
model_text_only.double()
model_text_only.cuda()
# criterion = torch.nn.MSELoss()
criterion = my_loss
optimizer = torch.optim.Adam(model_text_only.parameters(), lr=learningRate)
param_tuning(model_text_only, epochs, alphas, lambdas, optimizer, my_loss, 'text_only')

-------------------- text_only
Alpha:  0   Lambda:  0   Loss:  7.66491780362297
Alpha:  0   Lambda:  0.1   Loss:  7.74863798023074
Alpha:  0   Lambda:  0.2   Loss:  7.758291363506661
Alpha:  0   Lambda:  0.3   Loss:  7.747303630977184
Alpha:  0   Lambda:  0.4   Loss:  7.736422877962582
Alpha:  0   Lambda:  0.5   Loss:  7.725852079322349
Alpha:  0   Lambda:  0.6   Loss:  7.710510097005941
Alpha:  0   Lambda:  0.7   Loss:  7.705709292317859
Alpha:  0   Lambda:  0.8   Loss:  7.696110279441851
Alpha:  0   Lambda:  0.9   Loss:  7.686713179052083
Alpha:  0   Lambda:  1   Loss:  7.678006631223901
Alpha:  0.1   Lambda:  0   Loss:  7.689372862357093
Alpha:  0.1   Lambda:  0.1   Loss:  8.07357189994393
Alpha:  0.1   Lambda:  0.2   Loss:  7.814567668958223
Alpha:  0.1   Lambda:  0.3   Loss:  7.71311841894245
Alpha:  0.1   Lambda:  0.4   Loss:  7.6594938201447125
Alpha:  0.1   Lambda:  0.5   Loss:  7.610134458926249
Alpha:  0.1   Lambda:  0.6   Loss:  7.574363211231118
Alpha:  0.1   Lambda:  0.7  

In [None]:
inputDim = feature_matrix_train.shape[1] # takes variable 'x'
outputDim = 1  # takes variable 'y'
learningRate = 0.1
epochs = 2000
model_all = linearRegression(inputDim, outputDim)
model_all.double()
model_all.cuda()
# criterion = torch.nn.MSELoss()
criterion = my_loss
optimizer = torch.optim.Adam(model_all.parameters(), lr=learningRate)
param_tuning(model_all, epochs, alphas, lambdas, optimizer, my_loss, 'all')

-------------------- all
Alpha:  0   Lambda:  0   Loss:  5.976104446031311
Alpha:  0   Lambda:  0.1   Loss:  6.191363232919377
Alpha:  0   Lambda:  0.2   Loss:  6.346417083441362
Alpha:  0   Lambda:  0.3   Loss:  6.359629741474113
Alpha:  0   Lambda:  0.4   Loss:  6.373094714277147
Alpha:  0   Lambda:  0.5   Loss:  6.360447088697715
Alpha:  0   Lambda:  0.6   Loss:  6.3351177068045015
Alpha:  0   Lambda:  0.7   Loss:  6.333563202745773
Alpha:  0   Lambda:  0.8   Loss:  6.319992373397465
Alpha:  0   Lambda:  0.9   Loss:  6.305947851885647
Alpha:  0   Lambda:  1   Loss:  6.292364266138719
Alpha:  0.1   Lambda:  0   Loss:  6.321549428495511
Alpha:  0.1   Lambda:  0.1   Loss:  9.279448867524717
Alpha:  0.1   Lambda:  0.2   Loss:  6.798348768263083
Alpha:  0.1   Lambda:  0.3   Loss:  6.694191211408561
Alpha:  0.1   Lambda:  0.4   Loss:  6.853023880057592
Alpha:  0.1   Lambda:  0.5   Loss:  6.191915402126924
Alpha:  0.1   Lambda:  0.6   Loss:  7.120740218038121
Alpha:  0.1   Lambda:  0.7   L

In [None]:
print(model_text_only)
params = torch.cat([x.view(-1) for x in model_text_only.linear.parameters()])
sorted_vals, sorted_ids = params.sort(dim = -1, descending = True)
params = params.data.detach().cpu().numpy()
sorted_vals = sorted_vals.data.detach().cpu().numpy()
i = 0
for id in sorted_ids.data.detach().cpu().numpy():
    print(look_up[id])
    print(sorted_vals[i])
    i += 1
    if i == 200:
        break

linearRegression(
  (linear): Linear(in_features=16483, out_features=1, bias=True)
)
('action',)
2.764412238794156
('enough',)
2.016997607092993
('first',)
1.7448093749629061
('new',)
1.4046528955545552
("that's",)
1.1493639638223139
('still',)
1.1275882906771175
('two',)
1.1156508082378485
('franchise',)
0.8450679448828837
('much',)
0.8078037496354619
('special',)
0.6469048594014427
('fun',)
0.530120849861037
('The',)
0.5254202561879968
('good',)
0.5138548270251371
('even',)
0.43378187087646514
("It's",)
0.3461958394752716
('lot',)
0.27625809557940556
('last',)
0.23453805165799413
('like',)
0.22422225122023293
('CGI',)
0.12613028898473921
('big',)
0.11775867183028227
('time',)
0.11613309740973143
('thing',)
0.11593822300563376
('look',)
0.10702549029498917
("movie's",)
0.10694154188487781
('comic',)
0.09815011686572765
('I',)
0.09396700950573954
('get',)
0.09179908649724183
('turn',)
0.08131179643700005
('star',)
0.08076613114788249
('movie,',)
0.07759004838881851
('original',)
0.0770

In [None]:
print(model_meta_only)
params = torch.cat([x.view(-1) for x in model_meta_only.linear.parameters()])
sorted_vals, sorted_ids = params.sort(dim = -1, descending = True)
params = params.data.detach().cpu().numpy()
sorted_vals = sorted_vals.data.detach().cpu().numpy()
i = 0
for id in sorted_ids.data.detach().cpu().numpy():
  if id < len(meta_feature_names):
    print(meta_feature_names[id])
    print(sorted_vals[i])
    i += 1
    if i == 200:
      break

linearRegression(
  (linear): Linear(in_features=117, out_features=1, bias=True)
)
num_highest_grossing_actors
1.7156672911751494
Fantasy
0.03736803886767432
num_oscar_winning_actors
0.027521453609144146
Romance
0.02332520814584448
FamilyKids
0.02273912462179598
NC-17
0.020195063758447707
Adventure
0.019025689111400724
labor_release
0.01901515552485112
G
0.014987903733880124
Documentary
0.014980476999246525
GayLesbian
0.007467172879805023
Rated
0.006003330412600294
number_of_screens
0.005410034203586612
ron howard
0.005029965729455292
robert benton
0.005029965729455292
milos forman
0.005029965729455292
russell crowe
0.005029965729455292
denzel washington
0.005029965729455292
kevin spacey
0.005029965729455292
steven spielberg
0.005029965729455292
francis ford coppola
0.005029965729455292
mel gibson
0.005029965729455292
robert zemeckis
0.005029965729455292
anthony minghella
0.005029965729455292
barry levinson
0.005029965729455292
halle berry
0.005029965729455292
f. murray abraham
0.00502

In [None]:
print(model_all)
params = torch.cat([x.view(-1) for x in model_all.linear.parameters()])
sorted_vals, sorted_ids = params.sort(dim = -1, descending = True)
params = params.data.detach().cpu().numpy()
sorted_vals = sorted_vals.data.detach().cpu().numpy()
i = 0
for id in sorted_ids.data.detach().cpu().numpy():
  if id < len(meta_feature_names):
    print(meta_feature_names[id])
    print(sorted_vals[i])
    i += 1
    if i == 200:
      break
  if id > len(meta_feature_names):
    print(look_up[id - len(meta_feature_names) - 1])
    print(sorted_vals[i])
    i += 1
    if i == 200:
      break

linearRegression(
  (linear): Linear(in_features=16600, out_features=1, bias=True)
)
num_highest_grossing_actors
1.6763192141884589
('summer',)
0.0645055019908963
('spectacular',)
0.06238032513440212
('there,',)
0.062282669710869085
('extravagant',)
0.05869133152668596
('heartwarming',)
0.056287068264766066
('previous', 'films,')
0.05592296805480347
('flawlessly',)
0.05563711022564122
('That', 'rare')
0.05549252475357771
('pirate',)
0.0545530664035545
('magic.',)
0.054302287205290865
('plot', 'character')
0.053368852648522716
('was,',)
0.052430124657766704
('enervate',)
0.05242908636613111
('incarnation',)
0.05225317160636928
('stylish',)
0.05214992433831758
('protagonists',)
0.05205849868121657
('potential,',)
0.051621845001727625
('This',)
0.05153232149509603
('enjoyably',)
0.05138331246104244
('price', 'admission')
0.051172550148504525
('progressively',)
0.050495147021833706
('make', 'sorry')
0.05048730429429325
('edge',)
0.04880895507846214
('inventions',)
0.0484761166276153
('tele