In [6]:
# Importing packages
import matplotlib.pyplot as plt
import pandas as pd
import ipywidgets as widgets
import numpy as np
import itertools
from ipywidgets import Layout
import seaborn as sns
import math

import time
from scipy import linalg
import scipy.optimize as optimize
import sympy as sm

from tkinter import *
import tkinter as tk

from data_gen import gen_df 


In this project, we try to estimate the ratings of movies. That is, we calculate the utility of each movie given its genres, duration, awards and so forth, and we use this utility to compute a model IMDb rating for each movie. Finally, we minimize the distance between our estimated ratings and the actual ratings.

Our utility function is given by: 
$$ U_i = \sum_{k=1}^{23}(\alpha_k G_{ik}) + \sum_{k=1920}^{2010} (\beta_k D_{ik}) + \gamma N_i + \delta W_i + \rho_1 L_i + \rho_2 L_i^2 $$

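Based on this utility function, we estimate the rating of each movie by mapping the utility into the interval $(0, 10)$ with a scaled logistic function (the factor 10 matches the code below),
$$ R_i^{model} = 10 \cdot \frac{\exp(\omega x_i)}{1 + \exp(\omega x_i)} $$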

where, 
$$ x_i = \big[G_{i,1}, G_{i,2}, \dots, G_{i,23}, D_{i,1920}, D_{i,1930}, \dots, D_{i,2010}, N_i, W_i, L_i, L_i^2 \big] $$
$$ \omega = \big[\alpha_1, \alpha_2, ..., \alpha_{23}, \beta_{1920}, \beta_{1930}, ..., \beta_{2010}, \gamma, \delta, \rho_1, \rho_2  \big] $$

We then use the optimization routines in `scipy.optimize` to solve the following problem:

$$ \min_{\omega} \Big\{ \sum_{i=1}^{n} \left( R_i^{model} - R_i^{data} \right)^2 \Big\} $$


### The following code will optimize for each decade and plot all estimates



In [3]:
# Decades for which a separate set of parameters is estimated.
decade_list = list(range(1920, 2020, 10))

# Explanatory variables, in the column order used for the design matrix.
# NOTE: this name shadows Python's built-in vars(); it is kept because the
# other cells in this notebook refer to the list by this name.
vars = [
    # genre columns
    'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Family', 'Fantasy', 'FilmNoir', 'History',
    'Horror', 'Music', 'Musical', 'Mystery', 'Romance', 'SciFi', 'Short',
    'Sport', 'Thriller', 'War', 'Western',
] + [
    # nomination count, win count and duration
    'nrOfNominations', 'nrOfWins', 'duration',
]


In [4]:
def df_dec(decade, filename='imdb.csv'):
    """Build the design matrix and the observed ratings for one decade.

    The data frame returned by ``gen_df(filename)`` is restricted to rows with
    at least 5000 rating counts, five columns ('Adult', 'GameShow', 'News',
    'RealityTV', 'TalkShow') are removed, and only the rows whose ``decade``
    column equals ``f'{decade}s'`` (e.g. ``'1990s'``) are kept.

    Parameters
    ----------
    decade : int
        Decade to select, e.g. ``1990``.
    filename : str, optional
        File passed on to ``gen_df``. Defaults to ``'imdb.csv'``.

    Returns
    -------
    df_X : pandas.DataFrame
        The 26 explanatory columns (23 genre columns followed by
        'nrOfNominations', 'nrOfWins' and 'duration'), in that order.
    df_Y : pandas.DataFrame
        One column, 'rat_data', holding the values of 'imdbRating'.

    Notes
    -----
    The filtered frame is also stored in the module-level name ``df``. This
    side effect is retained from the earlier implementation so that cells
    relying on it keep working.
    """
    global df

    feature_columns = [
        'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Family', 'Fantasy', 'FilmNoir', 'History',
        'Horror', 'Music', 'Musical', 'Mystery', 'Romance', 'SciFi', 'Short',
        'Sport', 'Thriller', 'War', 'Western',
        'nrOfNominations', 'nrOfWins', 'duration',
    ]

    # Every step returns a new frame, so nothing is modified in place on a
    # filtered slice (which is what triggers pandas' chained-assignment
    # warnings).
    df = (
        gen_df(filename)
        .loc[lambda frame: frame['ratingCount'] >= 5000]
        .drop(columns=['Adult', 'GameShow', 'News', 'RealityTV', 'TalkShow'])
        .loc[lambda frame: frame['decade'] == f'{decade}s']
    )

    # Selecting the explanatory columns directly makes a separate "drop the
    # identifier columns" pass unnecessary: anything not listed is left out.
    df_X = df.reindex(columns=feature_columns)
    df_Y = df[['imdbRating']].rename(columns={'imdbRating': 'rat_data'})

    return df_X, df_Y


In [7]:
def optimizer_dec(decades=None, method='Nelder-Mead', maxiter=50000):
    """Estimate the rating-model parameters separately for each decade.

    For every decade the explanatory columns and observed ratings are taken
    from ``df_dec(decade)``. Starting from a vector of zeros, the parameters
    are chosen to minimise the sum of squared differences between the observed
    rating and the model rating ``10 * exp(u) / (1 + exp(u))``, where ``u`` is
    the explanatory matrix multiplied by the parameter vector.

    Parameters
    ----------
    decades : iterable of int, optional
        Decades to estimate. Defaults to 1920, 1930, ..., 2010.
    method : str, optional
        ``method`` argument of ``scipy.optimize.minimize``.
        Defaults to ``'Nelder-Mead'``.
    maxiter : int, optional
        ``maxiter`` option of ``scipy.optimize.minimize``. Defaults to 50000.

    Returns
    -------
    list of list
        One list of estimated parameters per decade, in the order of
        ``decades``; each inner list follows the column order of the frame
        returned by ``df_dec``.

    Notes
    -----
    ``x0`` and ``evals`` are still written to module level, as in the earlier
    implementation, so that cells reading them keep working.
    """
    global evals
    global x0

    # Local import: scipy.special is not among the notebook's top-level imports.
    from scipy.special import expit

    if decades is None:
        decades = range(1920, 2020, 10)

    def sqr_diff_sum(pars, features, ratings):
        """Sum of squared deviations between model and observed ratings."""
        # expit(u) equals exp(u) / (1 + exp(u)) but does not overflow. With the
        # naive formula, a large u gives inf / inf = NaN, and such rows then
        # silently vanish from a NaN-skipping sum, understating the loss.
        rat_model = 10 * expit(features @ pars)
        # nansum keeps the NaN-skipping behaviour of pandas' Series.sum() for
        # rows with missing data.
        return float(np.nansum((rat_model - ratings) ** 2))

    result = []

    for decade in decades:
        df_X, df_Y = df_dec(decade)

        # Convert once per decade so that the objective works on plain arrays
        # rather than writing DataFrame columns on every evaluation.
        features = df_X.to_numpy(dtype=float, na_value=np.nan)
        ratings = df_Y['rat_data'].to_numpy(dtype=float, na_value=np.nan)

        x0 = [0] * features.shape[1]
        evals = 0

        result_i = optimize.minimize(
            sqr_diff_sum,
            x0,
            args=(features, ratings),
            method=method,
            options={"disp": True, "maxiter": maxiter},  # display the results
        )

        result.append(list(result_i.x))

    return result

# Run the per-decade estimation and report how long it took.
# perf_counter() is a monotonic clock meant for measuring elapsed time, so the
# result is not affected by adjustments of the system clock.
t0 = time.perf_counter()
result_dec = optimizer_dec()
t1 = time.perf_counter()
print(f'{t1-t0:.8} seconds')

Optimization terminated successfully.
         Current function value: 0.681026
         Iterations: 37
         Function evaluations: 1400
         Gradient evaluations: 50
Optimization terminated successfully.
         Current function value: 6.952928
         Iterations: 56
         Function evaluations: 2156
         Gradient evaluations: 77
Optimization terminated successfully.
         Current function value: 18.422256
         Iterations: 57
         Function evaluations: 2268
         Gradient evaluations: 81
Optimization terminated successfully.
         Current function value: 55.539271
         Iterations: 61
         Function evaluations: 2212
         Gradient evaluations: 79
Optimization terminated successfully.
         Current function value: 85.188373
         Iterations: 64
         Function evaluations: 2408
         Gradient evaluations: 86
Optimization terminated successfully.
         Current function value: 125.301583
         Iterations: 58
         Function eva

In [12]:
# Re-arrange the estimates from "one list per decade" (result_dec) into
# "one list per variable": result_dec_mod[j][i] is the estimate for
# variable j in decade i.
result_dec_mod = [
    [result_dec[dec_idx][var_idx] for dec_idx in range(len(decade_list))]
    for var_idx in range(len(vars))
]


    
def fig(var):
    """Draw a bar chart of the per-decade estimates for one variable.

    Parameters
    ----------
    var : str
        Name of the explanatory variable to plot; must be an entry of ``vars``.
    """
    # A single axes: only one set of estimates is plotted, so a second,
    # empty panel would just waste space.
    figure, ax = plt.subplots(figsize=(12, 6))

    # result_dec_mod holds one list of per-decade estimates per variable,
    # in the same order as ``vars``.
    ax.bar(decade_list, result_dec_mod[vars.index(var)], width=6)

    ax.set_xlabel('Decade')
    ax.set_ylabel('Estimated parameter')
    ax.set_title('Nelder-Mead')
    ax.set_xticks(decade_list)
    # Zero line, so positive and negative estimates are easy to tell apart.
    ax.axhline(y=0, color='black', linewidth=1)
    
# Interactive view: choose a variable in the dropdown to redraw the chart.
widgets.interact(
    fig,
    var=widgets.Dropdown(description='Input', value='Action', options=vars),
);

interactive(children=(Dropdown(description='Input', options=('Action', 'Adventure', 'Animation', 'Biography', …

In [52]:
# Position of 'duration' within the list of explanatory variables.
duration_pos = vars.index('duration')
print(duration_pos)

25
