In [1]:
# Importing packages
import matplotlib.pyplot as plt
import pandas as pd
import ipywidgets as widgets
import numpy as np
import itertools
from ipywidgets import Layout
import seaborn as sns
import math

import time
from scipy import linalg
import scipy.optimize as optimize
import sympy as sm

from tkinter import *
import tkinter as tk

from data_gen import gen_df 


In this project, we try to estimate the ratings of movies. That is, we will calculate the utility of a movie given its genres, duration, awards and so forth, and then compute a model IMDb rating for each movie based on this utility. Finally, we minimize the distance between our estimated ratings and the actual ratings.

Our utility function is given by: 
$$ U_i = \sum_{k=1}^{23}(\alpha_k G_{ik}) + \sum_{k=\text{1920s}}^{\text{2010s}} (\beta_k D_{ik}) + \gamma N_i + \delta W_i + \rho L_i $$

Based on this utility function, we estimate the rating of each movie, scaled to lie between 0 and 10,
$$ R_i^{model} = 10 \cdot \frac{\exp(\omega x_i)}{1 + \exp(\omega x_i)} $$

where, 
$$ x_i = \big[G_{i1}, G_{i2}, ..., G_{i,23}, D_{i,1920s}, D_{i,1930s}, ..., D_{i,2010s}, N_i, W_i, L_i \big] $$
$$ \omega = \big[\alpha_1, \alpha_2, ..., \alpha_{23}, \beta_{1920s}, \beta_{1930s}, ..., \beta_{2010s}, \gamma, \delta, \rho  \big] $$

We then use the optimization routines in `scipy.optimize` to solve the following problem:

$$ \min_{\omega} \Big\{ \sum_{i=1}^{n} \left( R_i^{model} - R_i^{data} \right)^2 \Big\} $$


### The following code will optimize for each decade and plot all estimates



In [2]:
# Decades covered by the per-decade estimation, identified by their first year.
decade_list = list(range(1920, 2020, 10))

# Regressor names in parameter order: the 23 genre columns followed by
# the number of nominations, the number of wins and the duration.
vars = (
    'Action Adventure Animation Biography Comedy Crime Documentary '
    'Drama Family Fantasy FilmNoir History Horror Music Musical '
    'Mystery Romance SciFi Short Sport Thriller War Western '
    'nrOfNominations nrOfWins duration'
).split()


In [3]:
def df_dec(decade, filename='imdb.csv', min_rating_count=5000):
    """Build the regressor and rating frames for the movies of one decade.

    The data set is loaded with ``gen_df``, restricted to movies whose
    ``ratingCount`` is at least ``min_rating_count`` and then to the rows
    whose ``decade`` column equals ``f'{decade}s'``.

    Parameters
    ----------
    decade : int
        First year of the decade; ``1990`` selects rows labelled ``'1990s'``.
    filename : str, optional
        File handed to ``gen_df``.
    min_rating_count : int, optional
        Minimum ``ratingCount`` a movie needs in order to be kept.

    Returns
    -------
    df_X : pandas.DataFrame
        The 23 genre columns followed by ``nrOfNominations``, ``nrOfWins``
        and ``duration``, in exactly that order.
    df_Y : pandas.DataFrame
        Single column ``rat_data`` holding ``imdbRating`` for the same rows.

    Notes
    -----
    As a side effect the filtered frame is stored in the module-level ``df``.
    """
    global df  # the filtered frame stays available as the module-level `df`

    columns = ['Action','Adventure','Animation','Biography','Comedy','Crime','Documentary',
               'Drama','Family','Fantasy','FilmNoir','History','Horror','Music','Musical',
               'Mystery','Romance','SciFi','Short','Sport','Thriller','War','Western',
               'nrOfNominations','nrOfWins','duration']

    df = gen_df(filename)

    # Non-inplace drop: the frame coming out of .loc is a filtered slice, and
    # mutating a slice in place is what pandas' copy warnings are about.
    df = df.loc[df['ratingCount'] >= min_rating_count]
    df = df.drop(columns=['Adult','GameShow','News','RealityTV','TalkShow'])

    df = df.loc[df['decade'] == f'{decade}s']

    # reindex both selects and orders the regressors, so every other column
    # (ids, titles, counts, ...) is discarded without dropping it by name.
    df_X = df.reindex(columns, axis=1)
    df_Y = df[['imdbRating']].rename(columns={'imdbRating': 'rat_data'})

    return df_X, df_Y


In [4]:
def optimizer_dec(decades=None):
    """Estimate the rating model separately for every decade.

    For each decade the regressors and observed ratings come from
    ``df_dec``. The parameters minimise the sum of squared differences
    between ``10 * logistic(X @ pars)`` and the observed ratings, using
    Nelder-Mead started at the zero vector.

    Parameters
    ----------
    decades : iterable of int, optional
        Decades to estimate, given by their first year. Defaults to
        1920, 1930, ..., 2010.

    Returns
    -------
    list of list
        One parameter list per decade, in the order of ``decades``. Each
        inner list follows the column order of the frame from ``df_dec``.
    """
    # Module-level names written by this function; kept so that any cell
    # reading them still works.
    global x0
    global evals

    # Function-scope import keeps this block self-contained.
    from scipy.special import expit

    if decades is None:
        decades = [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]

    def sqr_diff_sum(pars, X, y):
        # expit is the logistic function evaluated without overflow;
        # exp(u) / (1 + exp(u)) turns into inf / inf = NaN for large u.
        rat_model = 10 * expit(X @ pars)
        # NaN terms (e.g. rows with missing regressors) are skipped instead of
        # turning the whole objective into NaN.
        return np.nansum((rat_model - y) ** 2)

    result = []

    for decade in decades:
        df_X, df_Y = df_dec(decade)

        # Plain arrays: the objective is evaluated thousands of times, so
        # per-call DataFrame column writes are avoided.
        X = df_X.to_numpy(dtype=float)
        y = df_Y['rat_data'].to_numpy(dtype=float)

        # One starting value per regressor column.
        x0 = [0] * X.shape[1]
        evals = 0

        result_i = optimize.minimize(
            sqr_diff_sum, x0, args=(X, y),
            method='Nelder-Mead',
            options={"disp": True, "maxiter": 50000},  # display the results
        )

        result.append(list(result_i.x))

    return result

# Time the full per-decade estimation. perf_counter is a monotonic clock meant
# for measuring intervals, so system clock adjustments cannot distort it.
t0 = time.perf_counter()
result_dec = optimizer_dec()
t1 = time.perf_counter()
print(f'{t1-t0:.8} seconds')




#optimizer_dec()
# BFGS
#print(result.x)

Optimization terminated successfully.
         Current function value: 5.649208
         Iterations: 10176
         Function evaluations: 12236
Optimization terminated successfully.
         Current function value: 20.618022
         Iterations: 7768
         Function evaluations: 9337
Optimization terminated successfully.
         Current function value: 23.340167
         Iterations: 7874
         Function evaluations: 9474
Optimization terminated successfully.
         Current function value: 81.101081
         Iterations: 11566
         Function evaluations: 13883
Optimization terminated successfully.
         Current function value: 114.676976
         Iterations: 7507
         Function evaluations: 9034
Optimization terminated successfully.
         Current function value: 165.183683
         Iterations: 6818
         Function evaluations: 8234
Optimization terminated successfully.
         Current function value: 401.211190
         Iterations: 7408
         Function evaluations

In [7]:
# Regroup the per-decade estimates by regressor: entry j of result_dec_mod
# lists the estimate for vars[j] for every decade, in decade_list order.
result_dec_mod = [
    [result_dec[dec_pos][var_pos] for dec_pos in range(len(decade_list))]
    for var_pos in range(len(vars))
]
    
def fig(var):
    """Draw a bar chart of one regressor's estimated coefficient per decade.

    Parameters
    ----------
    var : str
        Name of the regressor; must be an entry of the module-level ``vars``.

    The bar heights are read from ``result_dec_mod`` at the position of
    ``var`` in ``vars``, with one bar per entry of ``decade_list``.
    """
    _, ax1 = plt.subplots(figsize=(8, 6))

    ax1.bar(decade_list, result_dec_mod[vars.index(var)], width=6)

    ax1.set_xlabel('Decade')
    ax1.set_ylabel('Estimated coefficient')
    ax1.set_title(f'Estimated coefficient of {var} by decade')
    ax1.set_xticks(decade_list)
    # Zero line makes the sign of each estimate easy to read.
    ax1.axhline(y=0, color='black', linewidth=1)
    
# Dropdown over every regressor in `vars`; each selection calls fig() again.
widgets.interact(
    fig,
    var=widgets.Dropdown(options=vars, value='Action', description='Input'),
);

interactive(children=(Dropdown(description='Input', options=('Action', 'Adventure', 'Animation', 'Biography', …

## The following code produces a plot with the estimate for each parameter

In [15]:
## From Seb's code

def df_s(keep_top=None, filename='imdb.csv', min_rating_count=5000):
    """Build the regressor and rating frames for the pooled model.

    The data set is loaded with ``gen_df``, one 0/1 column per decade is
    added, and movies whose ``ratingCount`` is below ``min_rating_count``
    are removed. Optionally only the highest-rated movies are kept.

    Parameters
    ----------
    keep_top : int or None, optional
        When given, the rows are sorted by ``imdbRating`` in descending
        order and only the first ``keep_top`` are kept. ``None`` keeps all.
    filename : str, optional
        File handed to ``gen_df``.
    min_rating_count : int, optional
        Minimum ``ratingCount`` a movie needs in order to be kept.

    Returns
    -------
    df_X : pandas.DataFrame
        The 23 genre columns, the ten ``decade_<year>`` columns, then
        ``nrOfNominations``, ``nrOfWins`` and ``duration``, in that order.
    df_Y : pandas.DataFrame
        Single column ``rat_data`` holding ``imdbRating`` for the same rows.

    Notes
    -----
    As a side effect the filtered frame is stored in the module-level ``df``
    and the module-level ``decade_list`` is (re)assigned.
    """
    global df
    global decade_list

    decade_list = [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]

    columns = ['Action','Adventure','Animation','Biography','Comedy','Crime','Documentary',
               'Drama','Family','Fantasy','FilmNoir','History','Horror','Music','Musical',
               'Mystery','Romance','SciFi','Short','Sport','Thriller','War','Western',
               'decade_1920','decade_1930','decade_1940','decade_1950','decade_1960',
               'decade_1970','decade_1980','decade_1990','decade_2000','decade_2010',
               'nrOfNominations','nrOfWins','duration']

    df = gen_df(filename)

    # Decade dummies: 1 when the row's `decade` label is e.g. '1990s', else 0.
    for first_year in decade_list:
        df[f'decade_{first_year}'] = (df['decade'] == f'{first_year}s').astype(int)

    # Non-inplace drop: the frame coming out of .loc is a filtered slice.
    df = df.loc[df['ratingCount'] >= min_rating_count]
    df = df.drop(columns=['Adult','GameShow','News','RealityTV','TalkShow'])

    if keep_top is not None:
        df = df.sort_values('imdbRating', ascending=False).iloc[:keep_top]

    # reindex both selects and orders the regressors, so every other column
    # is discarded without having to drop it by name.
    df_X = df.reindex(columns, axis=1)
    df_Y = df[['imdbRating']].rename(columns={'imdbRating': 'rat_data'})

    return df_X, df_Y

In [33]:
## From Seb's code

# Regressor names of the pooled model in parameter order: the genre columns,
# one dummy per decade, then nominations, wins and duration.
vars2 = [
    *('Action Adventure Animation Biography Comedy Crime Documentary '
      'Drama Family Fantasy FilmNoir History Horror Music Musical '
      'Mystery Romance SciFi Short Sport Thriller War Western').split(),
    *(f'decade_{first_year}' for first_year in range(1920, 2020, 10)),
    'nrOfNominations', 'nrOfWins', 'duration',
]



def optimizer(keep_top=None):
    """Estimate the pooled rating model with decade dummies.

    The regressors and observed ratings come from ``df_s(keep_top)``. The
    parameters minimise the sum of squared differences between
    ``10 * logistic(X @ pars)`` and the observed ratings, using Nelder-Mead
    started at the zero vector.

    Parameters
    ----------
    keep_top : int or None, optional
        Forwarded to ``df_s``.

    Returns
    -------
    scipy.optimize.OptimizeResult
        The result of ``optimize.minimize``; ``.x`` follows the column order
        of the frame from ``df_s``.
    """
    # Module-level names written by this function; kept so that any cell
    # reading them still works.
    global x0
    global evals

    # Function-scope import keeps this block self-contained.
    from scipy.special import expit

    def sqr_diff_sum(pars, X, y):
        # expit is the logistic function evaluated without overflow;
        # exp(u) / (1 + exp(u)) turns into inf / inf = NaN for large u.
        rat_model = 10 * expit(X @ pars)
        # NaN terms (e.g. rows with missing regressors) are skipped instead of
        # turning the whole objective into NaN.
        return np.nansum((rat_model - y) ** 2)

    df_X, df_Y = df_s(keep_top)

    # Plain arrays: the objective is evaluated thousands of times, so
    # per-call DataFrame column writes are avoided.
    X = df_X.to_numpy(dtype=float)
    y = df_Y['rat_data'].to_numpy(dtype=float)

    # One starting value per regressor column.
    x0 = [0] * X.shape[1]
    evals = 0

    result = optimize.minimize(
        sqr_diff_sum, x0, args=(X, y),
        method="Nelder-Mead",
        options={"disp": True, "maxiter": 50000},  # display the results
    )

    return result

# One estimation per sample: all movies (None), then the 500, 1000 and 2000
# highest-rated ones. Only the parameter vector of each result is kept.
results = [optimizer(keep_top=top_n).x for top_n in [None, 500, 1000, 2000]]

#print(results)
#print(results[1])

Optimization terminated successfully.
         Current function value: 3735.382286
         Iterations: 16566
         Function evaluations: 19203
Optimization terminated successfully.
         Current function value: 213.265323
         Iterations: 17235
         Function evaluations: 19895
Optimization terminated successfully.
         Current function value: 582.831479
         Iterations: 16138
         Function evaluations: 18644
Optimization terminated successfully.
         Current function value: 895.336144
         Iterations: 19477
         Function evaluations: 22457
[array([-0.02249121, -0.03089684,  0.17149985, -0.17152826, -0.02084888,
       -0.03048913,  0.11009148,  0.12785315, -0.06051079, -0.32745048,
        0.02199167, -0.07578793, -0.02139979, -0.03056553, -0.08347352,
       -0.09390543, -0.03941612, -0.09520167,  0.12192359, -0.06356771,
       -0.06100096,  0.19041382,  0.03038359, -0.04148942,  0.12249617,
        0.17989531, -0.03093947,  0.30781892,  0.09645

In [51]:
def fig_2(val):
    """Compare the estimates for one sample with the full-sample estimates.

    Parameters
    ----------
    val : int or None
        Entry of the module-level ``options``; selects which element of
        ``results`` is drawn as bars. ``None`` is the full sample.

    The bars show ``results[options.index(val)]`` and the diamond markers
    show ``results[0]``, both against the regressor names in ``vars2``.
    """
    _, ax1 = plt.subplots(figsize=(12, 6))

    sample = 'full sample' if val is None else f'top {val} movies by rating'

    ax1.bar(vars2, results[options.index(val)], label=f'Estimate ({sample})')
    # Diamonds always show the first entry of `results` as a reference.
    ax1.scatter(vars2, results[0], marker='D', s=15, zorder=2,
                label='Estimate (full sample)')

    ax1.set_xlabel('Regressor')
    ax1.set_ylabel('Estimated coefficient')
    ax1.set_title(f'Parameter estimates: {sample}')
    ax1.set_ylim([-0.7, 0.7])
    ax1.axhline(y=0, color='black', linewidth=1)
    # Regressor names are long; rotate them so they do not overlap.
    for tick in ax1.get_xticklabels():
        tick.set_rotation(90)
    ax1.legend()
        
# Samples offered in the dropdown, in the same order as the entries of
# `results`; None stands for the full sample.
options = [None, 500, 1000, 2000]

widgets.interact(
    fig_2,
    val=widgets.Dropdown(description='Keep top', value=None, options=options),
);

interactive(children=(Dropdown(description='TBA', options=(None, 500, 1000, 2000), value=None), Output()), _do…