In [3]:
# Importing packages
import matplotlib.pyplot as plt
import pandas as pd
import ipywidgets as widgets
import numpy as np
import itertools
from ipywidgets import Layout
import seaborn as sns
import math

import time
from scipy import linalg
import scipy.optimize as optimize
import sympy as sm

from tkinter import *
import tkinter as tk


In [4]:
## Importing and cleaning the data 

filename = 'imdb.csv'

## Defining the function to import and clean the data
def gen_df(filename):
    """ Returns a cleaned pandas dataframe containing information about movies from the IMDB database 

    Args:
    filename (csv-file): The csv-file have to be the file imdb.csv (semicolon separated, latin-1 encoded)

    Returns:
    A pandas dataframe that holds only rows of type 'video.movie' without missing
    values from the years 1920-2013, sorted by year. 'duration' is measured in hours,
    'imdbRating' has its zeros stripped, the row labels from before the sort are kept
    in the column 'index', and 'decade' holds the decade as a string (e.g. '1990s')

    Notice:
    The function will not work if other files than imdb.csv are used as input """

    # Read the .csv-file (read_csv already returns a pandas dataframe)
    df = pd.read_csv(filename, sep=';', encoding='latin-1', escapechar='\\')

    # Drop unwanted columns
    unwanted = [f'Unnamed: {i}' for i in range(44, 48)] + ['fn', 'wordsInTitle', 'url']
    df = df.drop(columns=unwanted)

    # Keep only observations of 'movie'-type
    df = df.loc[df['type'] == 'video.movie'].drop(columns=['type'])

    # Drop movies with missing observations
    df = df.dropna()

    # During the importing process, the first decimal has become 0 for all movies.
    # Thus, we replace "0"'s with "". The result is assigned back to the column:
    # an in-place replace on df['imdbRating'] works on a temporary object and is
    # silently lost when pandas runs with copy-on-write.
    # NOTE(review): every "0" is removed, so a rating such as 10.0 would become 1.0
    # - confirm that no such rating occurs in imdb.csv
    df['imdbRating'] = (df['imdbRating'].astype(str)
                        .str.replace('0', '', regex=False)
                        .astype(float))

    # Transform duration from seconds to hours
    df['duration'] = df['duration'] / 60**2

    # Drop years before 1920 and the year 2014 because of few obervations
    # (.copy() so that the assignments below act on an independent dataframe)
    in_range = (df['year'] >= 1920) & (df['year'] <= 2013)
    df = df.loc[in_range].copy()

    # Change the type of 'year' to integer
    df['year'] = df['year'].astype(int)

    # Sort observations and reset index (the old row labels end up in the column 'index')
    df = df.sort_values('year').reset_index()

    # Generating a variable that shows the decade (as a string), e.g. 1995 -> '1990s'
    df['decade'] = (df['year'] // 10 * 10).astype(str) + 's'

    return df

# Build the cleaned movie dataframe from the csv-file named in `filename`
df = gen_df(filename)

In this project, we try to estimate the ratings of movies. That is, we will calculate the utility of a movie given its genres, decade, awards, duration and so forth, and use this utility to compute a model IMDB rating for each movie. Finally, we minimize the distance between our estimated ratings and the actual ratings. 

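Our utility function is given by: 
$$ U_i = \sum_{k=1}^{23} \alpha_k G_{ik} + \sum_{k \in \{1920, 1930, \dots, 2010\}} \beta_k D_{ik} + \gamma N_i + \delta W_i + \rho_1 L_i + \rho_2 L_i^2 $$

Here $G_{ik}$ are genre dummies, $D_{ik}$ are decade dummies, $N_i$ is the number of nominations, $W_i$ is the number of wins and $L_i$ is the duration of movie $i$. 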

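Based on this utility function, we estimate the rating of each movie as
$$ R_i^{model} = 10 \cdot \frac{\exp(\omega x_i)}{1 + \exp(\omega x_i)} $$ 

so that $U_i = \omega x_i$, and the factor 10 (also used in the code below) scales the model rating to lie between 0 and 10. 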

where, 
$$ x_i = \big[G_{i1}, G_{i2}, \dots, G_{i,23}, D_{i,1920}, D_{i,1930}, \dots, D_{i,2010}, N_i, W_i, L_i, L_i^2 \big] $$
$$ \omega = \big[\alpha_1, \alpha_2, \dots, \alpha_{23}, \beta_{1920}, \beta_{1930}, \dots, \beta_{2010}, \gamma, \delta, \rho_1, \rho_2  \big] $$

We then use numerical optimization to solve the following problem: 

$$ \min_{\omega} \Big\{ \sum_{i=1}^{n} \left( R_i^{model} - R_i^{data} \right)^2 \Big\} $$


In [5]:
import numpy as np
import pandas as pd
import itertools
import time
import scipy.optimize as optimize
# NOTE(review): this rebinds the gen_df defined in the earlier cell to the version
# in data_gen.py - presumably the same function; confirm the two stay in sync
from data_gen import gen_df 

filename = 'imdb.csv'

df = gen_df(filename)

decade_list = [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]

# Decade dummies: 1 when the movie's 'decade' label is e.g. '1990s', otherwise 0
for i in decade_list:
    df[f'decade_{i}'] = (df['decade'] == f'{i}s').astype(int)

# Keep movies with at least 5000 ratings and drop the genre columns that are not
# part of the model. Chained, non-mutating calls avoid modifying a slice in place.
df = (df.loc[df['ratingCount'] >= 5000]
        .drop(columns=['Adult','GameShow','News','RealityTV','TalkShow']))

# TODO: the squared duration term is currently disabled
# df['duration_sqr'] = df['duration']*df['duration']

# Observed ratings, stored under the name used by the model code
df_Y = df[['imdbRating']].rename(columns={'imdbRating': 'rat_data'})

# Columns that are not explanatory variables
drops = ['index', 'tid', 'title', 'imdbRating', 'ratingCount', 'year', 
         'nrOfGenre', 'nrOfPhotos', 'nrOfNewsArticles',  'nrOfUserReviews', 'decade']

# Explanatory variables in the order of the parameter vector:
# 23 genre dummies, 10 decade dummies, nominations, wins, duration
feature_cols = ['Action','Adventure','Animation','Biography','Comedy','Crime','Documentary',
                'Drama','Family','Fantasy','FilmNoir','History','Horror','Music','Musical',
                'Mystery','Romance','SciFi','Short','Sport','Thriller','War','Western',
                'decade_1920','decade_1930','decade_1940','decade_1950','decade_1960',
                'decade_1970','decade_1980','decade_1990','decade_2000','decade_2010',
                'nrOfNominations','nrOfWins','duration']

# Rearrange columns. Strict label selection raises a KeyError on a missing or
# misspelled column, whereas reindex would silently insert an all-NaN column.
df_X = df.drop(columns=drops).loc[:, feature_cols]






In [10]:
# Evaluate the model once at a random parameter draw so that df_Y gets the columns
# 'util', 'rat_model' and 'sqr_diff'. The generator is seeded to make the draw
# reproducible, and the draw has one weight per column of df_X.
omegas = np.random.default_rng(0).uniform(0, 1, df_X.shape[1])


df_Y['util'] = df_X@omegas
df_Y['rat_model'] = 10*np.exp(df_Y['util'])/(1+np.exp(df_Y['util']))
df_Y['sqr_diff'] = (df_Y['rat_model']-df_Y['rat_data'])**2

def sqr_diff_sum(df_X,pars):
    """ Returns the sum of squared differences between model ratings and data ratings

    Args:
    df_X (dataframe): Explanatory variables, one row per movie
    pars (array-like): Parameter vector with one weight per column of df_X

    Returns:
    The sum of the column 'sqr_diff' of df_Y

    Notice:
    The function overwrites the columns 'rat_model' and 'sqr_diff' of the global
    dataframe df_Y on every call and reads its column 'rat_data' """
    util = df_X@pars
    # 10*exp(u)/(1+exp(u)) rewritten as 10*exp(-log(1+exp(-u))): the same value,
    # but large u no longer produces inf/inf = NaN
    df_Y['rat_model'] = 10*np.exp(-np.logaddexp(0, -util))
    df_Y['sqr_diff'] = (df_Y['rat_model']-df_Y['rat_data'])**2
    return df_Y['sqr_diff'].sum()



#print(df_X.head())
#print(df_Y.head())

def collect(x):
    """ Records the value of the objective function at the point x

    On the first call (evals == 0) the global list fs is created with the objective
    value at the starting point x0. Every call then appends test(x) to fs and
    increases the global counter evals by one.

    Args:
    x (array-like): Parameter vector at which the objective `test` is evaluated

    Notice:
    The function relies on the globals test, x0 and evals being defined, and each
    call costs one extra evaluation of the objective. It has the shape of an
    optimizer callback, but the visible optimize.minimize call does not pass it. """
    global fs
    global evals

    if evals == 0:
        # test is a function: call it and wrap the value in a list (test[x0] raised a TypeError)
        fs = [test(x0)]

    fs.append(test(x))

    evals += 1

# Scipy minimize
evals = 0
def zeros(n): 
    """ Returns a list with n integer zeros (an empty list if n is zero or negative) """
    return [0 for _ in range(n)]

def test(x):
    """ Objective for the optimizer: sum of squared rating differences at the parameters x """
    return sqr_diff_sum(df_X, x)

# Start the search with all 36 parameters at zero
x0 = zeros(36)
nm_options = {"disp": True, "maxiter": 1000000}  # display the results
result = optimize.minimize(test, x0, method="Nelder-Mead", options=nm_options)


Optimization terminated successfully.
         Current function value: 3735.382286
         Iterations: 16566
         Function evaluations: 19203


In [11]:
# Parameter names, listed in the order used when the columns of df_X were arranged
liste = [
    'Action','Adventure','Animation','Biography','Comedy','Crime','Documentary',
    'Drama','Family','Fantasy','FilmNoir','History','Horror','Music','Musical',
    'Mystery','Romance','SciFi','Short','Sport','Thriller','War','Western',
    'decade_1920','decade_1930','decade_1940','decade_1950','decade_1960',
    'decade_1970','decade_1980','decade_1990','decade_2000','decade_2010',
    'nrOfNominations','nrOfWins','duration',
]

# Pair each parameter name with its estimate, formatted with two decimals
ditce = {name: f'{result.x[pos]:.2f}' for pos, name in enumerate(liste)}

ditce

{'Action': '-0.02',
 'Adventure': '-0.03',
 'Animation': '0.17',
 'Biography': '-0.17',
 'Comedy': '-0.02',
 'Crime': '-0.03',
 'Documentary': '0.11',
 'Drama': '0.13',
 'Family': '-0.06',
 'Fantasy': '-0.33',
 'FilmNoir': '0.02',
 'History': '-0.08',
 'Horror': '-0.02',
 'Music': '-0.03',
 'Musical': '-0.08',
 'Mystery': '-0.09',
 'Romance': '-0.04',
 'SciFi': '-0.10',
 'Short': '0.12',
 'Sport': '-0.06',
 'Thriller': '-0.06',
 'War': '0.19',
 'Western': '0.03',
 'decade_1920': '-0.04',
 'decade_1930': '0.12',
 'decade_1940': '0.18',
 'decade_1950': '-0.03',
 'decade_1960': '0.31',
 'decade_1970': '0.10',
 'decade_1980': '0.01',
 'decade_1990': '-0.22',
 'decade_2000': '-0.20',
 'decade_2010': '-0.21',
 'nrOfNominations': '0.00',
 'nrOfWins': '0.02',
 'duration': '0.45'}

In [None]:
# Arrange the columns of df_X as genres, decades, nominations, wins, duration
# (the same column list as in the cell that builds df_X)
df_X = df_X.reindex(columns=[
    'Action','Adventure','Animation','Biography','Comedy','Crime','Documentary',
    'Drama','Family','Fantasy','FilmNoir','History','Horror','Music','Musical',
    'Mystery','Romance','SciFi','Short','Sport','Thriller','War','Western',
    'decade_1920','decade_1930','decade_1940','decade_1950','decade_1960',
    'decade_1970','decade_1980','decade_1990','decade_2000','decade_2010',
    'nrOfNominations','nrOfWins','duration',
])

# Figure ideas 

""" 
Two figures: 
    Left figure: 
        Two curves: one is the actual average rating, the other is the estimated rating. 
        Each observation on the estimated-ratings curve is an estimate from our score function. 
        With each iteration, we wish to observe a decreasing gap between the two curves. 
    Right figure: 
        Histogram of the squared errors. Each bin should be a new iteration, and preferably, we would 
        see a decreasing bin height, as the squared errors converge towards a minimum. 
"""


