In [53]:
# Importing packages
import matplotlib.pyplot as plt
import pandas as pd
import ipywidgets as widgets
import numpy as np
import itertools
from ipywidgets import Layout
import seaborn as sns
import math

import time
from scipy import linalg
import scipy.optimize as optimize
import sympy as sm

from tkinter import *
import tkinter as tk


In [54]:
## Importing and cleaning the data 

filename = 'imdb.csv'

## Defining the function to import and clean the data
def gen_df(filename):
    """ Returns a pandas dataframe containing information about movies from the IMDB database 

    Args:
    filename (csv-file): The csv-file have to be the file imdb.csv

    Returns:
    A pandas dataframe

    Notice:
    The function will not work if other files than imdb.csv are used as input """
        
    # Get .csv.file
    data = pd.read_csv(filename, sep=';', encoding='latin-1', escapechar='\\')

    # Read file into pandas dataframe
    df = pd.DataFrame(data)

    # Drop unwanted columns
    for i in range(44,48):
        df.drop(columns=[f'Unnamed: {i}'], inplace=True)
    df.drop(columns=['fn','wordsInTitle','url'], inplace=True)

    # Keep only observations of 'movie'-type
    I = df['type'] == 'video.movie'
    df = df.loc[I]
    df.drop(columns=['type'], inplace=True)

    # Drop movies with missing observations
    df.dropna(inplace=True)

    # During the importing process, the first decimal has become 0 for all movies.
    # Thus, we replace "0"'s with ""
    df['imdbRating'] = df['imdbRating'].astype(str)
    df['imdbRating'].replace(regex=True, inplace=True,to_replace='0',value='')
    df['imdbRating'] = df['imdbRating'].astype(float)

    # Transform duration from seconds to hours
    df['duration'] = df['duration']/60**2

    # Drop years before 1920 and the year 2014 because of few obervations
    I = (df['year']>=1920) & (df['year']<=2013)
    df = df.loc[I]

    # Change the type of 'year' to integer
    df['year'] = df['year'].astype(int)    
    
    # Sort observations and reset index
    df.sort_values('year', inplace=True)
    df.reset_index(inplace=True)

    # Generating a variable that shows the decade (as a string)
    year_list = [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]

    df['decade'] = ''
    for i,start in enumerate(year_list):
        end = start+10
        df.loc[(df['year'] >= start) & (df['year'] < end), 'decade'] = f'{year_list[i]}s'

    return df

df = gen_df(filename)

In this project, we want to try to estimate the ratings of movies. That is, we well calculate the utility of a movie, given genres, duration, awards and so forth, and we will try to calculate the IMDB ratings for each movie, based on this utility. Finally, we will then minimize the distance between our estimated ratings and the actual ratings. 

Our utility function is given by: 
$$ U_i = \sum_{k=1}^{23}(\alpha_k G_{ik}) + \sum_{k=1920}^{2010} (\beta_k D_{ik}) + \gamma N_i + \delta W_i + \rho_1 L_i + \rho_2 L_i^2 $$. 

Based on this utility function, we will estimate the ratings of each movies,
$$ R_i^{model} = \frac{\exp(\omega x_i)}{1 + \exp(\omega x_i)} $$ 

where, 
$$ x_i = \big[G_1, G_2, ..., G_n, D_{1920}, D_{1930}, ..., D_{2010}, N_i, W_i, L_i, L_i^2 \big] $$
$$ \omega = \big[\alpha_1, \alpha_2, ..., \alpha_{23}, \beta_{1920}, \beta_{1930}, \beta_{2010}, \gamma, \delta, \rho_1, \rho_2  \big] $$

We then use optimize methods to solve the following: 

$$ \min_{\omega} \Big\{ \sum_{i=1}^{n} \left( R_i^{model} - R_i^{data} \right)^2 \Big\} $$


In [75]:
# Parameters
alphas = np.random.uniform(0,1,3)
betas = [1,2,3]

# Score-function for testing
def u_fun(xs,pars):
        util = xs.iloc[:,0]*pars[0]+xs.iloc[:,1]*pars[1]+xs.iloc[:,29]**pars[2]
        return 10*np.exp(util)/(1+np.exp(util))

# Building dataframe containing values for score-function
df_xs = pd.DataFrame()
df_xs = df.iloc[:,13:41]
df_xs['Duration'] = df['duration']
df_xs['Rating'] = df['imdbRating']

# Function that calculates minimum squared distance
def obj_fun(df_xs,pars):
        df_xs['Est_rating'] = u_fun(df_xs,pars)
        df_xs['Difference'] = (df_xs['Rating']-df_xs['Est_rating'])**2
        print(df_xs[['Est_rating','Rating','Difference']].head())
        return df_xs['Difference'].sum()

# Testing from here
obj_fun(df_xs,alphas)
obj_fun(df_xs,[0,0,0])
"""
min_fun = lambda a: obj_fun(df_xs,a)

bnds = ((0,1),(0,1),(0,1))

result = optimize.minimize(min_fun,alphas,method='SLSQP',bounds=bnds)

print(alphas,result.x)

print(obj_fun(df_xs,alphas),obj_fun(df_xs,result.x))

"""

"""
n = 300

#A1 = np.random.uniform(1,20,size=n)
#A2 = np.linspace(0,10,n)
#A3 = np.linspace(0,10,n)
start = time.time()

A1_grid, A2_grid, A3_grid = np.meshgrid(A1,A2,A3,indexing='ij')
A=A1_grid+A2_grid+A3_grid
print(np.max(A))

end = time.time()-start

print(f'{end:3.f} seconds')

iterator = itertools.combinations(A1, 3)

high = 0
time_list = list()

for k in range(10):

        start = time.time()

        iterator = itertools.combinations(enumerate(A1), 3)
        for i in iterator:
            print(i,i[0:],i[1:])
            
            test1 = i[1][0]/i[1][1]+i[1][2]
            test2 = i[0]*i[1]*i[2]
            
        if test2 < 20:
                if test1 > high:
                        high = test1
                        high_i = i
                        
        if j%100000 == 0:
                part = time.time()-start
                print(f'Iteratrion: {j} after {part:.2f} seconds')
        
        end = time.time()-start
        time_list.append(end)

avg = sum(time_list)/len(time_list)
print(f'{avg:.3f} seconds')

#print(f'{end:.3f} seconds, highest value is {high:.3f} with i: {high_i}')

for k in range(1,4):
    start = time.time()

    list_1 = np.random.uniform(1,20,size=n)
    list_2 = np.random.uniform(1,5,size=n)

    iterator = itertools.combinations(enumerate(list_2), 3)

    hi = 0
    indeces = list()

    for i in iterator:
        duration = sum([j[1] for j in i])        
        if duration >= 12 and duration <= 14:
            score = sum([list_1[j[0]] for j in i])
            if score > hi:
                hi = score
                dur = duration
                indeces = [j[0] for j in i]

    end = time.time() - start

    text = f'Highest value is {hi:.2f} \n'
    text += f'The duration is {dur:.2f} \n'
    text += f'This took {end:.1f} seconds'

    print(f'Iteration: {k}')
    print(text)
"""
    


   Est_rating  Rating  Difference
0    9.683175     7.6    4.339620
1    9.721988     8.1    2.630846
2    9.729072     8.2    2.338062
3    9.721988     8.1    2.630846
4    9.637913     7.1    6.441000
   Est_rating  Rating  Difference
0    7.310586     7.6    0.083761
1    7.310586     8.1    0.623175
2    7.310586     8.2    0.791058
3    7.310586     8.1    0.623175
4    7.310586     7.1    0.044346


"\nn = 300\n\n#A1 = np.random.uniform(1,20,size=n)\n#A2 = np.linspace(0,10,n)\n#A3 = np.linspace(0,10,n)\nstart = time.time()\n\nA1_grid, A2_grid, A3_grid = np.meshgrid(A1,A2,A3,indexing='ij')\nA=A1_grid+A2_grid+A3_grid\nprint(np.max(A))\n\nend = time.time()-start\n\nprint(f'{end:3.f} seconds')\n\niterator = itertools.combinations(A1, 3)\n\nhigh = 0\ntime_list = list()\n\nfor k in range(10):\n\n        start = time.time()\n\n        iterator = itertools.combinations(enumerate(A1), 3)\n        for i in iterator:\n            print(i,i[0:],i[1:])\n            \n            test1 = i[1][0]/i[1][1]+i[1][2]\n            test2 = i[0]*i[1]*i[2]\n            \n        if test2 < 20:\n                if test1 > high:\n                        high = test1\n                        high_i = i\n                        \n        if j%100000 == 0:\n                part = time.time()-start\n                print(f'Iteratrion: {j} after {part:.2f} seconds')\n        \n        end = time.time()-start\n 

In [79]:
# Scipy minimize
evals = 0
x0 = [0,0,0] 
result = optimize.minimize(obj_fun(df_xs,alphas),x0,
                           method='Nelder-Mead',
                           callback=collect, # call the collect() before each iteration
                           options={'disp':True}) # display the results


def zerolistmaker(n):
    listofzeros = [0] * n
    return listofzeros

init = zerolistmaker(3)

def diff(x):
    return obj_fun(df_xs, x)

   Est_rating  Rating  Difference
0    9.683175     7.6    4.339620
1    9.721988     8.1    2.630846
2    9.729072     8.2    2.338062
3    9.721988     8.1    2.630846
4    9.637913     7.1    6.441000


TypeError: 'numpy.float64' object is not callable

In [None]:
# Figure ideas 

""" 
Two figures: 
    Left figure: 
        Two curves, one which is actual average rating, while the other is estimated rating. 
        Each observation on the estimated ratings curve is an estimate from our score function. 
        By each iteration, we wish to observe a decreased gap between the two functions. 
    Right figure: 
        Histogram of the squared errors. Each bin should be a new iteration, and preferably, we would 
        see a decreasing bin height, as the squared errors converges towards a minimum. 
"""


