In [7]:
import numpy as np
import pandas as pd
import itertools
import time
import scipy.optimize as optimize




In [8]:
filename = 'imdb.csv'

def gen_df(filename):
    """Load the IMDb CSV export and return a cleaned DataFrame of movies.

    The file is read with ``;`` as separator, latin-1 encoding and a
    backslash escape character. Cleaning then proceeds in this order:

    1. Drop the columns ``Unnamed: 44`` .. ``Unnamed: 47`` together with
       ``fn``, ``wordsInTitle`` and ``url``.
    2. Keep only rows whose ``type`` equals ``'video.movie'``, then drop
       the ``type`` column.
    3. Drop every row that has a missing value in any remaining column.
    4. Remove every ``'0'`` character from the textual form of
       ``imdbRating`` and convert the result back to float.
    5. Divide ``duration`` by 3600.
    6. Keep only years 1920 through 2013 (inclusive) and cast ``year``
       to int.
    7. Sort by ``year`` and reset the index; the previous row labels are
       kept as a column named ``index``.
    8. Add a string column ``decade`` such as ``'1920s'``.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, sorted by ``year`` with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If one of the columns that are dropped or filtered on is missing
        from the file.
    """
    # read_csv already returns a DataFrame, so no extra wrapping is needed.
    raw = pd.read_csv(filename, sep=';', encoding='latin-1', escapechar='\\')

    # Drop unwanted columns in one call.
    unwanted = [f'Unnamed: {i}' for i in range(44, 48)]
    unwanted += ['fn', 'wordsInTitle', 'url']

    # Keep only movie-type observations without missing data. The explicit
    # copy guarantees that the column assignments below operate on an
    # independent frame rather than on a slice of `raw`.
    movies = (
        raw.drop(columns=unwanted)
        .loc[lambda frame: frame['type'] == 'video.movie']
        .drop(columns=['type'])
        .dropna()
        .copy()
    )

    # Strip the character '0' from the ratings. The result is assigned back
    # to the column instead of calling `replace(..., inplace=True)` on the
    # column selection, which does not reliably modify the parent frame.
    # NOTE(review): this removes *every* '0', so a rating of '10.0' would
    # become '1.' (i.e. 1.0) -- confirm that this is the intended cleaning.
    movies['imdbRating'] = (
        movies['imdbRating']
        .astype(str)
        .str.replace('0', '', regex=False)
        .astype(float)
    )

    # Transform duration to hours (source values are presumably seconds).
    movies['duration'] = movies['duration'] / 60**2

    # Drop years before 1920 and after 2013 because of few observations.
    movies = movies.loc[movies['year'].between(1920, 2013)].copy()

    # Change the type of 'year' to integer.
    movies['year'] = movies['year'].astype(int)

    # Sort observations and reset the index (old labels stay in 'index').
    movies = movies.sort_values('year').reset_index()

    # Decade label as a string, e.g. 1987 -> '1980s'. Every remaining year
    # lies in 1920..2013, so each row receives a label.
    movies['decade'] = (movies['year'] // 10 * 10).astype(str) + 's'

    return movies

In [10]:
# Build the cleaned DataFrame by running gen_df on the path stored in `filename`.
df = gen_df(filename)


Unnamed: 0,index,tid,title,imdbRating,ratingCount,duration,year,nrOfWins,nrOfNominations,nrOfPhotos,...,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western,decade
0,2200,tt0011565,The Penalty (1920),7.6,1095.0,1.5,1920,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1920s
1,502,tt0011841,MÌ_dchenlos (1920),8.1,3134.0,2.416667,1920,0.0,0.0,18.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1920s
2,12832,tt0011865,Irrwege einer Ehe (1920),8.2,1042.0,1.5,1920,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1920s
3,12148,tt0010323,Das Cabinet des Dr. Caligari (1920),8.1,29379.0,1.3,1920,0.0,0.0,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1920s
4,12368,tt0011071,Buster Keaton als StrÌ_fling (1920),7.1,1474.0,0.333333,1920,0.0,0.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1920s


In [24]:
# Parameters
# A seeded generator makes every draw below reproducible when the notebook
# is re-run from a fresh kernel (Restart & Run All).
SEED = 42
rng = np.random.default_rng(SEED)

# Random parameter draws, each uniform on [0, 1); array lengths are unchanged.
alphas = rng.random(23)
betas = rng.random(10)
delta = rng.random(1)
gamma = rng.random(1)
rhos = rng.random(2)

# One uniform random rating in [1, 10) per row of df, and the squared
# difference between the observed IMDb rating and that random rating.
df['Random rating'] = rng.uniform(1, 10, df.shape[0])
df['Difference'] = (df['imdbRating'] - df['Random rating'])**2




Unnamed: 0,index,tid,title,imdbRating,ratingCount,duration,year,nrOfWins,nrOfNominations,nrOfPhotos,...,Short,Sport,TalkShow,Thriller,War,Western,decade,Random rating,Differende,Difference
0,2200,tt0011565,The Penalty (1920),7.6,1095.0,1.5,1920,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1920s,5.576952,5.406342,4.092724
1,502,tt0011841,MÌ_dchenlos (1920),8.1,3134.0,2.416667,1920,0.0,0.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1920s,3.547998,0.923422,20.720719
2,12832,tt0011865,Irrwege einer Ehe (1920),8.2,1042.0,1.5,1920,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1920s,1.540831,-1.235851,44.34453
3,12148,tt0010323,Das Cabinet des Dr. Caligari (1920),8.1,29379.0,1.3,1920,0.0,0.0,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1920s,1.337901,3.113896,45.725977
4,12368,tt0011071,Buster Keaton als StrÌ_fling (1920),7.1,1474.0,0.333333,1920,0.0,0.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1920s,4.468654,4.727482,6.923979
