In [123]:
import numpy as np
import pandas as pd

# Default location of the raw IMDb export that gen_df reads
filename = 'imdb.csv'

def gen_df(filename):
    """Load the IMDb CSV export and return a cleaned, movie-only DataFrame.

    The file is parsed as a ';'-separated, latin-1 encoded CSV that uses a
    backslash as escape character. The cleaning steps are, in order:

    1. Drop the auto-named columns 'Unnamed: 44' .. 'Unnamed: 47' (when
       present) and the columns 'fn', 'wordsInTitle' and 'url'.
    2. Keep only rows whose 'type' is 'video.movie', then drop 'type'.
    3. Drop every row that still contains a missing value.
    4. Strip all '0' characters from the textual form of 'imdbRating' and
       convert the result back to float.
    5. Divide 'duration' by 3600 (seconds -> hours).
    6. Keep rows with 1920 <= year <= 2013 and cast 'year' to int.
    7. Sort by 'year' and reset the index; the previous index is kept as a
       regular column named 'index'.
    8. Add a 'decade' column holding strings such as '1990s'.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned data, ordered by year.

    Raises
    ------
    KeyError
        If one of the required columns ('fn', 'wordsInTitle', 'url', 'type',
        'imdbRating', 'duration', 'year') is missing from the file.
    """
    # read_csv already returns a DataFrame, no extra wrapping needed
    df = pd.read_csv(filename, sep=';', encoding='latin-1', escapechar='\\')

    # pandas names header-less columns 'Unnamed: <position>'; tolerate files
    # that do not contain them instead of failing with a KeyError
    unnamed_cols = [f'Unnamed: {i}' for i in range(44, 48)]
    df = df.drop(columns=unnamed_cols, errors='ignore')
    df = df.drop(columns=['fn', 'wordsInTitle', 'url'])

    # Keep only observations of movie type
    df = df.loc[df['type'] == 'video.movie'].drop(columns=['type'])

    # Drop observations with missing data; copy so later column assignments
    # operate on an independent frame
    df = df.dropna().copy()

    # Remove the '0' characters from the ratings. The result is assigned back
    # explicitly: a chained `df[col].replace(..., inplace=True)` is not
    # guaranteed to modify `df` (it is a no-op under pandas Copy-on-Write).
    # NOTE(review): this strips *every* '0', so '10.0' would become 1.0 -
    # confirm that this is the intended cleaning rule.
    df['imdbRating'] = (
        df['imdbRating']
        .astype(str)
        .replace(to_replace='0', value='', regex=True)
        .astype(float)
    )

    # Transform duration from seconds to hours
    df['duration'] = df['duration'] / 60**2

    # Keep 1920-2013 only; years outside that range have few observations
    in_range = (df['year'] >= 1920) & (df['year'] <= 2013)
    df = df.loc[in_range].copy()

    # Change the type of 'year' to integer
    df['year'] = df['year'].astype(int)

    # Sort observations and reset index (old index is kept as column 'index')
    df = df.sort_values('year').reset_index()

    # Decade label as a string, e.g. 1995 -> '1990s'. Every remaining year
    # lies in 1920..2013, so flooring to the decade covers all rows.
    df['decade'] = (df['year'] // 10 * 10).astype(str) + 's'

    return df

In [124]:
# Load the cleaned data and work on a copy, leaving `df` untouched
filename = 'imdb.csv'
df = gen_df(filename)
imdb = df.copy()

# Starting years of the decades 1920s .. 2010s
decade_list = list(range(1920, 2020, 10))

# Decade dummies: decade_<start> is 1 for movies from that decade, else 0
for decade_start in decade_list:
    imdb[f'decade_{decade_start}'] = (imdb['decade'] == f'{decade_start}s').astype('int64')

# Rescale the rating by a factor of ten
imdb['imdbRating'] = imdb['imdbRating'].div(10)

genre_list = ['Action', 'Adult', 'Adventure', 'Animation', 'Biography',
              'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy',
              'FilmNoir', 'GameShow', 'History', 'Horror', 'Music', 'Musical',
              'Mystery', 'News', 'RealityTV', 'Romance', 'SciFi', 'Short', 'Sport',
              'TalkShow', 'Thriller', 'War', 'Western']

# Pick five of the genres above (by position in genre_list)
genre_top = [genre_list[k] for k in (0, 2, 3, 7, 10)]

# Index from 'decade_list', your favourite decade. The neighbouring decades
# are looked up at index-1 and index+1, so an index of 0 wraps around to the
# last decade and the last index raises an IndexError.
decade_top_index = 8
decade_top = [f'decade_{decade_list[decade_top_index + offset]}' for offset in (-1, 0, 1)]

# Importance of each component, each between 0-5
weight_genre = 5
weight_decade = 5
weight_rating = 5

# Importance of each of the five chosen genres, each between 0-5
weight_genre0 = 5
weight_genre1 = 5
weight_genre2 = 5
weight_genre3 = 5
weight_genre4 = 5

# Normalise the component weights so that alpha + beta + gamma == 1
weight_total = weight_genre + weight_decade + weight_rating
alpha = weight_genre / weight_total
beta = weight_decade / weight_total
gamma = weight_rating / weight_total

# Normalise the genre weights so that alpha0 + ... + alpha4 == 1
genre_weight_total = weight_genre0 + weight_genre1 + weight_genre2 + weight_genre3 + weight_genre4
alpha0 = weight_genre0 / genre_weight_total
alpha1 = weight_genre1 / genre_weight_total
alpha2 = weight_genre2 / genre_weight_total
alpha3 = weight_genre3 / genre_weight_total
alpha4 = weight_genre4 / genre_weight_total


def u(df, genre, decade, rating):
    """Return the weighted utility score for every row of ``df``.

    The score is ``alpha * genre_term + beta * decade_term + gamma * rating``
    where the weights ``alpha``, ``beta``, ``gamma`` and ``alpha0``..``alpha4``
    are read from the module-level variables of the same names.

    Parameters
    ----------
    df : DataFrame-like
        Object indexed by column name.
    genre : sequence
        Names of five columns; column ``k`` is weighted by ``alpha<k>``.
    decade : sequence
        Names of three columns; the middle one gets weight 1, the two
        outer ones weight 0.5.
    rating : str
        Name of the rating column.

    Returns
    -------
    The element-wise weighted sum of the selected columns.
    """
    genre_weights = (alpha0, alpha1, alpha2, alpha3, alpha4)

    # Accumulate left to right so the summation order matches the formula
    genre_term = genre_weights[0] * df[genre[0]]
    for k in range(1, 5):
        genre_term = genre_term + genre_weights[k] * df[genre[k]]

    decade_term = 0.5 * df[decade[0]] + df[decade[1]] + 0.5 * df[decade[2]]

    return alpha * genre_term + beta * decade_term + gamma * df[rating]

# Utility score of every movie, stored as column 'U'
utility = u(imdb, genre_top, decade_top, 'imdbRating')
imdb['U'] = utility

# Utility multiplied by the movie's 'duration' column
imdb['Uxdur'] = utility * imdb['duration']

#imdb.head(10)

In [159]:
def p99(x):
    """Return the 99th percentile of ``x``."""
    return np.percentile(x, 99)


# Keep only the movies whose 'Uxdur' lies strictly above the 99th percentile
temp = imdb.copy()
uxdur_cutoff = p99(temp['Uxdur'])
temp = temp.loc[temp['Uxdur'] > uxdur_cutoff]

duration = np.array(temp['duration'])
uxdur = np.array(temp['Uxdur'])

print(duration.shape)
print(uxdur.shape)

# The second and third pick are drawn from the very same candidates
duration2 = duration3 = duration
uxdur2 = uxdur3 = uxdur

# Element [i, j, l] of each summed grid refers to the triple (i, j, l)
d1_grid, d2_grid, d3_grid = np.meshgrid(duration, duration2, duration3, indexing='ij')
u1_grid, u2_grid, u3_grid = np.meshgrid(uxdur, uxdur2, uxdur3, indexing='ij')

duration_grid = d1_grid + d2_grid + d3_grid
uxdur_grid = u1_grid + u2_grid + u3_grid

print(np.min(duration_grid), np.max(uxdur_grid))
#print(duration_grid)
#print(uxdur_grid)

(104,)
(104,)
7.0 13.603333333333333


In [163]:
# Find the triple of three *different* movies with the highest summed
# utility whose summed duration stays below the time budget.
max_total_hours = 9

# Defaults for the case that no triple qualifies. Without them the result
# variables would be undefined (or hold stale values from another cell).
U_max = 0
dur_max = 0
i_index = j_index = l_index = None

# Index arrays that broadcast to the (n, n, n) shape of the grids
positions = np.arange(len(uxdur))
first = positions[:, None, None]
second = positions[None, :, None]
third = positions[None, None, :]
all_distinct = (first != second) & (first != third) & (second != third)

# Same conditions as an explicit triple loop starting from U_max = 0:
# distinct movies, within the time budget, strictly positive utility
feasible = all_distinct & (duration_grid < max_total_hours) & (uxdur_grid > U_max)

if feasible.any():
    # argmax returns the first maximum in row-major order, i.e. the same
    # triple a nested i/j/l loop with a strict '>' comparison would keep
    candidates = np.where(feasible, uxdur_grid, -np.inf)
    best_flat = np.argmax(candidates)
    i_index, j_index, l_index = (int(k) for k in np.unravel_index(best_flat, candidates.shape))
    U_max = uxdur_grid[i_index, j_index, l_index]
    dur_max = duration_grid[i_index, j_index, l_index]
    print(f'The max utility is {U_max:.2f} with total duration of {dur_max:.2f} hours. The index is {i_index},{j_index},{l_index}')
else:
    print(f'No combination of three different movies has positive utility and a total duration below {max_total_hours} hours.')

The max utility is 7.20 with total duration of 8.85 hours. The index is 52,79,40


In [118]:
## This is a test: brute-force search on a small toy grid

import numpy as np

n=10

# Three identical axes on 0..10 (objective) and three on 0..100 (constraint)
A1 = np.linspace(0,10,n)
A2 = np.linspace(0,10,n)
A3 = np.linspace(0,10,n)
A4 = np.linspace(0,100,n)
A5 = np.linspace(0,100,n)
A6 = np.linspace(0,100,n)

A1_grid, A2_grid, A3_grid= np.meshgrid(A1,A2,A3,indexing='ij')
A4_grid, A5_grid, A6_grid= np.meshgrid(A4,A5,A6,indexing='ij')

# Element [i, j, l] holds the sum over the triple (i, j, l)
A_grid1 = A1_grid + A2_grid + A3_grid
A_grid2 = A4_grid + A5_grid + A6_grid

# Named `max_sum` rather than `max` so the builtin max() is not shadowed
# for the rest of the session
max_sum = 0

# Start from None so a search without any match is visible in the output
# instead of raising a NameError or showing values from an earlier run
i_index = j_index = l_index = None

for i in range(n):
    for j in range(n):
        for l in range(n):
            # The three picks must be pairwise different
            if len({i, j, l}) < 3:
                continue
            if (A_grid2[i,j,l] > 50) and (A_grid1[i,j,l] > max_sum):
                max_sum = A_grid1[i,j,l]
                i_index = i
                j_index = j
                l_index = l

print(max_sum, i_index, j_index, l_index)

26.666666666666668 7 8 9
