In [1198]:
import pandas as pd
import numpy as np
import duckdb

## DuckDB

### Initializing a DuckDB database

In [1199]:
con = duckdb.connect(database=':memory:')

In [1200]:
con.execute('''
    CREATE TABLE training_set AS SELECT * FROM read_csv_auto('train-1.csv')
''')

<duckdb.DuckDBPyConnection at 0x1c9229b2f70>

In [1201]:
con.execute("SELECT * FROM training_set").fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,4,tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True
1,7,tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True
2,9,tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
3,25,tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True
...,...,...,...,...,...,...,...,...,...
958,9955,tt9558612,PM Náréndrá Mớdi,PM Narendra Modi,2019,\N,136,7005.0,False
959,9960,tt9598172,Sáving Léningrád,,2019,\N,96,2200.0,False
960,9977,tt9691136,Shadow in the Cloud,,2020,\N,83,22617.0,False
961,9979,tt9695258,So My Grandma's a Lesbian!,Salir del ropero,2019,\N,94,1054.0,False


In [1202]:
con.execute('''
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-2.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-3.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-4.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-5.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-6.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-7.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-8.csv')
''')

<duckdb.DuckDBPyConnection at 0x1c9229b2f70>

In [1203]:
con.execute("SELECT * FROM training_set").fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,4,tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True
1,7,tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True
2,9,tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
3,25,tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True
...,...,...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,,2019,\N,87,12951.0,False
7955,9981,tt9741310,Slaxx,Slaxx,2020,\N,77,2464.0,False
7956,9982,tt9742392,Kindred,Kindred,2020,\N,101,1719.0,False
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,,2020,\N,111,4144.0,True


### Checking the number of null values in the columns

In [1204]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE primaryTitle IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [1205]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE originalTitle IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,3988


In [1206]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE startYear = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,786


In [1207]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE runtimeMinutes = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,13


In [1208]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE numVotes IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,790


## Data Cleaning

In [1209]:
def execute(query):
    result = con.execute(query).fetchdf()
    return result

### 1) Replace missing values in startYear column

In [1210]:
def replace_missing_startYear(input_name):
    query = '''
        UPDATE ''' + input_name + '''
        SET startYear = endYear
        WHERE startYear = '\\N'
    '''
    return query

In [1211]:
execute(replace_missing_startYear('training_set'))

Unnamed: 0,Count
0,786


In [1212]:
con.execute("SELECT * FROM training_set WHERE startYear = '\\N'").fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label


### 2) Dropping endYear column

Now that the missing values in startYear have been replaced by the values of endYear, this latter column is no longer necessary. We can drop this column.

In [1213]:
def drop_endYear(input_name):
    query = '''
        ALTER TABLE ''' + input_name + '''
        DROP COLUMN endYear
    '''
    return query

In [1214]:
execute(drop_endYear('training_set'))

Unnamed: 0,Success


In [1215]:
con.execute("SELECT * FROM training_set").fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,numVotes,label
0,4,tt0010600,The Doll,Die Puppe,1919,66,1898.0,True
1,7,tt0011841,Way Down East,Way Down East,1920,145,5376.0,True
2,9,tt0012494,Déstiny,Der müde Tod,1921,97,5842.0,True
3,25,tt0015163,The Navigator,The Navigator,1924,59,9652.0,True
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,93,17887.0,True
...,...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,,2019,87,12951.0,False
7955,9981,tt9741310,Slaxx,Slaxx,2020,77,2464.0,False
7956,9982,tt9742392,Kindred,Kindred,2020,101,1719.0,False
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,,2020,111,4144.0,True


### 3) Replace missing values in runtimeMinutes column

First, we can substitute the null values with 0 and set it to type integer so subsequent computations are easy to do. Then, we use the yearly average to impute the missing values - so we replace by the average of `runtimeMins` for movies belonging to the same year.

In [1216]:
def convert_runtimeMins(input_name):
    query = '''
        UPDATE ''' + input_name + ''' SET runtimeMinutes = 0 WHERE runtimeMinutes = '\\N';
        ALTER TABLE ''' + input_name + ''' ALTER COLUMN runtimeMinutes SET DATA TYPE INTEGER;
    '''
    return query

In [1217]:
def calculate_missing_runtimeMins(input_name):
    query = '''
        UPDATE ''' + input_name + ''' m1 
        SET runtimeMinutes = (
          SELECT AVG(runtimeMinutes) as yearly_mean 
          FROM ''' + input_name + ''' m2 
          WHERE m1.startYear = m2.startYear AND runtimeMinutes > 0 
          GROUP BY m2.startYear
        )
        WHERE runtimeMinutes = 0;
    '''
    return query

In [1218]:
execute(convert_runtimeMins('training_set'))

Unnamed: 0,Success


In [1219]:
execute(calculate_missing_runtimeMins('training_set'))

Unnamed: 0,Count
0,13


In [1220]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE runtimeMinutes = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [1221]:
con.execute("SELECT runtimeMinutes FROM training_set WHERE runtimeMinutes = 0").fetchdf()

Unnamed: 0,runtimeMinutes


### 4) Replace missing values in numVotes column

Similar to the previous one, we can substitute the null values in `numVotes` with 0 and set it to type integer. Then, we can use a trimmed average of the `numVotes` column, excluding the smallest and largest values so that the mean is less skewed.

In [1222]:
def convert_numVotes(input_name):
    query = '''
        UPDATE ''' + input_name + ''' SET numVotes = 0 WHERE numVotes IS NULL;
        ALTER TABLE ''' + input_name + ''' ALTER COLUMN numVotes SET DATA TYPE INTEGER;
    '''
    return query

In [1223]:
def calculate_missing_numVotes(input_name):
    query = '''
        UPDATE ''' + input_name + ''' m1 
        SET numVotes = (
          SELECT (SUM(numVotes) - MIN(numVotes) - MAX(numVotes)) / CAST(COUNT(*)-2 as FLOAT) as trimmed_mean 
          FROM ''' + input_name + ''' m2 
          WHERE numVotes > 0
        )
        WHERE numVotes = 0;
    '''
    return query

In [1224]:
execute(convert_numVotes('training_set'))

Unnamed: 0,Success


In [1225]:
execute(calculate_missing_numVotes('training_set'))

Unnamed: 0,Count
0,790


In [1226]:
con.execute("SELECT numVotes FROM training_set WHERE numVotes IS NULL").fetchdf()

Unnamed: 0,numVotes


In [1227]:
con.execute("SELECT numVotes FROM training_set WHERE numVotes = 0").fetchdf()

Unnamed: 0,numVotes


### 5) Dropping the originalTitle column

Since over 50% of the values in originalTitle are null and the primaryTitle already contains the main information regarding the movie name, we can discard this column from our dataset.

In [1228]:
def drop_originalTitle(input_name):
    query = '''
        ALTER TABLE ''' + input_name + '''
        DROP COLUMN originalTitle
    '''
    return query

In [1229]:
execute(drop_originalTitle('training_set'))

Unnamed: 0,Success


### 6) Dropping the column0 column

In [1230]:
def drop_column_zero(input_name):
    query = '''
        ALTER TABLE ''' + input_name + '''
        DROP COLUMN column0
    '''
    return query

In [1231]:
execute(drop_column_zero('training_set'))

Unnamed: 0,Success


In [1232]:
training_set = con.execute('''
    SELECT * FROM training_set
''').fetchdf()
training_set = training_set.sort_values(by='tconst')
# training_set
training_set.to_csv('training_set.csv')

### 6) Checking duplicate rows

In [1233]:
con.execute('''
    SELECT * FROM training_set 
    GROUP BY *
    HAVING COUNT(*) > 1
''').fetchdf()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label


In [1234]:
con.execute('''
    SELECT tconst, primaryTitle FROM training_set
    GROUP BY tconst, primaryTitle
    HAVING COUNT(*) > 1
''').fetchdf()

Unnamed: 0,tconst,primaryTitle


In [1235]:
con.execute('''
    SELECT * FROM training_set 
    WHERE primaryTitle = 'Sabrina'
''').fetchdf()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label
0,tt7981492,Sabrina,2018,113,1662,False
1,tt0047437,Sabrina,1954,113,62736,True


In [1236]:
con.execute('''
    SELECT * FROM training_set
    WHERE primaryTitle IN (
        SELECT primaryTitle FROM training_set
        GROUP BY primaryTitle
        HAVING COUNT(*) > 1
    )
''').fetchdf()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label
0,tt0016220,The Phantom of the Opera,1925,93,17887,True
1,tt0024216,King Kong,1933,100,83177,True
2,tt0031647,Midnight,1939,94,4904,True
3,tt0033152,The Thief of Bagdad,1940,106,12840,True
4,tt0038355,The Big Sleep,1946,114,83357,True
...,...,...,...,...,...,...
213,tt3462710,Unforgettable,2017,100,15087,False
214,tt4008758,Black,2015,95,4152,True
215,tt7984766,The King,2019,140,110160,True
216,tt8144778,The Redeemed and the Dominant: Fittest on Earth,2018,119,1347,True


In [1237]:
con.execute('''
    SELECT * FROM training_set
    WHERE primaryTitle = 'The Phantom of the Opera'
''').fetchdf()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label
0,tt0016220,The Phantom of the Opera,1925,93,17887,True
1,tt0119889,The Phantom of the Opera,1998,99,5390,False


### Adding external data to this dataset

In [1238]:
df_external_movies = pd.read_csv('movies.csv')
df_external_movies = df_external_movies[['movieId', 'title']]
df_external_movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [1239]:
df_external_movies['release_year'] = df_external_movies['title'].str.extract(r'\((\d{4})\)$')
df_external_movies['title'] = df_external_movies['title'].str.replace(r'\s*\(\d{4}\)$', '')
df_external_movies

Unnamed: 0,movieId,title,release_year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,2017
9738,193583,No Game No Life: Zero,2017
9739,193585,Flint,2017
9740,193587,Bungo Stray Dogs: Dead Apple,2018


In [1240]:
df_external_ratings = pd.read_csv('ratings.csv')
df_external_ratings = df_external_ratings[['movieId', 'rating']]
df_external_ratings

Unnamed: 0,movieId,rating
0,1,4.0
1,3,4.0
2,6,4.0
3,47,5.0
4,50,5.0
...,...,...
100831,166534,4.0
100832,168248,5.0
100833,168250,5.0
100834,168252,5.0


In [1241]:
avg_ratings = df_external_ratings.groupby('movieId')['rating'].mean().reset_index()
df_external_avgRatings = pd.DataFrame(avg_ratings)
df_external_avgRatings

Unnamed: 0,movieId,rating
0,1,3.920930
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429
...,...,...
9719,193581,4.000000
9720,193583,3.500000
9721,193585,3.500000
9722,193587,3.500000


In [1242]:
df_external = pd.merge(df_external_movies, df_external_avgRatings, on='movieId')
df_external

Unnamed: 0,movieId,title,release_year,rating
0,1,Toy Story,1995,3.920930
1,2,Jumanji,1995,3.431818
2,3,Grumpier Old Men,1995,3.259615
3,4,Waiting to Exhale,1995,2.357143
4,5,Father of the Bride Part II,1995,3.071429
...,...,...,...,...
9719,193581,Black Butler: Book of the Atlantic,2017,4.000000
9720,193583,No Game No Life: Zero,2017,3.500000
9721,193585,Flint,2017,3.500000
9722,193587,Bungo Stray Dogs: Dead Apple,2018,3.500000


In [1243]:
df_external.to_csv('external_data.csv', index=False)

In [1244]:
con.execute('''
    CREATE TABLE movielens_data AS SELECT * FROM df_external
''')

<duckdb.DuckDBPyConnection at 0x1c9229b2f70>

In [1245]:
con.execute('''SELECT * FROM movielens_data''').fetchdf()

Unnamed: 0,movieId,title,release_year,rating
0,1,Toy Story,1995,3.920930
1,2,Jumanji,1995,3.431818
2,3,Grumpier Old Men,1995,3.259615
3,4,Waiting to Exhale,1995,2.357143
4,5,Father of the Bride Part II,1995,3.071429
...,...,...,...,...
9719,193581,Black Butler: Book of the Atlantic,2017,4.000000
9720,193583,No Game No Life: Zero,2017,3.500000
9721,193585,Flint,2017,3.500000
9722,193587,Bungo Stray Dogs: Dead Apple,2018,3.500000


In [1246]:
# sql_query = f"ALTER TABLE movielens_data DROP COLUMN movieId"
# con.execute(sql_query)

In [1247]:
con.execute('''SELECT * FROM movielens_data''').fetchdf()

Unnamed: 0,movieId,title,release_year,rating
0,1,Toy Story,1995,3.920930
1,2,Jumanji,1995,3.431818
2,3,Grumpier Old Men,1995,3.259615
3,4,Waiting to Exhale,1995,2.357143
4,5,Father of the Bride Part II,1995,3.071429
...,...,...,...,...
9719,193581,Black Butler: Book of the Atlantic,2017,4.000000
9720,193583,No Game No Life: Zero,2017,3.500000
9721,193585,Flint,2017,3.500000
9722,193587,Bungo Stray Dogs: Dead Apple,2018,3.500000


### Checking for duplicates

In [1248]:
con.execute('''
    SELECT title FROM movielens_data 
    GROUP BY title
    HAVING COUNT(*) > 1
''').fetchdf()

Unnamed: 0,title
0,Sabrina
1,Persuasion
2,"Misérables, Les"
3,Black Sheep
4,Broken Arrow
...,...
247,Feast
248,12 Chairs
249,Dad's Army
250,Elsa & Fred


In [1249]:
con.execute('''
    SELECT * FROM movielens_data
    WHERE title = 'Sabrina'
''').fetchdf()

Unnamed: 0,movieId,title,release_year,rating
0,7,Sabrina,1995,3.185185
1,915,Sabrina,1954,3.766667


In [1250]:
con.execute('''
    SELECT * FROM movielens_data
    WHERE title = 'Persuasion'
''').fetchdf()

Unnamed: 0,movieId,title,release_year,rating
0,28,Persuasion,1995,4.227273
1,74508,Persuasion,2007,3.333333


In [1251]:
con.execute('''
    DELETE FROM movielens_data
    WHERE title IN (
        SELECT title FROM movielens_data
        GROUP BY title
        HAVING COUNT(*) > 1
    )
''')

<duckdb.DuckDBPyConnection at 0x1c9229b2f70>

In [1252]:
con.execute('''
    SELECT title FROM movielens_data 
    GROUP BY title
    HAVING COUNT(*) > 1
''').fetchdf()

Unnamed: 0,title


In [1253]:
con.execute('''SELECT * FROM movielens_data''').fetchdf()

Unnamed: 0,movieId,title,release_year,rating
0,1,Toy Story,1995,3.920930
1,2,Jumanji,1995,3.431818
2,3,Grumpier Old Men,1995,3.259615
3,4,Waiting to Exhale,1995,2.357143
4,5,Father of the Bride Part II,1995,3.071429
...,...,...,...,...
9189,193581,Black Butler: Book of the Atlantic,2017,4.000000
9190,193583,No Game No Life: Zero,2017,3.500000
9191,193585,Flint,2017,3.500000
9192,193587,Bungo Stray Dogs: Dead Apple,2018,3.500000


### 6) Merging tables

In [1125]:
def add_external_columns(input_name, external_name):
    query = '''
        CREATE TABLE merged_''' + input_name + ''' AS SELECT * FROM 
        ''' + input_name + ''' LEFT JOIN ''' + external_name + ''' 
        ON ''' + input_name + '''.primaryTitle = ''' + external_name + '''.title
        AND ''' + input_name + '''.startYear = ''' + external_name + '''.release_year
    '''
    return query

In [1126]:
execute(add_external_columns('training_set', 'movielens_data'))

Unnamed: 0,Count
0,7959


In [1127]:
con.execute('''SELECT * FROM merged_training_set''').fetchdf()
# merged = merged[['tconst', 'primaryTitle', "startYear", "runtimeMinutes", "numVotes", "label"]]
# merged = merged.sort_values('tconst')
# # merged
# merged.to_csv('merged.csv')

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,movieId,title,release_year,rating
0,tt0031385,"Goodbye, Mr. Chips",1939,114,10311,True,7311.0,"Goodbye, Mr. Chips",1939.0,3.666667
1,tt0031762,Only Angels Have Wings,1939,121,13595,True,2847.0,Only Angels Have Wings,1939.0,3.000000
2,tt0034862,Holiday Inn,1942,100,14436,True,3061.0,Holiday Inn,1942.0,3.625000
3,tt0039631,Monsieur Verdoux,1947,124,16962,True,3632.0,Monsieur Verdoux,1947.0,4.000000
4,tt0053183,Pork Chop Hill,1959,97,4527,True,2669.0,Pork Chop Hill,1959.0,3.500000
...,...,...,...,...,...,...,...,...,...,...
7954,tt0126886,Election,1999,103,29179,True,2599.0,Election,1999.0,3.660714
7955,tt0277027,I Am Sam,2001,132,147514,True,5014.0,I Am Sam,2001.0,3.764706
7956,tt4284010,"What Happened, Miss Simone?",2015,101,13425,True,127164.0,"What Happened, Miss Simone?",2015.0,3.500000
7957,tt0116922,Lost Highway,1997,134,137572,True,1464.0,Lost Highway,1997.0,3.264706


### 7) Cleaning merged table

### Drop title column

Contained a lot of missing values and redundant information that was already present in primaryTitle

In [1134]:
def drop_movie_id(input_name):
    query = '''
        ALTER TABLE ''' + input_name + '''
        DROP COLUMN movieId
    '''
    return query

In [1135]:
execute(drop_movie_id('merged_training_set'))

Unnamed: 0,Success


In [1128]:
def drop_title(input_name):
    query = '''
        ALTER TABLE ''' + input_name + '''
        DROP COLUMN title
    '''
    return query

In [1129]:
execute(drop_title('merged_training_set'))

Unnamed: 0,Success


In [1130]:
def drop_release_year(input_name):
    query = '''
        ALTER TABLE ''' + input_name + '''
        DROP COLUMN release_year
    '''
    return query

In [1132]:
execute(drop_release_year('merged_training_set'))

Unnamed: 0,Success


In [1136]:
con.execute('''SELECT * FROM merged_training_set''').fetchdf()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,rating
0,tt0031385,"Goodbye, Mr. Chips",1939,114,10311,True,3.666667
1,tt0031762,Only Angels Have Wings,1939,121,13595,True,3.000000
2,tt0034862,Holiday Inn,1942,100,14436,True,3.625000
3,tt0039631,Monsieur Verdoux,1947,124,16962,True,4.000000
4,tt0053183,Pork Chop Hill,1959,97,4527,True,3.500000
...,...,...,...,...,...,...,...
7954,tt0126886,Election,1999,103,29179,True,3.660714
7955,tt0277027,I Am Sam,2001,132,147514,True,3.764706
7956,tt4284010,"What Happened, Miss Simone?",2015,101,13425,True,3.500000
7957,tt0116922,Lost Highway,1997,134,137572,True,3.264706


### Impute missing values in rating column

Using same technique as used for numVotes, use a trimmed average of budget column, excluding smallest & largest values

In [1137]:
con.execute('''
    SELECT COUNT(*) FROM merged_training_set WHERE rating IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,6993


In [1138]:
def convert_rating(input_name):
    query = '''
        UPDATE ''' + input_name + ''' SET rating = 0 WHERE rating IS NULL;
        ALTER TABLE ''' + input_name + ''' ALTER COLUMN rating SET DATA TYPE INTEGER;
    '''
    return query

In [1139]:
def calculate_missing_rating(input_name):
    query = '''
        UPDATE ''' + input_name + ''' m1 
        SET rating = (
          SELECT (SUM(rating) - MIN(rating) - MAX(rating)) / CAST(COUNT(*)-2 as FLOAT) as trimmed_mean 
          FROM ''' + input_name + ''' m2 
          WHERE rating > 0
        )
        WHERE rating = 0;
    '''
    return query

In [1140]:
execute(convert_rating('merged_training_set'))

Unnamed: 0,Success


In [1141]:
execute(calculate_missing_rating('merged_training_set'))

Unnamed: 0,Count
0,7003


In [1142]:
con.execute('''SELECT * FROM merged_training_set''').fetchdf()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,rating
0,tt0031385,"Goodbye, Mr. Chips",1939,114,10311,True,4
1,tt0031762,Only Angels Have Wings,1939,121,13595,True,3
2,tt0034862,Holiday Inn,1942,100,14436,True,4
3,tt0039631,Monsieur Verdoux,1947,124,16962,True,4
4,tt0053183,Pork Chop Hill,1959,97,4527,True,4
...,...,...,...,...,...,...,...
7954,tt0126886,Election,1999,103,29179,True,4
7955,tt0277027,I Am Sam,2001,132,147514,True,4
7956,tt4284010,"What Happened, Miss Simone?",2015,101,13425,True,4
7957,tt0116922,Lost Highway,1997,134,137572,True,3


### Check column rating for missing values

In [1143]:
con.execute('''
    SELECT * FROM merged_training_set
    WHERE rating IS NULL
''').fetchdf()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,rating


## Model training

In [1144]:
df = con.execute('''SELECT * FROM training_set''').fetchdf()
df

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label
0,tt0010600,The Doll,1919,66,1898,True
1,tt0011841,Way Down East,1920,145,5376,True
2,tt0012494,Déstiny,1921,97,5842,True
3,tt0015163,The Navigator,1924,59,9652,True
4,tt0016220,The Phantom of the Opera,1925,93,17887,True
...,...,...,...,...,...,...
7954,tt9625664,Trauma Center,2019,87,12951,False
7955,tt9741310,Slaxx,2020,77,2464,False
7956,tt9742392,Kindred,2020,101,1719,False
7957,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,2020,111,4144,True


In [1145]:
df_merged = con.execute('''SELECT * FROM merged_training_set''').fetchdf()
df_merged

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,rating
0,tt0031385,"Goodbye, Mr. Chips",1939,114,10311,True,4
1,tt0031762,Only Angels Have Wings,1939,121,13595,True,3
2,tt0034862,Holiday Inn,1942,100,14436,True,4
3,tt0039631,Monsieur Verdoux,1947,124,16962,True,4
4,tt0053183,Pork Chop Hill,1959,97,4527,True,4
...,...,...,...,...,...,...,...
7954,tt0126886,Election,1999,103,29179,True,4
7955,tt0277027,I Am Sam,2001,132,147514,True,4
7956,tt4284010,"What Happened, Miss Simone?",2015,101,13425,True,4
7957,tt0116922,Lost Highway,1997,134,137572,True,3


In [459]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and test sets
X = df[['startYear', 'runtimeMinutes', 'numVotes']]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an SVM classifier
clf = SVC()
clf.fit(X_train, y_train)

# Make predictions on the test set and evaluate the performance
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Test set accuracy: {accuracy:.2f}')

Test set accuracy: 0.56


In [1146]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and test sets
X = df_merged[['startYear', 'runtimeMinutes', 'numVotes', 'rating']]
y = df_merged['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an SVM classifier
clf = SVC()
clf.fit(X_train, y_train)

# Make predictions on the test set and evaluate the performance
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Test set accuracy: {accuracy:.2f}')

Test set accuracy: 0.56


In [461]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Split data into features and labels
X = df[['startYear', 'runtimeMinutes', 'numVotes']]
y = df['label']

# Train-test split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(train_X, train_y)
lr_pred = lr_model.predict(test_X)
lr_acc = accuracy_score(test_y, lr_pred)
print("Logistic Regression Accuracy: {:.4f}".format(lr_acc))

# Train random forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_X, train_y)
rf_pred = rf_model.predict(test_X)
rf_acc = accuracy_score(test_y, rf_pred)
print("Random Forest Accuracy: {:.4f}".format(rf_acc))

# Train gradient boosted trees model
gbt_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbt_model.fit(train_X, train_y)
gbt_pred = gbt_model.predict(test_X)
gbt_acc = accuracy_score(test_y, gbt_pred)
print("Gradient Boosted Trees Accuracy: {:.4f}".format(gbt_acc))

Logistic Regression Accuracy: 0.6828
Random Forest Accuracy: 0.6954
Gradient Boosted Trees Accuracy: 0.7274


In [1147]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Split data into features and labels
X = df_merged[['startYear', 'runtimeMinutes', 'numVotes', 'rating']]
y = df_merged['label']

# Train-test split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(train_X, train_y)
lr_pred = lr_model.predict(test_X)
lr_acc = accuracy_score(test_y, lr_pred)
print("Logistic Regression Accuracy: {:.4f}".format(lr_acc))

# Train random forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_X, train_y)
rf_pred = rf_model.predict(test_X)
rf_acc = accuracy_score(test_y, rf_pred)
print("Random Forest Accuracy: {:.4f}".format(rf_acc))

# Train gradient boosted trees model
gbt_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbt_model.fit(train_X, train_y)
gbt_pred = gbt_model.predict(test_X)
gbt_acc = accuracy_score(test_y, gbt_pred)
print("Gradient Boosted Trees Accuracy: {:.4f}".format(gbt_acc))

Logistic Regression Accuracy: 0.7048
Random Forest Accuracy: 0.7205
Gradient Boosted Trees Accuracy: 0.7456


## Validation set

In [463]:
con.execute('''
    CREATE TABLE validation_set AS SELECT * FROM read_csv_auto('validation_hidden.csv')
''')

<duckdb.DuckDBPyConnection at 0x1c9619927f0>

In [464]:
con.execute('''SELECT * FROM validation_set''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes
0,0,tt0003740,Cabiria,,1914,\N,148,3452.0
1,1,tt0008663,A Man There Was,Terje Vigen,1917,\N,65,1882.0
2,3,tt0010307,J'accuse!,,1919,\N,166,1692.0
3,18,tt0014429,Safety Last!,Safety Last!,1923,\N,74,19898.0
4,27,tt0015175,Die Nibelungen: Siegfried,,1924,\N,143,5676.0
...,...,...,...,...,...,...,...,...
950,9974,tt9686154,You Will Die at 20,,2019,\N,103,2106.0
951,9976,tt9690328,Pápér Spidérs,Paper Spiders,2020,\N,109,
952,9980,tt9735790,Me You Madness,Me You Madness,2021,\N,98,1056.0
953,9984,tt9769668,Tughlaq Durbar,Tughlaq Durbar,2021,\N,145,1430.0


In [465]:
## Replace missing startYear values and drop endYear
execute(replace_missing_startYear('validation_set'))
execute(drop_endYear('validation_set'))

## Replace missing runtimeMins values
execute(convert_runtimeMins('validation_set'))
execute(calculate_missing_runtimeMins('validation_set'))

## Replace missing numVotes values
execute(convert_numVotes('validation_set'))
execute(calculate_missing_numVotes('validation_set'))

## Dropping the originalTitle column
execute(drop_originalTitle('validation_set'))
execute(drop_column_zero('validation_set'))
execute(add_external_columns('validation_set', 'movielens_data'))
execute(drop_title('merged_validation_set'))
execute(drop_original_title('merged_validation_set'))
execute(drop_id('merged_validation_set'))
execute(drop_release_year('merged_validation_set'))

execute(convert_budget('merged_validation_set'))
execute(calculate_missing_budget('merged_validation_set'))
execute(convert_vote_average('merged_validation_set'))
execute(calculate_missing_vote_average('merged_validation_set'))
execute(convert_revenue('merged_validation_set'))
execute(calculate_missing_revenue_average('merged_validation_set'))

Unnamed: 0,Count
0,809


In [466]:
con.execute('''
    SELECT COUNT(*) FROM validation_set WHERE primaryTitle IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [467]:
con.execute('''
    SELECT COUNT(*) FROM validation_set WHERE startYear = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [468]:
con.execute('''
    SELECT COUNT(*) FROM validation_set WHERE runtimeMinutes = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [469]:
con.execute('''
    SELECT COUNT(*) FROM validation_set WHERE numVotes IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [512]:
df_validation = con.execute('''SELECT * FROM validation_set''').fetchdf()
df_validation

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes
0,tt0003740,Cabiria,1914,148,3452
1,tt0008663,A Man There Was,1917,65,1882
2,tt0010307,J'accuse!,1919,166,1692
3,tt0014429,Safety Last!,1923,74,19898
4,tt0015175,Die Nibelungen: Siegfried,1924,143,5676
...,...,...,...,...,...
950,tt9686154,You Will Die at 20,2019,103,2106
951,tt9690328,Pápér Spidérs,2020,109,25068
952,tt9735790,Me You Madness,2021,98,1056
953,tt9769668,Tughlaq Durbar,2021,145,1430


In [513]:
df_validation_merged = con.execute('''SELECT * FROM merged_validation_set''').fetchdf()
df_validation_merged = df_validation_merged.sort_values(by='tconst').reset_index()
# df_validation = df_validation[:-1]
df_validation_merged


Unnamed: 0,index,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,budget,revenue,vote_average
0,273,tt0003740,Cabiria,1914,148,3452,210,64777168,7
1,300,tt0008663,A Man There Was,1917,65,1882,20680528,64777168,7
2,281,tt0010307,J'accuse!,1919,166,1692,20680528,64777168,8
3,141,tt0014429,Safety Last!,1923,74,19898,20680528,623,8
4,210,tt0015175,Die Nibelungen: Siegfried,1924,143,5676,20680528,64777168,8
...,...,...,...,...,...,...,...,...,...
951,951,tt9686154,You Will Die at 20,2019,103,2106,20680528,64777168,6
952,952,tt9690328,Pápér Spidérs,2020,109,25068,20680528,64777168,6
953,953,tt9735790,Me You Madness,2021,98,1056,20680528,64777168,6
954,954,tt9769668,Tughlaq Durbar,2021,145,1430,20680528,64777168,6


In [515]:
df_validation_merged = df_validation_merged[['tconst', 'primaryTitle', 'startYear', 'runtimeMinutes', 'numVotes']]
df_validation_merged
df_diff = pd.concat([df_validation,df_validation_merged]).drop_duplicates(keep=False)
df_diff
# merged = pd.merge(df_validation, df_validation_merged, how='outer', indicator=True)
# extra_rows = merged[merged['_merge'] == 'right_only']
# print(extra_rows)

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes


In [471]:
X_val = df_validation[['startYear', 'runtimeMinutes', 'numVotes', 'budget', 'revenue', 'vote_average']]

# SVM
y_val_preds = clf.predict(X_val)

# Logistic regression model
lr_val_preds = lr_model.predict(X_val)

# Random forest model
rf_val_preds = rf_model.predict(X_val)

# Gradient boosted trees model
gbt_val_preds = gbt_model.predict(X_val)

In [472]:
val_preds = [str(pred) for pred in gbt_val_preds]

In [473]:
pd.DataFrame(val_preds).to_csv('val_predictions.csv', index=False, header=False)

## Test set

In [474]:
con.execute('''
    CREATE TABLE test_set AS SELECT * FROM read_csv_auto('test_hidden.csv')
''')

<duckdb.DuckDBPyConnection at 0x1c9619927f0>

In [475]:
con.execute('''SELECT * FROM test_set''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes
0,22,tt0014972,He Who Gets Slapped,He Who Gets Slapped,1924,\N,95,3654.0
1,23,tt0015016,The Iron Horse,,1924,\N,150,2136.0
2,26,tt0015174,Die Nibelungen: Kriemhild's Revenge,,1924,\N,129,4341.0
3,28,tt0015214,At 3:25,,\N,1925,59,1724.0
4,34,tt0015863,Go West,,1925,\N,69,4188.0
...,...,...,...,...,...,...,...,...
1081,9942,tt9430698,One Piece: Stampede,,2019,\N,101,5109.0
1082,9943,tt9441638,The Big Ugly,,2020,\N,106,5780.0
1083,9948,tt9495690,Págálpánti,Pagalpanti,2019,\N,149,2331.0
1084,9950,tt9519642,The Wedding Unplanner,,2020,\N,110,


In [476]:
## Replace missing startYear values and drop endYear
execute(replace_missing_startYear('test_set'))
execute(drop_endYear('test_set'))

## Replace missing runtimeMins values
execute(convert_runtimeMins('test_set'))
execute(calculate_missing_runtimeMins('test_set'))

## Replace missing numVotes values
execute(convert_numVotes('test_set'))
execute(calculate_missing_numVotes('test_set'))

## Dropping the originalTitle column
execute(drop_originalTitle('test_set'))
execute(drop_column_zero('test_set'))
execute(add_external_columns('test_set', 'movielens_data'))
execute(drop_title('merged_test_set'))
execute(drop_original_title('merged_test_set'))
execute(drop_id('merged_test_set'))
execute(drop_release_year('merged_test_set'))

execute(convert_budget('merged_test_set'))
execute(calculate_missing_budget('merged_test_set'))
execute(convert_vote_average('merged_test_set'))
execute(calculate_missing_vote_average('merged_test_set'))
execute(convert_revenue('merged_test_set'))
execute(calculate_missing_revenue_average('merged_test_set'))

Unnamed: 0,Count
0,942


In [477]:
con.execute('''
    SELECT COUNT(*) FROM test_set WHERE primaryTitle IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [478]:
con.execute('''
    SELECT COUNT(*) FROM test_set WHERE startYear = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [479]:
con.execute('''
    SELECT COUNT(*) FROM test_set WHERE runtimeMinutes = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [480]:
con.execute('''
    SELECT COUNT(*) FROM test_set WHERE numVotes IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [481]:
df_test = con.execute('''SELECT * FROM merged_test_set''').fetchdf()
df_test = df_test[:-1]
df_test

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,budget,revenue,vote_average
0,tt0112641,Casino,1995,178,493532,52000000,116112375,8
1,tt0113855,Mortal Kombat,1995,101,115334,18000000,122195920,5
2,tt0114814,The Usual Suspects,1995,106,1038523,6000000,23341568,8
3,tt0114323,Safe,1995,119,14501,23313208,70189184,7
4,tt0103994,Like Water for Chocolate,1992,105,17576,2000000,21665468,7
...,...,...,...,...,...,...,...,...
1081,tt9316022,Struggle: The Life and Lost Art of Szukalski,2018,115,28524,23313208,70189184,6
1082,tt9430698,One Piece: Stampede,2019,101,5109,23313208,70189184,6
1083,tt9441638,The Big Ugly,2020,106,5780,23313208,70189184,6
1084,tt9495690,Págálpánti,2019,149,2331,23313208,70189184,6


In [482]:
X_test = df_test[['startYear', 'runtimeMinutes', 'numVotes', 'budget', 'revenue', 'vote_average']]

# SVM
y_test_preds = clf.predict(X_test)

# Logistic regression model
lr_test_preds = lr_model.predict(X_test)

# Random forest model
rf_test_preds = rf_model.predict(X_test)

# Gradient boosted trees model
gbt_test_preds = gbt_model.predict(X_test)

In [483]:
test_preds = [str(pred) for pred in gbt_test_preds]

In [484]:
pd.DataFrame(gbt_test_preds).to_csv('test_predictions.csv', index=False, header=False)