In [549]:
import pandas as pd
import numpy as np
import duckdb

## DuckDB

### Initializing a DuckDB database

In [550]:
con = duckdb.connect(database=':memory:')

In [551]:
con.execute('''
    CREATE TABLE training_set AS SELECT * FROM read_csv_auto('train-1.csv')
''')

<duckdb.DuckDBPyConnection at 0x24068858330>

In [552]:
con.execute("SELECT * FROM training_set").fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,4,tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True
1,7,tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True
2,9,tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
3,25,tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True
...,...,...,...,...,...,...,...,...,...
958,9955,tt9558612,PM Náréndrá Mớdi,PM Narendra Modi,2019,\N,136,7005.0,False
959,9960,tt9598172,Sáving Léningrád,,2019,\N,96,2200.0,False
960,9977,tt9691136,Shadow in the Cloud,,2020,\N,83,22617.0,False
961,9979,tt9695258,So My Grandma's a Lesbian!,Salir del ropero,2019,\N,94,1054.0,False


In [553]:
con.execute('''
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-2.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-3.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-4.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-5.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-6.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-7.csv');
    INSERT INTO training_set SELECT * FROM read_csv_auto('train-8.csv')
''')

<duckdb.DuckDBPyConnection at 0x24068858330>

In [554]:
con.execute("SELECT * FROM training_set").fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,4,tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True
1,7,tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True
2,9,tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
3,25,tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True
...,...,...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,,2019,\N,87,12951.0,False
7955,9981,tt9741310,Slaxx,Slaxx,2020,\N,77,2464.0,False
7956,9982,tt9742392,Kindred,Kindred,2020,\N,101,1719.0,False
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,,2020,\N,111,4144.0,True


### Checking the number of null values in the columns

In [555]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE primaryTitle IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [556]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE originalTitle IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,3988


In [557]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE startYear = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,786


In [558]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE runtimeMinutes = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,13


In [559]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE numVotes IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,790


## Data Cleaning

In [560]:
def execute(query):
    result = con.execute(query).fetchdf()
    return result

### 1) Replace missing values in startYear column

In [561]:
def replace_missing_startYear(input_name):
    query = '''
        UPDATE ''' + input_name + '''
        SET startYear = endYear
        WHERE startYear = '\\N'
    '''
    return query

In [562]:
execute(replace_missing_startYear('training_set'))

Unnamed: 0,Count
0,786


In [563]:
con.execute("SELECT * FROM training_set WHERE startYear = '\\N'").fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label


### 2) Dropping endYear column

Now that the missing values in startYear have been replaced by the values of endYear, this latter column is no longer necessary. We can drop this column.

In [564]:
def drop_endYear(input_name):
    query = '''
        ALTER TABLE ''' + input_name + '''
        DROP COLUMN endYear
    '''
    return query

In [565]:
execute(drop_endYear('training_set'))

Unnamed: 0,Success


In [566]:
con.execute("SELECT * FROM training_set").fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,numVotes,label
0,4,tt0010600,The Doll,Die Puppe,1919,66,1898.0,True
1,7,tt0011841,Way Down East,Way Down East,1920,145,5376.0,True
2,9,tt0012494,Déstiny,Der müde Tod,1921,97,5842.0,True
3,25,tt0015163,The Navigator,The Navigator,1924,59,9652.0,True
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,93,17887.0,True
...,...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,,2019,87,12951.0,False
7955,9981,tt9741310,Slaxx,Slaxx,2020,77,2464.0,False
7956,9982,tt9742392,Kindred,Kindred,2020,101,1719.0,False
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,,2020,111,4144.0,True


### 3) Replace missing values in runtimeMinutes column

First, we can substitute the null values with 0 and set it to type integer so subsequent computations are easy to do. Then, we use the yearly average to impute the missing values - so we replace by the average of `runtimeMins` for movies belonging to the same year.

In [567]:
def convert_runtimeMins(input_name):
    query = '''
        UPDATE ''' + input_name + ''' SET runtimeMinutes = 0 WHERE runtimeMinutes = '\\N';
        ALTER TABLE ''' + input_name + ''' ALTER COLUMN runtimeMinutes SET DATA TYPE INTEGER;
    '''
    return query

In [568]:
def calculate_missing_runtimeMins(input_name):
    query = '''
        UPDATE ''' + input_name + ''' m1 
        SET runtimeMinutes = (
          SELECT AVG(runtimeMinutes) as yearly_mean 
          FROM ''' + input_name + ''' m2 
          WHERE m1.startYear = m2.startYear AND runtimeMinutes > 0 
          GROUP BY m2.startYear
        )
        WHERE runtimeMinutes = 0;
    '''
    return query

In [569]:
execute(convert_runtimeMins('training_set'))

Unnamed: 0,Success


In [570]:
execute(calculate_missing_runtimeMins('training_set'))

Unnamed: 0,Count
0,13


In [571]:
con.execute('''
    SELECT COUNT(*) FROM training_set WHERE runtimeMinutes = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [572]:
con.execute("SELECT runtimeMinutes FROM training_set WHERE runtimeMinutes = 0").fetchdf()

Unnamed: 0,runtimeMinutes


### 4) Replace missing values in numVotes column

In [573]:
con.execute("SELECT * FROM training_set WHERE numVotes IS NULL").fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,numVotes,label
0,118,tt0023973,Thé Éáglé ánd thé Háwk,,1933,73,,True
1,119,tt0023986,Émplớyéés' Éntráncé,,1933,75,,True
2,163,tt0027478,The Crime of Monsieur Lange,Le crime de Monsieur Lange,1936,80,,True
3,180,tt0028333,Swing Timé,,1936,103,,True
4,466,tt0040626,My Dear Secretary,My Dear Secretary,1948,94,,False
...,...,...,...,...,...,...,...,...
785,9446,tt7134096,The Rhythm Section,The Rhythm Section,2020,109,,False
786,9664,tt8017136,Tớny: My Méntớr thé Sériál Killér,Tony,2018,124,,True
787,9803,tt8671462,Invoking 5,Invoking 5,2018,90,,False
788,9806,tt8694228,Mikhael,,2019,150,,False


#### 4a) Using trimmed mean

Similar to the previous one, we can substitute the null values in `numVotes` with 0 and set it to type integer. Then, we can use a trimmed average of the `numVotes` column, excluding the smallest and largest values so that the mean is less skewed.

In [574]:
def convert_numVotes(input_name):
    query = '''
        UPDATE ''' + input_name + ''' SET numVotes = 0 WHERE numVotes IS NULL;
        ALTER TABLE ''' + input_name + ''' ALTER COLUMN numVotes SET DATA TYPE INTEGER;
    '''
    return query

In [575]:
execute(convert_numVotes('training_set'))

Unnamed: 0,Success


In [410]:
def calculate_missing_numVotes(input_name):
    query = '''
        UPDATE ''' + input_name + ''' m1 
        SET numVotes = (
          SELECT (SUM(numVotes) - MIN(numVotes) - MAX(numVotes)) / CAST(COUNT(*)-2 as FLOAT) as trimmed_mean 
          FROM ''' + input_name + ''' m2 
          WHERE numVotes > 0
        )
        WHERE numVotes = 0;
    '''
    return query

In [423]:
execute(calculate_missing_numVotes('training_set'))

Unnamed: 0,Count
0,790


In [99]:
con.execute("SELECT numVotes FROM training_set WHERE numVotes IS NULL").fetchdf()

Unnamed: 0,numVotes


In [100]:
con.execute("SELECT numVotes FROM training_set WHERE numVotes = 0").fetchdf()

Unnamed: 0,numVotes
0,0
1,0
2,0
3,0
4,0
...,...
785,0
786,0
787,0
788,0


#### 4b) Using a supervised learning algorithm

We can use a supervised ML algorithm to impute the missing values in the numVotes column.

In [576]:
from sklearn.linear_model import LinearRegression

def predict_missing_numVotes(input_name):
    df_train = con.execute('''
        SELECT * FROM ''' + input_name + '''
        WHERE numVotes != 0
    ''').fetchdf()
    X_train = df_train[['startYear', 'runtimeMinutes']]
    y_train = df_train['numVotes']
    
    df_test = con.execute('''
        SELECT * FROM ''' + input_name + '''
        WHERE numVotes = 0
    ''').fetchdf()
    X_test = df_test[['startYear', 'runtimeMinutes']]
    
    # Train an SVM classifier
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions on the test set and evaluate the performance
    y_pred = model.predict(X_test)
    preds = [max(0, int(a)) for a in y_pred]
    
    df = con.execute('''
        SELECT * FROM ''' + input_name + '''
    ''').fetchdf()
    
    df.loc[df['numVotes'] == 0, 'numVotes'] = preds        
    
    con.execute('''
        DROP TABLE IF EXISTS ''' + input_name + '''
    ''')
    con.execute('''
        CREATE TABLE ''' + input_name + ''' AS SELECT * FROM df;
    ''')

In [577]:
predict_missing_numVotes('training_set')

In [578]:
def replace_zeros_numVotes(input_name):
    query = '''
    UPDATE ''' + input_name + ''' 
        SET numVotes = (
          SELECT AVG(numVotes) as mean 
          FROM ''' + input_name + ''' 
        )
    WHERE numVotes = 0;
    '''
    
    return query

In [579]:
execute(replace_new_zeros('training_set'))

Unnamed: 0,Count
0,18


In [580]:
con.execute("SELECT * FROM training_set WHERE numVotes = 0 ").fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,numVotes,label


### 5) Dropping the originalTitle column

Since over 50% of the values in originalTitle are null and the primaryTitle already contains the main information regarding the movie name, we can discard this column from our dataset.

In [581]:
def drop_originalTitle(input_name):
    query = '''
        ALTER TABLE ''' + input_name + '''
        DROP COLUMN originalTitle
    '''
    return query

In [582]:
execute(drop_originalTitle('training_set'))

Unnamed: 0,Success


### 6) Checking duplicate rows

In [583]:
con.execute('''
    SELECT * FROM training_set 
    GROUP BY *
    HAVING COUNT(*) > 1
''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label


In [584]:
con.execute('''
    SELECT tconst, primaryTitle FROM training_set
    GROUP BY tconst, primaryTitle
    HAVING COUNT(*) > 1
''').fetchdf()

Unnamed: 0,tconst,primaryTitle


In [585]:
con.execute('''
    SELECT * FROM training_set 
    WHERE primaryTitle = 'Sabrina'
''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label
0,9650,tt7981492,Sabrina,2018,113,1662,False
1,620,tt0047437,Sabrina,1954,113,62736,True


In [586]:
con.execute('''
    SELECT * FROM training_set
    WHERE primaryTitle IN (
        SELECT primaryTitle FROM training_set
        GROUP BY primaryTitle
        HAVING COUNT(*) > 1
    )
''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label
0,38,tt0016220,The Phantom of the Opera,1925,93,17887,True
1,125,tt0024216,King Kong,1933,100,83177,True
2,239,tt0031647,Midnight,1939,94,4904,True
3,279,tt0033152,The Thief of Bagdad,1940,106,12840,True
4,400,tt0038355,The Big Sleep,1946,114,83357,True
...,...,...,...,...,...,...,...
213,8120,tt3462710,Unforgettable,2017,100,15087,False
214,8392,tt4008758,Black,2015,95,4152,True
215,9651,tt7984766,The King,2019,140,110160,True
216,9699,tt8144778,The Redeemed and the Dominant: Fittest on Earth,2018,119,1347,True


In [587]:
con.execute('''
    SELECT * FROM training_set
    WHERE primaryTitle = 'The Phantom of the Opera'
''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label
0,38,tt0016220,The Phantom of the Opera,1925,93,17887,True
1,3303,tt0119889,The Phantom of the Opera,1998,99,5390,False


In [588]:
con.execute("SELECT * FROM training_set").fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label
0,4,tt0010600,The Doll,1919,66,1898,True
1,7,tt0011841,Way Down East,1920,145,5376,True
2,9,tt0012494,Déstiny,1921,97,5842,True
3,25,tt0015163,The Navigator,1924,59,9652,True
4,38,tt0016220,The Phantom of the Opera,1925,93,17887,True
...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,2019,87,12951,False
7955,9981,tt9741310,Slaxx,2020,77,2464,False
7956,9982,tt9742392,Kindred,2020,101,1719,False
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,2020,111,4144,True


### Adding external data to this dataset

In [589]:
con.execute('''CREATE TABLE movielens_data AS SELECT * FROM read_csv_auto('movielens_data.csv')''')

<duckdb.DuckDBPyConnection at 0x24068858330>

In [590]:
con.execute('''SELECT * FROM movielens_data''').fetchdf()

Unnamed: 0,id,original_title,popularity,title,vote_average,release_year
0,862,Toy Story,21.946943,Toy Story,7.700000,1995
1,8844,Jumanji,17.015539,Jumanji,6.900000,1995
2,15602,Grumpier Old Men,11.712900,Grumpier Old Men,6.500000,1995
3,31357,Waiting to Exhale,3.859495,Waiting to Exhale,6.100000,1995
4,11862,Father of the Bride Part II,8.387519,Father of the Bride Part II,5.700000,1995
...,...,...,...,...,...,...
45284,439050,رگ خواب,0.072051,Subdue,4.000000,1900
45285,111109,Siglo ng Pagluluwal,0.178241,Century of Birthing,9.000000,2011
45286,67758,Betrayal,0.903007,Betrayal,3.800000,2003
45287,227506,Satana likuyushchiy,0.003503,Satan Triumphant,6.750000,1917


In [591]:
def add_external_columns(input_name):
    query = '''
        CREATE TABLE merged_''' + input_name + ''' AS 
        SELECT ''' + input_name + '''.*, movielens_data.popularity, movielens_data.vote_average FROM 
        ''' + input_name + ''' LEFT JOIN movielens_data 
        ON ''' + input_name + '''.primaryTitle = movielens_data.title
        AND ''' + input_name + '''.startYear = movielens_data.release_year;
    '''
    return query

In [592]:
execute(add_external_columns('training_set'))

Unnamed: 0,Count
0,7959


In [593]:
con.execute('''SELECT * FROM merged_training_set''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,popularity,vote_average
0,3030,tt0113277,Heat,1995,170,616475,True,17.924927,7.7
1,3027,tt0113189,GoldenEye,1995,130,250071,True,14.686036,6.6
2,2999,tt0112453,Balto,1995,78,42057,True,12.140733,7.1
3,3073,tt0114388,Sense and Sensibility,1995,136,52412,True,10.673167,7.2
4,3056,tt0113845,Money Train,1995,110,40354,False,7.337906,5.4
...,...,...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,2019,87,12951,False,,
7955,9981,tt9741310,Slaxx,2020,77,2464,False,,
7956,9982,tt9742392,Kindred,2020,101,1719,False,,
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,2020,111,4144,True,,


### Impute missing values in popularity column

Using same technique as before, use a trimmed average of popularity column as a whole, excluding smallest & largest values.

In [594]:
con.execute('''
    SELECT COUNT(*) FROM merged_training_set WHERE popularity IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,4011


In [595]:
def convert_popularity(input_name):
    query = '''
        UPDATE ''' + input_name + ''' SET popularity = 0 WHERE popularity IS NULL;
        ALTER TABLE ''' + input_name + ''' ALTER COLUMN popularity SET DATA TYPE FLOAT;
    '''
    return query

In [596]:
def calculate_missing_popularity(input_name):
    query = '''
        UPDATE ''' + input_name + ''' m1 
        SET popularity = (
          SELECT (SUM(popularity) - MIN(popularity) - MAX(popularity)) / CAST(COUNT(*)-2 as FLOAT) as trimmed_mean 
          FROM ''' + input_name + ''' m2 
          WHERE popularity > 0
        )
        WHERE popularity = 0;
    '''
    return query

In [597]:
execute(convert_popularity('merged_training_set'))

Unnamed: 0,Success


In [598]:
execute(calculate_missing_popularity('merged_training_set'))

Unnamed: 0,Count
0,4011


In [599]:
con.execute('''SELECT * FROM merged_training_set WHERE popularity IS NULL''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,popularity,vote_average


In [600]:
con.execute('''SELECT * FROM merged_training_set WHERE popularity = 0''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,popularity,vote_average


In [601]:
con.execute('''SELECT * FROM merged_training_set''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,popularity,vote_average
0,3030,tt0113277,Heat,1995,170,616475,True,17.924927,7.7
1,3027,tt0113189,GoldenEye,1995,130,250071,True,14.686036,6.6
2,2999,tt0112453,Balto,1995,78,42057,True,12.140733,7.1
3,3073,tt0114388,Sense and Sensibility,1995,136,52412,True,10.673167,7.2
4,3056,tt0113845,Money Train,1995,110,40354,False,7.337906,5.4
...,...,...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,2019,87,12951,False,4.837576,
7955,9981,tt9741310,Slaxx,2020,77,2464,False,4.837576,
7956,9982,tt9742392,Kindred,2020,101,1719,False,4.837576,
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,2020,111,4144,True,4.837576,


### Impute missing values in vote_average column

In [602]:
con.execute('''
    SELECT COUNT(*) FROM merged_training_set WHERE vote_average IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,4011


In [603]:
def convert_vote_average(input_name):
    query = '''
        UPDATE ''' + input_name + ''' SET vote_average = 0 WHERE vote_average IS NULL;
        ALTER TABLE ''' + input_name + ''' ALTER COLUMN vote_average SET DATA TYPE FLOAT;
    '''
    return query

In [604]:
execute(convert_vote_average('merged_training_set'))

Unnamed: 0,Success


In [467]:
def calculate_missing_vote_average(input_name):
    query = '''
        UPDATE ''' + input_name + ''' 
        SET vote_average = (
          SELECT MEDIAN(vote_average) as median 
          FROM ''' + input_name + ''' 
          WHERE vote_average > 0
        )
        WHERE vote_average = 0;
    '''
    return query

In [468]:
execute(calculate_missing_vote_average('merged_training_set'))

Unnamed: 0,Count
0,4011


In [469]:
con.execute('''SELECT * FROM merged_training_set WHERE vote_average = 0''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,popularity,vote_average


In [470]:
con.execute('''SELECT * FROM merged_training_set''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,popularity,vote_average
0,3030,tt0113277,Heat,1995,170,616475,True,17.924927,7.7
1,3027,tt0113189,GoldenEye,1995,130,250071,True,14.686036,6.6
2,2999,tt0112453,Balto,1995,78,42057,True,12.140733,7.1
3,3073,tt0114388,Sense and Sensibility,1995,136,52412,True,10.673167,7.2
4,3056,tt0113845,Money Train,1995,110,40354,False,7.337906,5.4
...,...,...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,2019,87,12951,False,4.837576,6.3
7955,9981,tt9741310,Slaxx,2020,77,2464,False,4.837576,6.3
7956,9982,tt9742392,Kindred,2020,101,1719,False,4.837576,6.3
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,2020,111,4144,True,4.837576,6.3


### Using supervised learning for vote_average imputation

In [605]:
from sklearn.linear_model import LinearRegression

def predict_missing_vote_average(input_name):
    df_train = con.execute('''
        SELECT * FROM ''' + input_name + '''
        WHERE vote_average != 0
    ''').fetchdf()
    X_train = df_train[['startYear', 'runtimeMinutes', 'numVotes']]
    y_train = df_train['vote_average']
    
    df_test = con.execute('''
        SELECT * FROM ''' + input_name + '''
        WHERE vote_average = 0
    ''').fetchdf()
    X_test = df_test[['startYear', 'runtimeMinutes', 'numVotes']]
    
    # Train an SVM classifier
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions on the test set and evaluate the performance
    y_pred = model.predict(X_test)
    preds = [max(0, int(a)) for a in y_pred]
    
    df = con.execute('''
        SELECT * FROM ''' + input_name + '''
    ''').fetchdf()
    
    df.loc[df['vote_average'] == 0, 'vote_average'] = preds        
    
    con.execute('''
        DROP TABLE IF EXISTS ''' + input_name + '''
    ''')
    con.execute('''
        CREATE TABLE ''' + input_name + ''' AS SELECT * FROM df;
    ''')

In [606]:
predict_missing_vote_average('merged_training_set')

In [607]:
def replace_zeros_vote_average(input_name):
    query = '''
    UPDATE ''' + input_name + ''' 
        SET vote_average = (
          SELECT AVG(vote_average) as mean 
          FROM ''' + input_name + ''' 
        )
    WHERE vote_average = 0;
    '''
    
    return query

In [608]:
execute(replace_zeros_vote_average('merged_training_set'))

Unnamed: 0,Count
0,0


In [609]:
con.execute('''SELECT * FROM merged_training_set WHERE vote_average IS NULL''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,popularity,vote_average


In [610]:
con.execute('''SELECT * FROM merged_training_set WHERE vote_average = 0''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,popularity,vote_average


In [611]:
con.execute('''SELECT * FROM merged_training_set''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,popularity,vote_average
0,3030,tt0113277,Heat,1995,170,616475,True,17.924927,7.7
1,3027,tt0113189,GoldenEye,1995,130,250071,True,14.686036,6.6
2,2999,tt0112453,Balto,1995,78,42057,True,12.140733,7.1
3,3073,tt0114388,Sense and Sensibility,1995,136,52412,True,10.673167,7.2
4,3056,tt0113845,Money Train,1995,110,40354,False,7.337906,5.4
...,...,...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,2019,87,12951,False,4.837576,5.0
7955,9981,tt9741310,Slaxx,2020,77,2464,False,4.837576,5.0
7956,9982,tt9742392,Kindred,2020,101,1719,False,4.837576,5.0
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,2020,111,4144,True,4.837576,5.0


## Model training

In [None]:
def reindex_data(training_set, merged_training_set):
    df_1 = con.execute('''SELECT * FROM ''' + training_set + '''''').fetchdf()
    df_1 = df_1.set_index('column0')
    df_2 = con.execute('''SELECT * FROM merged_''' + training_set + '''''').fetchdf()
    df_2 = df_2.set_index('column0')
    df_2 = df_2.reindex(df_1.index)
    return df_2

In [612]:
df_1 = con.execute('''SELECT * FROM training_set''').fetchdf()
df_1 = df_1.set_index('column0')
df_1

Unnamed: 0_level_0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label
column0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,tt0010600,The Doll,1919,66,1898,True
7,tt0011841,Way Down East,1920,145,5376,True
9,tt0012494,Déstiny,1921,97,5842,True
25,tt0015163,The Navigator,1924,59,9652,True
38,tt0016220,The Phantom of the Opera,1925,93,17887,True
...,...,...,...,...,...,...
9966,tt9625664,Trauma Center,2019,87,12951,False
9981,tt9741310,Slaxx,2020,77,2464,False
9982,tt9742392,Kindred,2020,101,1719,False
9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,2020,111,4144,True


In [613]:
df_2 = con.execute('''SELECT * FROM merged_training_set''').fetchdf()
df_2 = df_2.set_index('column0')
df_2

Unnamed: 0_level_0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,popularity,vote_average
column0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3030,tt0113277,Heat,1995,170,616475,True,17.924927,7.7
3027,tt0113189,GoldenEye,1995,130,250071,True,14.686036,6.6
2999,tt0112453,Balto,1995,78,42057,True,12.140733,7.1
3073,tt0114388,Sense and Sensibility,1995,136,52412,True,10.673167,7.2
3056,tt0113845,Money Train,1995,110,40354,False,7.337906,5.4
...,...,...,...,...,...,...,...,...
9966,tt9625664,Trauma Center,2019,87,12951,False,4.837576,5.0
9981,tt9741310,Slaxx,2020,77,2464,False,4.837576,5.0
9982,tt9742392,Kindred,2020,101,1719,False,4.837576,5.0
9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,2020,111,4144,True,4.837576,5.0


In [614]:
df_2 = df_2.reindex(df_1.index)
df_2

Unnamed: 0_level_0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,popularity,vote_average
column0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4,tt0010600,The Doll,1919,66,1898,True,4.837576,6.0
7,tt0011841,Way Down East,1920,145,5376,True,1.649987,6.2
9,tt0012494,Déstiny,1921,97,5842,True,4.837576,6.0
25,tt0015163,The Navigator,1924,59,9652,True,3.031803,7.3
38,tt0016220,The Phantom of the Opera,1925,93,17887,True,11.083684,7.0
...,...,...,...,...,...,...,...,...
9966,tt9625664,Trauma Center,2019,87,12951,False,4.837576,5.0
9981,tt9741310,Slaxx,2020,77,2464,False,4.837576,5.0
9982,tt9742392,Kindred,2020,101,1719,False,4.837576,5.0
9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,2020,111,4144,True,4.837576,5.0


In [615]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and test sets
X = df_2[['startYear', 'runtimeMinutes', 'numVotes', 'popularity', 'vote_average']]
y = df_2['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an SVM classifier
clf = SVC()
clf.fit(X_train, y_train)

# Make predictions on the test set and evaluate the performance
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Test set accuracy: {accuracy:.2f}')

Test set accuracy: 0.57


In [616]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Split data into features and labels
X = df_2[['startYear', 'runtimeMinutes', 'numVotes', 'popularity', 'vote_average']]
y = df_2['label']

# Train-test split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(train_X, train_y)
lr_pred = lr_model.predict(test_X)
lr_acc = accuracy_score(test_y, lr_pred)
print("Logistic Regression Accuracy: {:.4f}".format(lr_acc))

# Train random forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_X, train_y)
rf_pred = rf_model.predict(test_X)
rf_acc = accuracy_score(test_y, rf_pred)
print("Random Forest Accuracy: {:.4f}".format(rf_acc))

# Train gradient boosted trees model
gbt_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbt_model.fit(train_X, train_y)
gbt_pred = gbt_model.predict(test_X)
gbt_acc = accuracy_score(test_y, gbt_pred)
print("Gradient Boosted Trees Accuracy: {:.4f}".format(gbt_acc))

Logistic Regression Accuracy: 0.6840
Random Forest Accuracy: 0.8046
Gradient Boosted Trees Accuracy: 0.8310


## Validation set

In [617]:
con.execute('''
    CREATE TABLE validation_set AS SELECT * FROM read_csv_auto('validation_hidden.csv')
''')

<duckdb.DuckDBPyConnection at 0x24068858330>

In [618]:
con.execute('''SELECT * FROM validation_set''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes
0,0,tt0003740,Cabiria,,1914,\N,148,3452.0
1,1,tt0008663,A Man There Was,Terje Vigen,1917,\N,65,1882.0
2,3,tt0010307,J'accuse!,,1919,\N,166,1692.0
3,18,tt0014429,Safety Last!,Safety Last!,1923,\N,74,19898.0
4,27,tt0015175,Die Nibelungen: Siegfried,,1924,\N,143,5676.0
...,...,...,...,...,...,...,...,...
950,9974,tt9686154,You Will Die at 20,,2019,\N,103,2106.0
951,9976,tt9690328,Pápér Spidérs,Paper Spiders,2020,\N,109,
952,9980,tt9735790,Me You Madness,Me You Madness,2021,\N,98,1056.0
953,9984,tt9769668,Tughlaq Durbar,Tughlaq Durbar,2021,\N,145,1430.0


In [619]:
## Replace missing startYear values and drop endYear
execute(replace_missing_startYear('validation_set'))
execute(drop_endYear('validation_set'))

## Replace missing runtimeMins values
execute(convert_runtimeMins('validation_set'))
execute(calculate_missing_runtimeMins('validation_set'))

## Replace missing numVotes values
execute(convert_numVotes('validation_set'))
# execute(calculate_missing_numVotes('validation_set'))
predict_missing_numVotes('validation_set')
execute(replace_zeros_numVotes('validation_set'))

## Dropping the originalTitle column
execute(drop_originalTitle('validation_set'))

execute(add_external_columns('validation_set'))

execute(convert_popularity('merged_validation_set'))
execute(calculate_missing_popularity('merged_validation_set'))
execute(convert_vote_average('merged_validation_set'))
# execute(calculate_missing_vote_average('merged_validation_set'))
predict_missing_vote_average('merged_validation_set')
execute(replace_zeros_vote_average('merged_validation_set'))

Unnamed: 0,Count
0,0


In [620]:
con.execute('''
    SELECT COUNT(*) FROM validation_set WHERE primaryTitle IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [621]:
con.execute('''
    SELECT COUNT(*) FROM validation_set WHERE startYear = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [622]:
con.execute('''
    SELECT COUNT(*) FROM validation_set WHERE runtimeMinutes = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [623]:
con.execute('''
    SELECT COUNT(*) FROM validation_set WHERE numVotes IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,0


### Predictions on validation set

In [624]:
df_1 = con.execute('''SELECT * FROM validation_set''').fetchdf()
df_1 = df_1.set_index('column0')
df_1

Unnamed: 0_level_0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes
column0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,tt0003740,Cabiria,1914,148,3452
1,tt0008663,A Man There Was,1917,65,1882
3,tt0010307,J'accuse!,1919,166,1692
18,tt0014429,Safety Last!,1923,74,19898
27,tt0015175,Die Nibelungen: Siegfried,1924,143,5676
...,...,...,...,...,...
9974,tt9686154,You Will Die at 20,2019,103,2106
9976,tt9690328,Pápér Spidérs,2020,109,31974
9980,tt9735790,Me You Madness,2021,98,1056
9984,tt9769668,Tughlaq Durbar,2021,145,1430


In [625]:
df_2 = con.execute('''SELECT * FROM merged_validation_set''').fetchdf()
df_2 = df_2.set_index('column0')
df_2

Unnamed: 0_level_0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,popularity,vote_average
column0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3112,tt0115683,Bio-Dome,1996,88,27132,6.387469,4.4
3166,tt0116839,Lawnmower Man 2: Beyond Cyberspace,1996,93,9451,2.495350,2.8
3028,tt0113247,La Haine,1995,98,162601,12.108196,7.9
3000,tt0112461,The Basketball Diaries,1995,102,107119,7.415144,7.2
2996,tt0112427,An Awfully Big Adventure,1995,112,2765,0.922958,6.0
...,...,...,...,...,...,...,...
9974,tt9686154,You Will Die at 20,2019,103,2106,4.691750,5.0
9976,tt9690328,Pápér Spidérs,2020,109,31974,4.691750,5.0
9980,tt9735790,Me You Madness,2021,98,1056,4.691750,5.0
9984,tt9769668,Tughlaq Durbar,2021,145,1430,4.691750,6.0


In [626]:
df_2 = df_2.reindex(df_1.index)
df_2

Unnamed: 0_level_0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,popularity,vote_average
column0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,tt0003740,Cabiria,1914,148,3452,0.704028,6.8
1,tt0008663,A Man There Was,1917,65,1882,1.322761,6.6
3,tt0010307,J'accuse!,1919,166,1692,0.834679,7.9
18,tt0014429,Safety Last!,1923,74,19898,4.394823,7.7
27,tt0015175,Die Nibelungen: Siegfried,1924,143,5676,1.827403,7.7
...,...,...,...,...,...,...,...
9974,tt9686154,You Will Die at 20,2019,103,2106,4.691750,5.0
9976,tt9690328,Pápér Spidérs,2020,109,31974,4.691750,5.0
9980,tt9735790,Me You Madness,2021,98,1056,4.691750,5.0
9984,tt9769668,Tughlaq Durbar,2021,145,1430,4.691750,6.0


In [627]:
X_val = df_2[['startYear', 'runtimeMinutes', 'numVotes', 'popularity', 'vote_average']]

# SVM
y_val_preds = clf.predict(X_val)

# Logistic regression model
lr_val_preds = lr_model.predict(X_val)

# Random forest model
rf_val_preds = rf_model.predict(X_val)

# Gradient boosted trees model
gbt_val_preds = gbt_model.predict(X_val)

In [628]:
val_preds = [str(pred) for pred in gbt_val_preds]

In [629]:
pd.DataFrame(val_preds).to_csv('val_predictions.csv', index=False, header=False)

## Test set

In [630]:
con.execute('''
    CREATE TABLE test_set AS SELECT * FROM read_csv_auto('test_hidden.csv')
''')

<duckdb.DuckDBPyConnection at 0x24068858330>

In [631]:
con.execute('''SELECT * FROM test_set''').fetchdf()

Unnamed: 0,column0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes
0,22,tt0014972,He Who Gets Slapped,He Who Gets Slapped,1924,\N,95,3654.0
1,23,tt0015016,The Iron Horse,,1924,\N,150,2136.0
2,26,tt0015174,Die Nibelungen: Kriemhild's Revenge,,1924,\N,129,4341.0
3,28,tt0015214,At 3:25,,\N,1925,59,1724.0
4,34,tt0015863,Go West,,1925,\N,69,4188.0
...,...,...,...,...,...,...,...,...
1081,9942,tt9430698,One Piece: Stampede,,2019,\N,101,5109.0
1082,9943,tt9441638,The Big Ugly,,2020,\N,106,5780.0
1083,9948,tt9495690,Págálpánti,Pagalpanti,2019,\N,149,2331.0
1084,9950,tt9519642,The Wedding Unplanner,,2020,\N,110,


In [632]:
## Replace missing startYear values and drop endYear
execute(replace_missing_startYear('test_set'))
execute(drop_endYear('test_set'))

## Replace missing runtimeMins values
execute(convert_runtimeMins('test_set'))
execute(calculate_missing_runtimeMins('test_set'))

## Replace missing numVotes values
execute(convert_numVotes('test_set'))
# execute(calculate_missing_numVotes('validation_set'))
predict_missing_numVotes('test_set')
execute(replace_zeros_numVotes('test_set'))

## Dropping the originalTitle column
execute(drop_originalTitle('test_set'))

execute(add_external_columns('test_set'))

execute(convert_popularity('merged_test_set'))
execute(calculate_missing_popularity('merged_test_set'))
execute(convert_vote_average('merged_test_set'))
# execute(calculate_missing_vote_average('merged_validation_set'))
predict_missing_vote_average('merged_test_set')
execute(replace_zeros_vote_average('merged_test_set'))

Unnamed: 0,Count
0,0


In [633]:
con.execute('''
    SELECT COUNT(*) FROM test_set WHERE primaryTitle IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [634]:
con.execute('''
    SELECT COUNT(*) FROM test_set WHERE startYear = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [635]:
con.execute('''
    SELECT COUNT(*) FROM test_set WHERE runtimeMinutes = '\\N'
''').fetchdf()

Unnamed: 0,count_star()
0,0


In [636]:
con.execute('''
    SELECT COUNT(*) FROM test_set WHERE numVotes IS NULL
''').fetchdf()

Unnamed: 0,count_star()
0,0


### Predictions on test set

In [637]:
df_1 = con.execute('''SELECT * FROM test_set''').fetchdf()
df_1 = df_1.set_index('column0')
df_1

Unnamed: 0_level_0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes
column0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
22,tt0014972,He Who Gets Slapped,1924,95,3654
23,tt0015016,The Iron Horse,1924,150,2136
26,tt0015174,Die Nibelungen: Kriemhild's Revenge,1924,129,4341
28,tt0015214,At 3:25,1925,59,1724
34,tt0015863,Go West,1925,69,4188
...,...,...,...,...,...
9942,tt9430698,One Piece: Stampede,2019,101,5109
9943,tt9441638,The Big Ugly,2020,106,5780
9948,tt9495690,Págálpánti,2019,149,2331
9950,tt9519642,The Wedding Unplanner,2020,110,35085


In [638]:
df_2 = con.execute('''SELECT * FROM merged_test_set''').fetchdf()
df_2 = df_2.set_index('column0')
df_2

Unnamed: 0_level_0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,popularity,vote_average
column0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3011,tt0112641,Casino,1995,178,493532,10.137389,7.8
3058,tt0113855,Mortal Kombat,1995,101,115334,10.870138,5.4
3091,tt0114814,The Usual Suspects,1995,106,1038523,16.302465,8.1
3072,tt0114323,Safe,1995,119,14501,8.180047,7.2
2713,tt0103994,Like Water for Chocolate,1992,105,17576,4.126083,6.6
...,...,...,...,...,...,...,...
9942,tt9430698,One Piece: Stampede,2019,101,5109,4.663778,5.0
9943,tt9441638,The Big Ugly,2020,106,5780,4.663778,5.0
9948,tt9495690,Págálpánti,2019,149,2331,4.663778,6.0
9950,tt9519642,The Wedding Unplanner,2020,110,35085,4.663778,5.0


In [639]:
df_2 = df_2.reindex(df_1.index)
df_2

Unnamed: 0_level_0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,popularity,vote_average
column0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
22,tt0014972,He Who Gets Slapped,1924,95,3654,0.719924,7.2
23,tt0015016,The Iron Horse,1924,150,2136,0.990037,7.1
26,tt0015174,Die Nibelungen: Kriemhild's Revenge,1924,129,4341,0.828949,7.5
28,tt0015214,At 3:25,1925,59,1724,4.663778,6.0
34,tt0015863,Go West,1925,69,4188,1.014248,6.8
...,...,...,...,...,...,...,...
9942,tt9430698,One Piece: Stampede,2019,101,5109,4.663778,5.0
9943,tt9441638,The Big Ugly,2020,106,5780,4.663778,5.0
9948,tt9495690,Págálpánti,2019,149,2331,4.663778,6.0
9950,tt9519642,The Wedding Unplanner,2020,110,35085,4.663778,5.0


In [640]:
X_test = df_2[['startYear', 'runtimeMinutes', 'numVotes', 'popularity', 'vote_average']]

# SVM
y_test_preds = clf.predict(X_test)

# Logistic regression model
lr_test_preds = lr_model.predict(X_test)

# Random forest model
rf_test_preds = rf_model.predict(X_test)

# Gradient boosted trees model
gbt_test_preds = gbt_model.predict(X_test)

In [641]:
test_preds = [str(pred) for pred in gbt_test_preds]

In [642]:
pd.DataFrame(gbt_test_preds).to_csv('test_predictions.csv', index=False, header=False)