In [32]:
import duckdb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import json

In [33]:
# Connect to an in-memory DuckDB instance
con = duckdb.connect(database=':memory:')

In [34]:
# Load CSV files
train_df = con.execute("SELECT * FROM 'imdb/train-*.csv'").fetchdf()
directors_df = con.execute("SELECT * FROM 'imdb/directing.json'").fetchdf()
writers_df = con.execute("SELECT * FROM 'imdb/writing.json'").fetchdf()

# Check data
print(train_df.head())

   column0     tconst              primaryTitle             originalTitle  \
0        4  tt0010600                  The Doll                 Die Puppe   
1        7  tt0011841             Way Down East             Way Down East   
2        9  tt0012494                   Déstiny              Der müde Tod   
3       25  tt0015163             The Navigator             The Navigator   
4       38  tt0016220  The Phantom of the Opera  The Phantom of the Opera   

  startYear endYear runtimeMinutes  numVotes  label  
0      1919      \N             66    1898.0   True  
1      1920      \N            145    5376.0   True  
2      1921      \N             97    5842.0   True  
3      1924      \N             59    9652.0   True  
4      1925      \N             93   17887.0   True  


In [35]:
# Convert JSON-like strings to actual dictionaries
def safe_json_loads(x):
	try:
		return json.loads(x) if isinstance(x, str) and x else x
	except json.JSONDecodeError:
		return None

directors_df['movie'] = directors_df['movie'].apply(safe_json_loads)
directors_df['director'] = directors_df['director'].apply(safe_json_loads)

writers_df['movie'] = writers_df['movie'].apply(safe_json_loads)
writers_df['writer'] = writers_df['writer'].apply(safe_json_loads)

# Explode dictionaries into separate rows
directors_expanded = directors_df.explode('movie').explode('director')
directors_expanded.rename(columns={'movie': 'tconst', 'director': 'director_id'}, inplace=True)

writers_expanded = writers_df.explode('movie').explode('writer')
writers_expanded.rename(columns={'movie': 'tconst', 'writer': 'writer_id'}, inplace=True)

# Check the cleaned data
print(directors_expanded.head())
print(writers_expanded.head())

  tconst director_id
0      0           0
0      0           1
0      0           2
0      0           3
0      0           4
  tconst writer_id
0   None      None
1   None      None
2   None      None
3   None      None
4   None      None


In [36]:
# Create base tables in DuckDB from pandas DataFrames
con.execute("CREATE TABLE train_df AS SELECT * FROM train_df")
con.execute("CREATE TABLE directors_expanded AS SELECT * FROM directors_expanded")
con.execute("CREATE TABLE writers_expanded AS SELECT * FROM writers_expanded")

# Create indexes for faster lookup
con.execute("CREATE INDEX train_df_tconst_idx ON train_df (tconst);")
con.execute("CREATE INDEX directors_expanded_tconst_idx ON directors_expanded (tconst);")
con.execute("CREATE INDEX writers_expanded_tconst_idx ON writers_expanded (tconst);")

# Join tables
movies_df = con.execute("""
    SELECT t.*, d.director_id, w.writer_id
    FROM train_df t
    LEFT JOIN directors_expanded d ON t.tconst = d.tconst
    LEFT JOIN writers_expanded w ON t.tconst = w.tconst
""").fetchdf()

In [37]:
# Check the final dataset
print(movies_df.head())

   column0     tconst               primaryTitle     originalTitle startYear  \
0       25  tt0015163              The Navigator     The Navigator      1924   
1      746  tt0051507  Curse of the Faceless Man              None        \N   
2      766  tt0052151        Run Silent Run Deep              None      1958   
3      815  tt0053242                   Sápphiré              None      1959   
4      989  tt0057530           A Stitch in Time  A Stitch in Time      1963   

  endYear runtimeMinutes  numVotes  label director_id  writer_id  
0      \N             59    9652.0   True        None       <NA>  
1    1958             67    1035.0  False        None       <NA>  
2      \N             93   11169.0   True        None       <NA>  
3      \N             92    1730.0   True        None       <NA>  
4      \N             89    1122.0   True        None       <NA>  


In [38]:
# Convert startYear to integer and then to movie age
movies_df['startYear'] = pd.to_numeric(movies_df['startYear'], errors='coerce')
movies_df['endYear'] = pd.to_numeric(movies_df['endYear'], errors='coerce')

# If startYear is empty or NaN, fill it with endYear
movies_df['startYear'].fillna(movies_df['endYear'], inplace=True)

# If primaryTitle is empty or NaN, fill it with originalTitle
movies_df['primaryTitle'].fillna(movies_df['originalTitle'], inplace=True)

# Remove the columns originalTitle and endYear
movies_df.drop(columns=['originalTitle', 'endYear'], inplace=True)

movies_df['movie_age'] = 2025 - movies_df['startYear']

# Log-transform numVotes
movies_df['log_numVotes'] = np.log1p(movies_df['numVotes'])

# Categorize runtime
movies_df['runtime_category'] = pd.cut(
    pd.to_numeric(movies_df['runtimeMinutes'], errors='coerce'),
    bins=[0, 60, 90, 120, 180, np.inf],
    labels=['short', 'medium', 'long', 'very long', 'epic']
)

movies_df.to_csv("cleaned_imdb_data.csv", index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_df['startYear'].fillna(movies_df['endYear'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_df['primaryTitle'].fillna(movies_df['originalTitle'], inplace=True)


In [39]:
# Load data
df = pd.read_csv("cleaned_imdb_data.csv")

# Prepare features and labels
X = df[['movie_age', 'log_numVotes']]  # Add more features as needed
y = df['label']

In [40]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5910804020100503


In [41]:
# Load test data
test_df = con.execute("SELECT * FROM 'imdb/test_hidden.csv'").fetchdf()

# Convert startYear to numeric
test_df['startYear'] = pd.to_numeric(test_df['startYear'], errors='coerce')

# Apply same feature transformations
test_df['movie_age'] = 2025 - test_df['startYear']
test_df['log_numVotes'] = np.log1p(test_df['numVotes'])

# Predict
X_test_hidden = test_df[['movie_age', 'log_numVotes']]
test_df['predicted_label'] = clf.predict(X_test_hidden)

In [42]:
# Load validation data
validation_df = con.execute("SELECT * FROM 'imdb/validation_hidden.csv'").fetchdf()

# Convert startYear to numeric
validation_df['startYear'] = pd.to_numeric(validation_df['startYear'], errors='coerce')

# Apply same feature transformations
validation_df['movie_age'] = 2025 - validation_df['startYear']
validation_df['log_numVotes'] = np.log1p(validation_df['numVotes'])

# Predict
X_validation_hidden = validation_df[['movie_age', 'log_numVotes']]
validation_df['predicted_label'] = clf.predict(X_validation_hidden)

In [43]:
# Save for submission
test_df[['predicted_label']].to_csv("submission_test.csv", index=False, header=False)

In [44]:
# Save for submission
validation_df[['predicted_label']].to_csv("submission_validation.csv", index=False, header=False)