In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import duckdb

In [2]:
# Load all train data (train-1.csv to train-8.csv)
train_files = [
    "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/train-1.csv",
    "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/train-2.csv",
    "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/train-3.csv",
    "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/train-4.csv",
    "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/train-5.csv",
    "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/train-6.csv",
    "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/train-7.csv",
    "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/train-8.csv"
]

df_train = pd.concat([pd.read_csv(file) for file in train_files], ignore_index=True)

# Load `directing.json` and `writing.json`
df_directors = pd.read_json("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/directing.json")
df_writers = pd.read_json("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/writing.json")

# Rename 'movie' to 'tconst' for merging
df_directors.rename(columns={"movie": "tconst", "director": "director_id"}, inplace=True)
df_writers.rename(columns={"movie": "tconst", "writer": "writer_id"}, inplace=True)

# Merge training data with directing & writing information
con = duckdb.connect()
con.register("movies", df_train)
con.register("directors", df_directors)
con.register("writers", df_writers)

query = """
SELECT 
    movies.*, 
    directors.director_id, 
    writers.writer_id
FROM movies
LEFT JOIN directors ON movies.tconst = directors.tconst
LEFT JOIN writers ON movies.tconst = writers.tconst
"""

df_train = con.execute(query).fetchdf()
con.close()

# Fill missing values
df_train["director_id"].fillna("unknown", inplace=True)
df_train["writer_id"].fillna("unknown", inplace=True)

# Convert numerical columns
numeric_columns = ["startYear", "runtimeMinutes", "numVotes"]
df_train[numeric_columns] = df_train[numeric_columns].apply(pd.to_numeric, errors="coerce")
df_train[numeric_columns] = df_train[numeric_columns].fillna(df_train[numeric_columns].median())

# Save cleaned training data
df_train.to_csv("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/final_training_data.csv", index=False)

print("✅ Final training dataset prepared!")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["director_id"].fillna("unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["writer_id"].fillna("unknown", inplace=True)


✅ Final training dataset prepared!


In [3]:
# Load validation and test data
val_path = "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/validation_hidden.csv"
test_path = "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/test_hidden.csv"

df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)

# Merge with directing & writing information using DuckDB
con = duckdb.connect()
con.register("validation", df_val)
con.register("test", df_test)
con.register("directors", df_directors)
con.register("writers", df_writers)

query = """
SELECT 
    validation.*, 
    directors.director_id, 
    writers.writer_id
FROM validation
LEFT JOIN directors ON validation.tconst = directors.tconst
LEFT JOIN writers ON validation.tconst = writers.tconst
"""

df_val = con.execute(query).fetchdf()

query = """
SELECT 
    test.*, 
    directors.director_id, 
    writers.writer_id
FROM test
LEFT JOIN directors ON test.tconst = directors.tconst
LEFT JOIN writers ON test.tconst = writers.tconst
"""

df_test = con.execute(query).fetchdf()
con.close()

# Handle missing values
df_val["director_id"].fillna("unknown", inplace=True)
df_val["writer_id"].fillna("unknown", inplace=True)
df_test["director_id"].fillna("unknown", inplace=True)
df_test["writer_id"].fillna("unknown", inplace=True)

df_val[numeric_columns] = df_val[numeric_columns].apply(pd.to_numeric, errors="coerce").fillna(df_train[numeric_columns].median())
df_test[numeric_columns] = df_test[numeric_columns].apply(pd.to_numeric, errors="coerce").fillna(df_train[numeric_columns].median())

print("✅ Validation and Test datasets prepared!")


✅ Validation and Test datasets prepared!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_val["director_id"].fillna("unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_val["writer_id"].fillna("unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

In [6]:
# Select features
features = ["startYear", "runtimeMinutes", "numVotes", "director_id", "writer_id"]
X_train = df_train[features]
y_train = df_train["label"]

X_val = df_val[features]
X_test = df_test[features]

# Preprocessing pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_features = ["startYear", "runtimeMinutes", "numVotes"]
categorical_features = ["director_id", "writer_id"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# Train the final model
model = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

print("🔹 Training the final model on all training data...")
model.fit(X_train, y_train)

# Make predictions
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Save predictions in required format
df_val["label"] = y_val_pred
df_test["label"] = y_test_pred

df_val["label"].to_csv("validation_predictions.csv", header=False, index=False)
df_test["label"].to_csv("test_predictions.csv", header=False, index=False)

print("✅ Predictions saved! You can now submit 'validation_predictions.csv' and 'test_predictions.csv'.")

🔹 Training the final model on all training data...
✅ Predictions saved! You can now submit 'validation_predictions.csv' and 'test_predictions.csv'.
