In [1]:
import pandas as pd
import numpy as np
import glob
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
import duckdb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [2]:
# Path to the dataset folder
path = "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/"

# Get all train files (train-1.csv to train-8.csv)
file_paths = glob.glob(path + "train-*.csv")

# Load all CSVs into a single DataFrame
df_movies = pd.concat([pd.read_csv(file) for file in file_paths], ignore_index=True)

# Check the structure
print("✅ Combined Dataset Loaded Successfully!")
print("Shape of Combined Dataset:", df_movies.shape)
print("Columns:\n", df_movies.columns)

# Display missing values count
print("\nMissing Values:\n", df_movies.isnull().sum())

# Show first few rows
df_movies.head()


✅ Combined Dataset Loaded Successfully!
Shape of Combined Dataset: (7959, 9)
Columns:
 Index(['Unnamed: 0', 'tconst', 'primaryTitle', 'originalTitle', 'startYear',
       'endYear', 'runtimeMinutes', 'numVotes', 'label'],
      dtype='object')

Missing Values:
 Unnamed: 0           0
tconst               0
primaryTitle         0
originalTitle     3988
startYear            0
endYear              0
runtimeMinutes       0
numVotes           790
label                0
dtype: int64


Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,4,tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True
1,7,tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True
2,9,tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
3,25,tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True


In [3]:
# Path to JSON files
directing_path = "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/directing.json"
writing_path = "C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/writing.json"

# Load JSON files into DuckDB
con = duckdb.connect()
df_directors = con.execute(f"SELECT * FROM read_json_auto('{directing_path}')").fetchdf()
df_writers = con.execute(f"SELECT * FROM read_json_auto('{writing_path}')").fetchdf()

# Check structure
print("✅ Directors JSON Loaded Successfully!")
print("Columns:", df_directors.columns)
print(df_directors.head(), "\n")

print("✅ Writers JSON Loaded Successfully!")
print("Columns:", df_writers.columns)
print(df_writers.head())

# Check for missing values
print("\nMissing Values in Directors JSON:\n", df_directors.isnull().sum())
print("\nMissing Values in Writers JSON:\n", df_writers.isnull().sum())


✅ Directors JSON Loaded Successfully!
Columns: Index(['movie', 'director'], dtype='object')
                                               movie  \
0  {'0': 'tt0003740', '1': 'tt0008663', '2': 'tt0...   

                                            director  
0  {'0': 'nm0665163', '1': 'nm0803705', '2': 'nm0...   

✅ Writers JSON Loaded Successfully!
Columns: Index(['movie', 'writer'], dtype='object')
       movie     writer
0  tt0003740  nm0195339
1  tt0003740  nm0515385
2  tt0003740  nm0665163
3  tt0003740  nm0758215
4  tt0008663  nm0406585

Missing Values in Directors JSON:
 movie       0
director    0
dtype: int64

Missing Values in Writers JSON:
 movie     0
writer    0
dtype: int64


In [4]:
# Ensure JSON files are correctly loaded
df_directors = pd.read_json("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/directing.json")
df_writers = pd.read_json("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/writing.json")

# Rename 'movie' to 'tconst' for merging
df_directors.rename(columns={"movie": "tconst", "director": "director_id"}, inplace=True)
df_writers.rename(columns={"movie": "tconst", "writer": "writer_id"}, inplace=True)

# Convert nested JSON fields into strings (fix unhashable type issue)
df_directors["director_id"] = df_directors["director_id"].astype(str)
df_writers["writer_id"] = df_writers["writer_id"].astype(str)

# Drop duplicates
df_directors.drop_duplicates(inplace=True)
df_writers.drop_duplicates(inplace=True)

# Check missing values
print("✅ Cleaned Directors Data:")
print(df_directors.isnull().sum(), "\n")
print(df_directors.head(), "\n")

print("✅ Cleaned Writers Data:")
print(df_writers.isnull().sum(), "\n")
print(df_writers.head())

# Save cleaned versions in DuckDB for merging
import duckdb
con = duckdb.connect()
con.register("directors", df_directors)
con.register("writers", df_writers)

print("\n✅ Cleaned `directing.json` and `writing.json` ready for merging!")


✅ Cleaned Directors Data:
tconst         0
director_id    0
dtype: int64 

      tconst director_id
0  tt0003740   nm0665163
1  tt0008663   nm0803705
2  tt0009369   nm0428059
3  tt0009369   nm0949648
4  tt0010307   nm0304098 

✅ Cleaned Writers Data:
tconst       0
writer_id    0
dtype: int64 

      tconst  writer_id
0  tt0003740  nm0195339
1  tt0003740  nm0515385
2  tt0003740  nm0665163
3  tt0003740  nm0758215
4  tt0008663  nm0406585

✅ Cleaned `directing.json` and `writing.json` ready for merging!


In [5]:
# Register df_movies in DuckDB
con = duckdb.connect()
con.register("movies", df_movies)
con.register("directors", df_directors)
con.register("writers", df_writers)

# Perform the LEFT JOIN to merge director_id and writer_id
query = """
SELECT 
    movies.*, 
    directors.director_id, 
    writers.writer_id
FROM movies
LEFT JOIN directors ON movies.tconst = directors.tconst
LEFT JOIN writers ON movies.tconst = writers.tconst
"""

# Run query and get final merged dataset
df_merged = con.execute(query).fetchdf()

# Save the merged dataset
df_merged.to_csv("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/merged_cleaned_data.csv", index=False)

# Close DuckDB connection
con.close()

print("\n✅ Merged dataset saved as 'merged_cleaned_data.csv' successfully!")



✅ Merged dataset saved as 'merged_cleaned_data.csv' successfully!


In [6]:
# Load merged dataset
df = pd.read_csv("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/merged_cleaned_data.csv")

# Step 1: Drop unnecessary columns (if needed)
columns_to_drop = ["originalTitle", "endYear", "Unnamed: 0"]  # Add more if necessary
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Step 2: Handle missing values

## Handle missing values in numerical columns
numeric_columns = ["startYear", "runtimeMinutes", "numVotes"]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors="coerce")  # Ensure numeric format
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())  # Fill missing with median

## Handle missing values in categorical columns
df["director_id"] = df["director_id"].fillna("unknown")
df["writer_id"] = df["writer_id"].fillna("unknown")

# Step 3: Ensure correct data types
df["startYear"] = df["startYear"].astype(int)
df["runtimeMinutes"] = df["runtimeMinutes"].astype(int)
df["numVotes"] = df["numVotes"].astype(int)

# Step 4: Verify missing values are handled
print("\n✅ Missing Values After Handling:\n", df.isnull().sum())

# Save the cleaned dataset
df.to_csv("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/final_cleaned_data.csv", index=False)

print("\n✅ Final cleaned dataset saved as 'final_cleaned_data.csv' and ready for model training!")



✅ Missing Values After Handling:
 tconst            0
primaryTitle      0
startYear         0
runtimeMinutes    0
numVotes          0
label             0
director_id       0
writer_id         0
dtype: int64

✅ Final cleaned dataset saved as 'final_cleaned_data.csv' and ready for model training!


In [7]:
df.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label,director_id,writer_id
0,tt0010600,The Doll,1919,66,1898,True,nm0523932,nm0932559
1,tt0011841,Way Down East,1920,145,5376,True,nm0000428,nm0000428
2,tt0012494,Déstiny,1921,97,5842,True,nm0000485,nm0902376
3,tt0015163,The Navigator,1924,59,9652,True,nm0000036,nm0369841
4,tt0016220,The Phantom of the Opera,1925,93,17887,True,nm0781292,nm1541473


In [8]:
# Load cleaned dataset
df = pd.read_csv("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/final_cleaned_data.csv")

# Step 1: Select Features and Target
features = ["startYear", "runtimeMinutes", "numVotes", "director_id", "writer_id"]
X = df[features]
y = df["label"]  # Target variable (binary classification)

# Step 2: Train-Test Split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Preprocessing for Numeric & Categorical Data

## Numeric Columns (Standardization)
numeric_features = ["startYear", "runtimeMinutes", "numVotes"]

## Categorical Columns (One-Hot Encoding)
categorical_features = ["director_id", "writer_id"]

## Preprocessing Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),  # Scale numeric features
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)  # Encode categorical features
    ]
)

print("\n✅ Data Split Successfully! Ready for Model Training.")



✅ Data Split Successfully! Ready for Model Training.


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Create a Pipeline for Preprocessing + Logistic Regression
model_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))  # Ensure convergence
])

# Step 2: Train the Model
model_pipeline.fit(X_train, y_train)

# Step 3: Make Predictions
y_pred = model_pipeline.predict(X_test)

# Step 4: Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
print(f"\n✅ Logistic Regression Model Accuracy: {accuracy:.4f}")

# Display Classification Report
print("\nClassification Report:\n", classification_report(y_test, y_pred))



✅ Logistic Regression Model Accuracy: 0.8853

Classification Report:
               precision    recall  f1-score   support

       False       0.86      0.92      0.89      2273
        True       0.91      0.85      0.88      2270

    accuracy                           0.89      4543
   macro avg       0.89      0.89      0.89      4543
weighted avg       0.89      0.89      0.89      4543



In [10]:
# Step 1: Define models to compare
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel="linear", probability=True)  # Linear Kernel for Binary Classification
}

# Step 2: Train and evaluate each model
for name, model in models.items():
    print(f"\n🔹 Training {name}...")

    # Create pipeline with preprocessing + model
    pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("classifier", model)
    ])

    # Train model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Evaluate model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"✅ {name} Accuracy: {accuracy:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred))



🔹 Training Logistic Regression...
✅ Logistic Regression Accuracy: 0.8853

Classification Report:
               precision    recall  f1-score   support

       False       0.86      0.92      0.89      2273
        True       0.91      0.85      0.88      2270

    accuracy                           0.89      4543
   macro avg       0.89      0.89      0.89      4543
weighted avg       0.89      0.89      0.89      4543


🔹 Training Random Forest...
✅ Random Forest Accuracy: 0.9227

Classification Report:
               precision    recall  f1-score   support

       False       0.90      0.95      0.92      2273
        True       0.94      0.90      0.92      2270

    accuracy                           0.92      4543
   macro avg       0.92      0.92      0.92      4543
weighted avg       0.92      0.92      0.92      4543


🔹 Training SVM...
✅ SVM Accuracy: 0.9311

Classification Report:
               precision    recall  f1-score   support

       False       0.92      0.94     