First we import all packages and libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer

Then we do some data exploration on one of the trianing files to see what needs to happen to clean and preprocess the data

In [2]:
df = pd.read_csv("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/train-3.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,5,tt0011439,The Mark of Zorro,The Mark of Zorro,1920,\N,79,2439.0,True
1,10,tt0012532,Ớrpháns ớf thé Stớrm,,1921,\N,150,,True
2,13,tt0013933,The Faithful Heart,Coeur fidèle,1923,\N,87,1252.0,True
3,31,tt0015400,The Thief of Bagdad,,1924,\N,155,6001.0,True
4,33,tt0015842,The Joyless Street,,1925,\N,125,1554.0,True


In [4]:
df.dtypes

Unnamed: 0          int64
tconst             object
primaryTitle       object
originalTitle      object
startYear          object
endYear            object
runtimeMinutes     object
numVotes          float64
label                bool
dtype: object

In [5]:
# Step 1: Drop 'originalTitle' column (if required)
df.drop(columns=["originalTitle"], inplace=True)

# Step 2: Convert relevant columns to integers, handling errors
cols_to_convert = ["startYear", "endYear", "runtimeMinutes"]
for col in cols_to_convert:
    df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")  # Keeps NaNs as <NA>

# Step 3: Check for missing values
print("Missing values per column before imputation:")
print(df.isnull().sum())

# Step 4: Calculate the average difference between startYear and endYear
valid_years = df.dropna(subset=["startYear", "endYear"])  # Keep only complete cases
avg_duration = (valid_years["endYear"] - valid_years["startYear"]).mean()

# Step 5: Fill missing endYear values by adding the average duration to startYear
df.loc[df["endYear"].isna(), "endYear"] = df["startYear"] + avg_duration
df["endYear"] = df["endYear"].round().astype("Int64")  # Convert to integer

# Step 6: Handle other missing values
df.loc[:, "startYear"] = df["startYear"].fillna(df["startYear"].median())
df.loc[:, "runtimeMinutes"] = df["runtimeMinutes"].fillna(df["runtimeMinutes"].median())
df.loc[:, "numVotes"] = df["numVotes"].fillna(0)

# Step 7: Display cleaned data
print("Missing values per column after imputation:")
print(df.isnull().sum())

df.head()


Missing values per column before imputation:
Unnamed: 0          0
tconst              0
primaryTitle        0
startYear         105
endYear           846
runtimeMinutes      1
numVotes          114
label               0
dtype: int64
Missing values per column after imputation:
Unnamed: 0          0
tconst              0
primaryTitle        0
startYear           0
endYear           846
runtimeMinutes      0
numVotes            0
label               0
dtype: int64


Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,5,tt0011439,The Mark of Zorro,1920,,79,2439.0,True
1,10,tt0012532,Ớrpháns ớf thé Stớrm,1921,,150,0.0,True
2,13,tt0013933,The Faithful Heart,1923,,87,1252.0,True
3,31,tt0015400,The Thief of Bagdad,1924,,155,6001.0,True
4,33,tt0015842,The Joyless Street,1925,,125,1554.0,True


In [6]:
import pandas as pd

# Load dataset
df = pd.read_csv("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/train-3.csv")

# Step 1: Drop 'originalTitle' and 'endYear' columns
df.drop(columns=["originalTitle", "endYear"], inplace=True)

# Step 2: Convert relevant columns to integers, handling errors
cols_to_convert = ["startYear", "runtimeMinutes"]
for col in cols_to_convert:
    df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")  # Keeps NaNs as <NA>

# Step 3: Handle missing values
df["startYear"] = df["startYear"].fillna(df["startYear"].median()).astype("Int64")
df["runtimeMinutes"] = df["runtimeMinutes"].fillna(df["runtimeMinutes"].median()).astype("Int64")
df["numVotes"] = df["numVotes"].fillna(0)

# Step 4: Save the cleaned dataset
df.to_csv("cleaned_file.csv", index=False)

# Step 5: Display cleaned data summary
print("Missing values per column after cleanup:")
print(df.isnull().sum())

df.head()

Missing values per column after cleanup:
Unnamed: 0        0
tconst            0
primaryTitle      0
startYear         0
runtimeMinutes    0
numVotes          0
label             0
dtype: int64


Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,numVotes,label
0,5,tt0011439,The Mark of Zorro,1920,79,2439.0,True
1,10,tt0012532,Ớrpháns ớf thé Stớrm,1921,150,0.0,True
2,13,tt0013933,The Faithful Heart,1923,87,1252.0,True
3,31,tt0015400,The Thief of Bagdad,1924,155,6001.0,True
4,33,tt0015842,The Joyless Street,1925,125,1554.0,True


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load cleaned dataset
df = pd.read_csv("cleaned_file.csv")

# Step 1: Select features and target variable
features = ["startYear", "runtimeMinutes", "numVotes"]  # Numerical features
X = df[features]
y = df["label"]  # Target variable

# Step 2: Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Normalize numerical features (optional but recommended)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train a simple Logistic Regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Step 5: Make predictions
y_pred = model.predict(X_test_scaled)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")  # Prints accuracy score
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model Accuracy: 0.7016

Classification Report:
               precision    recall  f1-score   support

       False       0.69      0.80      0.74       103
        True       0.71      0.59      0.65        88

    accuracy                           0.70       191
   macro avg       0.70      0.69      0.69       191
weighted avg       0.70      0.70      0.70       191



In [8]:
import pandas as pd

# Load JSON files
df_directors = pd.read_json("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/directing.json")
df_writers = pd.read_json("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/writing.json")

# Print column names before fixing
print("Before Renaming:")
print("Directors JSON Columns:", df_directors.columns)
print("Writers JSON Columns:", df_writers.columns)

# Rename 'movie' to 'tconst' for proper merging
df_directors.rename(columns={"movie": "tconst", "director": "director_id"}, inplace=True)
df_writers.rename(columns={"movie": "tconst", "writer": "writer_id"}, inplace=True)

# Print column names after renaming
print("\nAfter Renaming:")
print("Directors JSON Columns:", df_directors.columns)
print("Writers JSON Columns:", df_writers.columns)

# Show first few rows after renaming
print("\nFirst few rows of Directors Data:")
print(df_directors.head())

print("\nFirst few rows of Writers Data:")
print(df_writers.head())

# Save cleaned files
df_directors.to_json("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/cleaned_directing.json", orient="records")
df_writers.to_json("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/cleaned_writing.json", orient="records")

print("\n✅ Cleaned JSON files saved successfully!")


Before Renaming:
Directors JSON Columns: Index(['movie', 'director'], dtype='object')
Writers JSON Columns: Index(['movie', 'writer'], dtype='object')

After Renaming:
Directors JSON Columns: Index(['tconst', 'director_id'], dtype='object')
Writers JSON Columns: Index(['tconst', 'writer_id'], dtype='object')

First few rows of Directors Data:
      tconst director_id
0  tt0003740   nm0665163
1  tt0008663   nm0803705
2  tt0009369   nm0428059
3  tt0009369   nm0949648
4  tt0010307   nm0304098

First few rows of Writers Data:
      tconst  writer_id
0  tt0003740  nm0195339
1  tt0003740  nm0515385
2  tt0003740  nm0665163
3  tt0003740  nm0758215
4  tt0008663  nm0406585

✅ Cleaned JSON files saved successfully!


In [9]:
import duckdb
import pandas as pd

# Load main dataset
df_movies = pd.read_csv("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/train-3.csv")

# Load cleaned JSON files
df_directors = pd.read_json("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/cleaned_directing.json")
df_writers = pd.read_json("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/cleaned_writing.json")

# Check if tconst exists before merging
print("Movies Columns:", df_movies.columns)
print("Directors Columns:", df_directors.columns)
print("Writers Columns:", df_writers.columns)

# Connect to DuckDB
con = duckdb.connect()

# Register Pandas DataFrames as DuckDB tables
con.register("movies", df_movies)
con.register("directors", df_directors)
con.register("writers", df_writers)

# Perform the LEFT JOIN to merge director_id and writer_id
query = """
SELECT 
    movies.*, 
    directors.director_id, 
    writers.writer_id
FROM movies
LEFT JOIN directors ON movies.tconst = directors.tconst
LEFT JOIN writers ON movies.tconst = writers.tconst
"""

# Run query and get final merged dataset
df_merged = con.execute(query).fetchdf()

# Save the merged dataset
df_merged.to_csv("C:/Users/Gebruiker/Documents/UVA/Vakken met code/BD/BigData-Group10/imdb/merged_cleaned_data.csv", index=False)

# Close DuckDB connection
con.close()

print("\n✅ Merged dataset saved as 'merged_cleaned_data.csv' successfully!")


Movies Columns: Index(['Unnamed: 0', 'tconst', 'primaryTitle', 'originalTitle', 'startYear',
       'endYear', 'runtimeMinutes', 'numVotes', 'label'],
      dtype='object')
Directors Columns: Index(['tconst', 'director_id'], dtype='object')
Writers Columns: Index(['tconst', 'writer_id'], dtype='object')

✅ Merged dataset saved as 'merged_cleaned_data.csv' successfully!
