In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Read the CSV once and keep that frame as a backup of the original dataset;
# all subsequent cleaning works on an independent copy.
df_raw = pd.read_csv("../Database/Dataset.csv")
df = df_raw.copy()

In [4]:
# Dimensions of the loaded dataset as (rows, columns)
df.shape

(1266348, 24)

In [5]:
# Show every column name of the loaded dataset as a plain Python list
print(list(df.columns))

['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords']


In [6]:
# Initial Data Cleaning - Remove Columns with Too Many Null Values

# 1. Set the threshold (here 50 %).
# A column whose null count exceeds this fraction of all rows is dropped,
# so only columns with at most 50 % null values remain.
MAX_NULL_FRACTION = 0.5
threshold = len(df) * MAX_NULL_FRACTION

# Count the null values per column
null_counts = df.isnull().sum()


# 2. Determine the columns where null_counts > threshold
cols_to_drop = null_counts[null_counts > threshold].index.tolist()

# 3. Drop those columns
df = df.drop(columns=cols_to_drop)

print("Original shape:", df_raw.shape)
print("Cleaned shape:", df.shape)
print("----------------------------------------\n")

# Check what was dropped:
# dropped_cols is the set difference between the original and remaining columns
dropped_cols = set(df_raw.columns) - set(df.columns)
# A set of strings has a hash-dependent iteration order that can differ between
# kernel sessions; print it sorted so the cell output is reproducible.
print("Dropped columns:", sorted(dropped_cols))
print("----------------------------------------")
print("----------------------------------------")
print("----------------------------------------")
print("Remaining columns:", df.columns.tolist())

Original shape: (1266348, 24)
Cleaned shape: (1266348, 19)
----------------------------------------

Dropped columns: {'production_companies', 'backdrop_path', 'keywords', 'homepage', 'tagline'}
----------------------------------------
----------------------------------------
----------------------------------------
Remaining columns: ['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'adult', 'budget', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'genres', 'production_countries', 'spoken_languages']


In [7]:
# Second Data Cleaning
# Drop the columns that are not needed as model inputs.

# The prediction target is taken from the cleaned frame; it is also listed
# below so that it does not end up among the features.
target = df['vote_average']

cols_to_drop_for_model = [
    'id', 'title', 'original_title', 'imdb_id',
    'poster_path', 'overview',
    'vote_average',
]

# Everything that is not explicitly excluded becomes a feature column.
features = df.drop(cols_to_drop_for_model, axis=1)

print("Original DataFrame:", df.shape)
print("Features DataFrame:", features.shape)
print("-------------------------------------------")
print("The Features we are going to be using:", list(features.columns))



Original DataFrame: (1266348, 19)
Features DataFrame: (1266348, 12)
-------------------------------------------
The Features we are going to be using: ['vote_count', 'status', 'release_date', 'revenue', 'runtime', 'adult', 'budget', 'original_language', 'popularity', 'genres', 'production_countries', 'spoken_languages']


In [8]:
# Inspect how often each value occurs in the status column;
# dropna=False makes missing values show up as their own entry.
status_counts = df['status'].value_counts(dropna=False)
print(status_counts)

status
Released           1229871
In Production        15591
Post Production      11191
Planned               8862
Rumored                507
Canceled               326
Name: count, dtype: int64


In [9]:
# Feature Engineering
#
# Derives model-ready columns on `df` and then selects the feature frame.


# Encode the status as a single binary indicator (not a full one-hot encoding):
# by far the majority of movies are 'Released', so every other status is
# collapsed into one "not released" class.
# 1 : Released, 0 : Not Released
df['status_released'] = (df['status'] == 'Released').astype(int)


# Convert 'release_date' to datetime and extract the year.
# errors='coerce' -> if parsing fails, the value becomes NaT (Not a Time),
# which leaves the corresponding 'release_year' missing as well.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year

# Normalize revenue and budget.
# Fill NaN values with 0 for budget and revenue before scaling.
df['budget'] = df['budget'].fillna(0)
df['revenue'] = df['revenue'].fillna(0)

# Min-max scale both monetary columns (default feature range of MinMaxScaler).
# NOTE(review): the scaler is fitted on the whole dataset; if a train/test
# split follows later, consider fitting on the training part only.
scaler = MinMaxScaler()
df[['budget_scaled', 'revenue_scaled']] = scaler.fit_transform(df[['budget', 'revenue']])


# Select the engineered columns instead of their raw counterparts:
#   status  -> status_released
#   revenue -> revenue_scaled
#   budget  -> budget_scaled
# (release_date was already replaced by release_year). Previously the raw
# columns were selected, so the encoding/scaling above never reached `features`.
feature_cols = [
    'vote_count',
    'status_released',
    'release_year',
    'revenue_scaled',
    'runtime',
    'adult',
    'budget_scaled',
    'original_language',
    'popularity',
    'genres',
    'production_countries',
    'spoken_languages',
]


features = df[feature_cols]