# 03 - Feature Engineering

This notebook creates and transforms features for modeling.

## Objectives:
- Create new features from existing data
- Feature scaling and normalization
- Encoding categorical variables
- Feature selection
- Dimensionality reduction (if needed)

In [1]:
import pandas as pd
import numpy as np

# Read the preprocessed movie table into the working frame
preprocessed_csv = '../data/processed/movies_preprocessed.csv'
df = pd.read_csv(preprocessed_csv)
print(f"Data shape: {df.shape}")

# Last expression of the cell: frequency of each distinct runtime value
df['runtime'].value_counts()

Data shape: (11242, 22)


runtime
90     390
100    359
95     315
105    309
97     304
      ... 
23       1
213      1
367      1
195      1
215      1
Name: count, Length: 168, dtype: int64

In [None]:
# Create new features
# Gate for features derived from post-release columns (currently only
# `popularity`); left off by default.
use_post_release_features = False

# Fill value treated as "missing" in list-like text columns.
# NOTE(review): the original cell only special-cased 'Unknown' for
# production_countries. It is applied to every list column here on the
# assumption that preprocessing uses the same fill value everywhere --
# confirm against the preprocessing notebook. If the assumption is wrong,
# no other column contains the bare string and nothing changes.
MISSING_PLACEHOLDER = 'Unknown'


def _count_listed(value):
    """Return how many comma-separated entries a single cell contains.

    Parameters
    ----------
    value : object
        Raw cell value, normally a comma-separated string or NaN.

    Returns
    -------
    int
        0 for missing values, blank strings and ``MISSING_PLACEHOLDER``;
        otherwise the number of non-empty fragments after splitting on ','.
        Fragments are stripped first, so 'a,b' and 'a, b' both give 2 and
        stray/trailing commas are not counted as entries.
    """
    if pd.isna(value):
        return 0
    text = str(value).strip()
    if not text or text == MISSING_PLACEHOLDER:
        return 0
    return sum(1 for part in text.split(',') if part.strip())


df['num_production_companies'] = df['production_companies'].map(_count_listed)
df['num_production_countries'] = df['production_countries'].map(_count_listed)

# Budget features
# np.where keeps rows with a non-positive runtime at 0 instead of inf/NaN.
df['budget_per_minute'] = np.where(df['runtime'] > 0, df['budget'] / df['runtime'], 0)
if use_post_release_features and 'popularity' in df.columns:
    # +1 on both sides avoids division by zero.
    df['budget_popularity_ratio'] = (df['budget'] + 1) / (df['popularity'] + 1)

# Genre features
# Counted with the same helper as every other list column, so the result no
# longer depends on whether the separator is ',' or ', '.
df['num_genres'] = df['genres'].map(_count_listed)

# One binary flag per genre of interest (plain substring match, not regex).
genre_flags = {
    'is_action': 'Action',
    'is_animation': 'Animation',
    'is_comedy': 'Comedy',
    'is_drama': 'Drama',
    'is_scifi': 'Science Fiction',
}
genre_text = df['genres'].fillna('').astype(str)
for flag_col, genre_name in genre_flags.items():
    df[flag_col] = genre_text.str.contains(genre_name, regex=False).astype(int)

# Cast features
df['num_cast'] = df['cast'].map(_count_listed)
df['has_cast'] = (df['num_cast'] > 0).astype(int)

# Director features
df['num_directors'] = df['director'].map(_count_listed)

# Temporal features
# Unparseable dates become NaT, which propagates as NaN into the derived
# year/month/age/decade columns.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year
df['release_month'] = df['release_date'].dt.month

df['is_summer'] = df['release_month'].isin([5, 6, 7]).astype(int)
df['is_holiday'] = df['release_month'].isin([11, 12]).astype(int)
# NOTE(review): despite the name, this flags releases in December and July;
# the column name is kept so downstream notebooks keep working.
df['is_weekend_month'] = df['release_month'].isin([12, 7]).astype(int)
# Age is measured against the newest release year in the data; the current
# calendar year is only a fallback when no release date could be parsed.
reference_year = int(df['release_year'].max()) if df['release_year'].notna().any() else pd.Timestamp.now().year
df['movie_age'] = reference_year - df['release_year']
df['decade'] = (df['release_year'] // 10) * 10

# Keyword features
df['num_keywords'] = df['keywords'].map(_count_listed)
df['has_keywords'] = (df['num_keywords'] > 0).astype(int)

# Collection feature: 1 only when a real collection name is present.
df['in_collection'] = (
    df['collection'].notna() & df['collection'].ne(MISSING_PLACEHOLDER)
).astype(int)

# Language feature (global language)
df['is_english'] = (df['original_language'] == 'en').astype(int)

# Drop identifiers, raw text columns already summarised above, and the
# post-release columns (popularity, rating, vote_count).
cols_drop = ['id', 'title', 'popularity', 'rating', 'keywords', 'vote_count',
             'cast', 'director', 'collection', 'original_language',
             'genres', 'production_companies', 'production_countries',
             'release_date']
# errors='ignore' keeps the cell from raising KeyError when an optional column
# (e.g. `popularity`, already treated as optional above) is absent.
df = df.drop(columns=cols_drop, errors='ignore')

In [3]:
# Print a per-column summary (non-null counts and dtypes) of the current frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11242 entries, 0 to 11241
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   budget                    11242 non-null  float64
 1   revenue                   11242 non-null  float64
 2   runtime                   11242 non-null  int64  
 3   rating                    11242 non-null  float64
 4   poster_brightness         11242 non-null  float64
 5   poster_saturation         11242 non-null  float64
 6   poster_dom_r              11242 non-null  float64
 7   poster_dom_g              11242 non-null  float64
 8   poster_dom_b              11242 non-null  float64
 9   num_production_companies  11242 non-null  int64  
 10  num_production_countries  11242 non-null  int64  
 11  budget_per_minute         11242 non-null  float64
 12  num_genres                11242 non-null  int64  
 13  is_action                 11242 non-null  int64  
 14  is_ani

In [4]:
# Save engineered features
featured_csv = '../data/processed/movies_featured.csv'
df.to_csv(featured_csv, index=False)

# Report the final shape and where the file was written
print(f"Shape after feature engineering: {df.shape}")
print("Data saved to ../data/processed/movies_featured.csv")

Shape after feature engineering: (11242, 32)
Data saved to ../data/processed/movies_featured.csv
