In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

In [None]:
# --- 1. A toy dataset mixing numerical and categorical features ---
# Each key is a column; every list holds one value per movie (5 movies).

data = dict(
    duration=[90, 150, 105, 120, 95],
    budget_millions=[1.5, 200.0, 5.0, 50.0, 2.5],
    genre=['Comedy', 'Action', 'Drama', 'Action', 'Comedy'],
    target_score=[6.5, 8.8, 7.1, 8.5, 6.2],
)
movies_df = pd.DataFrame(data)

# Show the raw table, followed by a 40-character divider framed by blank lines.
divider = "=" * 40
print("--- Original Movie Data ---", movies_df, sep="\n")
print(f"\n{divider}\n")

--- Original Movie Data ---
   duration  budget_millions   genre  target_score
0        90              1.5  Comedy           6.5
1       150            200.0  Action           8.8
2       105              5.0   Drama           7.1
3       120             50.0  Action           8.5
4        95              2.5  Comedy           6.2




In [7]:
# --- 2. Preparing the Ritual: Defining our features ---
# Split the table into model inputs (X) and the value to predict (y).
feature_columns = ['duration', 'budget_millions', 'genre']
target_column = 'target_score'

X = movies_df[feature_columns]
y = movies_df[target_column]

In [None]:
# Group the input columns by kind so that each group can later be routed
# to its own preprocessing step.

numerical_features = [
    "duration",
    "budget_millions",
]
categorical_features = [
    "genre",
]

In [None]:
# --- 3. The Grand Ritual: Building the ColumnTransformer ---
# A ColumnTransformer sends each group of columns to its own transformer
# and stitches the results back together side by side:
#   * 'num' -> StandardScaler on the numerical columns
#   * 'cat' -> OneHotEncoder on the categorical columns

numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

print("--- Applying the Rituals... ---")


--- Applying the Rituals... ---


In [None]:
# .fit_transform() learns the preprocessing rules (scaling statistics and the
# set of categories) from the data it is given AND applies them in one step.
# NOTE: here it is fitted on the full X for demonstration. In a real workflow,
# call fit_transform() on the training split only and transform() on the rest.

X_transformed = preprocessor.fit_transform(X)

# ColumnTransformer can return a scipy sparse matrix when the stacked result
# is sparse enough (e.g. many one-hot columns). Densify it so the DataFrame
# constructor below always receives a plain 2-D array.
if hasattr(X_transformed, "toarray"):
    X_transformed = X_transformed.toarray()

print("Rituals complete! Here is the transformed data:")

# Reuse X's index so the transformed rows stay aligned with y even when X is
# a shuffled or filtered subset of the original frame.
transformed_df = pd.DataFrame(
    X_transformed,
    columns=preprocessor.get_feature_names_out(),
    index=X.index,
)
print(transformed_df)
print("\n" + "="*40 + "\n")

Rituals complete! Here is the transformed data:
   num__duration  num__budget_millions  cat__genre_Action  cat__genre_Comedy  \
0      -1.019130             -0.659140                0.0                1.0   
1       1.760316              1.942038                1.0                0.0   
2      -0.324269             -0.613275                0.0                0.0   
3       0.370593             -0.023588                1.0                0.0   
4      -0.787510             -0.646035                0.0                1.0   

   cat__genre_Drama  
0               0.0  
1               0.0  
2               1.0  
3               0.0  
4               0.0  


