In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
# --- 1. The Mock Artifacts: a small synthetic dataset with mixed types ---
# Two numeric columns and one categorical column, 100 rows each, stand in
# for the kind of mixed-type table you would meet in the real world.

np.random.seed(42)  # seed the global RNG so the draws below are repeatable

# The three draws consume the global RNG stream in the order written here,
# so reordering the entries would change the generated values.
data = {
    'duration_minutes': np.random.uniform(90, 180, size=100),
    'budget_millions': np.random.uniform(5, 300, size=100),
    'genre': np.random.choice(
        ['Sci-Fi', 'Action', 'Comedy', 'Drama'],
        size=100,
    ),
}

# Build the frame and attach the target in one expression.
# 'is_hit' is 1 when the movie's budget is strictly above the median budget,
# otherwise 0 -- i.e. the label is a deterministic function of 'budget_millions'.
movies_df = pd.DataFrame(data).assign(
    is_hit=lambda frame: frame['budget_millions']
    .gt(frame['budget_millions'].median())
    .astype(int)
)



In [28]:
# --- 2. Preparing the Ritual: Defining our preprocessor and model ---
# Group the input columns by type so each group can be routed to its own
# transformer later on.

categorical_features = [
    'genre',
]
numerical_features = [
    'duration_minutes',
    'budget_millions',
]

In [29]:
# We define the preprocessor using a ColumnTransformer.
# It applies StandardScaler to the numeric columns and OneHotEncoder to the
# categorical columns, then concatenates the results.
#
# handle_unknown='ignore' makes the encoder emit an all-zero block for a
# category it did not see during fit. With the default ('error'), transform
# raises as soon as a cross-validation fold or new data contains a genre that
# was absent from the data the encoder was fitted on. Categories seen during
# fit are encoded exactly as before.

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])


In [30]:
# The estimator that sits at the end of the pipeline: a decision tree
# classifier with a pinned random_state so repeated runs build the same tree.
model = DecisionTreeClassifier(
    random_state=42,
)


In [31]:
# --- 3. The Grand Ritual: Building and training the Pipeline ---
# Chain the preprocessor and the classifier into a single estimator.
# Steps run in list order: preprocessing first, classification second.

pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ],
)


In [32]:
# Split the frame into the target column (y) and everything else (X).
y = movies_df['is_hit']
X = movies_df.drop(columns=['is_hit'])

In [33]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle.
# NOTE(review): X_test / y_test are not referenced by the other cells visible
# here -- confirm whether a held-out evaluation was intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Fit the whole pipeline on the training split only.
# A single .fit() call fits the preprocessor, transforms the training data
# with it, and then trains the classifier on the transformed result.

print("Training the pipeline...")
pipeline.fit(X=X_train, y=y_train)
print("Pipeline training complete.")


Training the pipeline...
Pipeline training complete.


In [35]:
# --- 4. The Grand Revelation: Evaluating the pipeline's prophecy ---
# Score the pipeline with 5-fold cross-validation over the full X / y.
# cross_val_score fits a fresh clone of the pipeline on each fold, so the
# preprocessing is re-learned per fold and these scores do not depend on any
# earlier .fit() call.

print("\n--- Performing 5-Fold Cross-Validation on the entire pipeline ---")
cv_scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    cv=5,
)

print(f"Cross-Validation scores (5 folds): {cv_scores}")
print(f"Average Cross-Validation score: {cv_scores.mean():.4f}")


--- Performing 5-Fold Cross-Validation on the entire pipeline ---
Cross-Validation scores (5 folds): [1.   1.   0.95 1.   1.  ]
Average Cross-Validation score: 0.9900
