In [17]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [18]:
# Define the columns
num_features = ['age', 'balance', 'num_contacts']
cat_features = ['job']

# Simulate numerical data
np.random.seed(42)
X_num = np.random.rand(100, 3)

# Simulate categorical data
X_job = np.random.choice(['admin', 'technician', 'retired', 'management'], size=(100, 1))
# X_marital = np.random.choice(['married', 'unmarried'], size=(100, 1))


# Combine numerical and categorical data
X = np.hstack((X_num, X_job))
print(X.shape)

# Generate a binary target variable for churn (1 for churned, 0 for not churned)
y = np.random.choice([0, 1], size=(100,))

# Convert to DataFrame
feature_names = num_features + cat_features
X_df = pd.DataFrame(X, columns=feature_names)
print(X_df)


(100, 4)
                     age               balance         num_contacts  \
0     0.3745401188473625    0.9507143064099162   0.7319939418114051   
1     0.5986584841970366   0.15601864044243652  0.15599452033620265   
2    0.05808361216819946    0.8661761457749352   0.6011150117432088   
3     0.7080725777960455  0.020584494295802447   0.9699098521619943   
4     0.8324426408004217   0.21233911067827616  0.18182496720710062   
..                   ...                   ...                  ...   
95  0.035942273796742086   0.46559801813246016   0.5426446347075766   
96    0.2865412521282844    0.5908332605690108  0.03050024993904943   
97   0.03734818874921442    0.8226005606596583   0.3601906414112629   
98   0.12706051265188478    0.5222432600548044   0.7699935530986108   
99   0.21582102749684318    0.6228904758190003    0.085347464993768   

           job  
0        admin  
1   management  
2   management  
3   technician  
4      retired  
..         ...  
95  management  
96

In [19]:


# Custom transformer for interaction score
class InteractionScoreTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.max_interaction = X[:, :2].max()
        return self

    def transform(self, X):
        interaction_score = (X[:, 0] * X[:, 1]) / self.max_interaction
        return np.hstack((X, interaction_score[:, None]))

# Pipelines for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)])

# Full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('interaction', InteractionScoreTransformer()),
    ('classifier', RandomForestClassifier(random_state=42))])

# Separate preprocessing for numerical data to pass to the custom transformer
X_num_only = X_df[num_features].astype(float).values  # Convert to numerical array

# Fit the pipeline
pipeline.fit(X_df, y)


In [20]:
# Preprocessed data
preprocessed_data = pipeline.named_steps['preprocessor'].transform(X_df)
print("Preprocessed Data (after preprocessing step):")
print(preprocessed_data[3,:])

# Interaction score data
interaction_data = pipeline.named_steps['interaction'].transform(preprocessed_data)
print("\nData with Interaction Scores (after interaction step):")
print(interaction_data[3,:])

Preprocessed Data (after preprocessing step):
[ 0.81266807 -1.59356748  1.65209819  0.          0.          0.
  1.        ]

Data with Interaction Scores (after interaction step):
[ 0.81266807 -1.59356748  1.65209819  0.          0.          0.
  1.         -0.71797874]


In [21]:
from sklearn.model_selection import cross_val_score
# Perform cross-validation
cv_scores = cross_val_score(pipeline, X_df, y, cv=5, scoring='accuracy')  # 5-fold CV


In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Define the parameter grid for RandomForestClassifier
param_grid = {
    'classifier__n_estimators': [50, 100, 200],  # Number of trees in the forest
    'classifier__max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'classifier__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'classifier__min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'classifier__max_features': ['sqrt', 'log2', None]  # Number of features to consider when looking for the best split
}

# Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=pipeline,         # Pipeline to optimize
    param_grid=param_grid,      # Parameter grid
    cv=5,                       # 5-fold cross-validation
    scoring='accuracy',         # Metric to optimize
    verbose=2,                  # Verbose output
    n_jobs=-1                   # Use all available cores
)

# Perform grid search
grid_search.fit(X_df, y)

# Get the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated Accuracy:", grid_search.best_score_)



# Set the best parameters to the pipeline
pipeline.set_params(**grid_search.best_params_)

# Train the final model on the entire dataset
final_model = pipeline.fit(X_df, y)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters: {'classifier__max_depth': None, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200}
Best Cross-Validated Accuracy: 0.5700000000000001
