# Dataset Analysis

## Importing Python libraries

In [None]:
# importing libraries for data handling and analysis
import pandas as pd
from pandas.plotting import scatter_matrix
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np
import statsmodels.api as sm
from openpyxl import load_workbook
from scipy.stats import norm, skew
from scipy import stats

In [None]:
# importing libraries for data visualisations
import seaborn as sns
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib
%matplotlib inline
color = sns.color_palette()
from IPython.display import display
pd.options.display.max_columns = None
# Standard plotly imports
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly
#import plotly.plotly as py
import plotly.figure_factory as ff
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.set_config_file(offline=True)
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

In [None]:
# sklearn modules for preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# from imblearn.over_sampling import SMOTE  # SMOTE
# sklearn modules for ML model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Libraries for data modelling
from sklearn import svm, tree, linear_model, neighbors
from sklearn import naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import minmax_scale
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.compat.v1.losses import sparse_softmax_cross_entropy
import mlflow
import mlflow.sklearn
import lime
import shap
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Common sklearn Model Helpers
from sklearn.feature_selection import SelectKBest, f_classif, RFE, VarianceThreshold
from sklearn import model_selection
from sklearn import metrics
from sklearn.decomposition import PCA
# from sklearn.datasets import make_classification

# sklearn modules for performance metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve
from sklearn.metrics import auc, roc_auc_score, roc_curve, recall_score, log_loss
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import average_precision_score

In [None]:
# importing miscellaneous libraries
import os
import re
import sys
import timeit
import string
from datetime import datetime
from time import time
from dateutil.parser import parse
# ip = get_ipython()
# ip.register_magics(jupyternotify.JupyterNotifyMagics)

mlflow.set_tracking_uri("sqlite:///mlflow_tracking.db") 

In [None]:
# Load the raw dataset from the CSV file in the working directory
df = pd.read_csv('SampleDataSet.csv')
# Report (rows, columns); formatting `df` itself would dump the frame's contents instead of its shape
print("Shape of dataframe is: {}".format(df.shape))

In [None]:
print(df.head(5))

In [None]:
# Make a copy of the original sourcefile to working sourcefile
df_subset = df.copy()

## EDA

In [None]:
# view new or copied Dataset columns
df_subset.columns

In [None]:
# Display the first few rows of the copied or new DataFrame
print(df_subset.head(5))

In [None]:
# Group column names by dtype. The working copy's own dtypes are used so the
# grouping stays correct once df_subset diverges from the original df.
df_subset.columns.to_series().groupby(df_subset.dtypes).groups

In [None]:
# Column datatypes and missing values
df_subset.info()

Numerical features overview

In [None]:
df_subset.describe()

In [None]:
df_subset.hist(figsize=(20,20))
plt.show()

Correlation: examining some of the most significant correlations. Note that correlation coefficients only measure linear relationships.

In [None]:
print(df_subset.columns)

In [None]:
# Work on an independent copy so the correlation analysis never mutates df_subset
df_cor = df_subset.copy()

# Encode Gender numerically so it can take part in the correlation matrix.
# NOTE(review): Series.map leaves any value other than 'Female'/'Male' as NaN —
# this presumably assumes Gender still holds the raw strings at this point; confirm.
df_cor['Gender'] = df_cor['Gender'].map({'Female': 0, 'Male': 1})

In [None]:
# Keep only the numeric columns; correlation is undefined for text columns
numeric_cols = df_cor.select_dtypes(include=['number']).columns
df_cor = df_cor[numeric_cols]

# Correlation of every numeric feature with the target, computed once and
# listed from the strongest positive to the strongest negative relationship.
# (The frame is already numeric-only, so no `numeric_only` flag is needed.)
target_corr = df_cor.corrwith(df_cor['Exited']).sort_values(ascending=False)
print(target_corr)



In [None]:
# Annotated correlation heatmap of the integer and float columns of df_cor
corr_matrix = df_cor.select_dtypes(include=['int', 'float']).corr()

plt.figure(figsize=(15, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    center=0,
    cmap='viridis',
    annot_kws={'size': 12},
)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=14)
plt.show()

In [None]:
from scipy import stats

# Pearson correlation between Balance and HasCrCard, tested at the 5% level
alpha = 0.05
corr, p_value = stats.pearsonr(df_cor['Balance'], df_cor['HasCrCard'])

is_significant = p_value < alpha
print('Significant correlation between Balance and HasCrCard' if is_significant
      else 'No significant correlation found')
    
# Can repeat for other features

In [None]:
# Number of customers per Complain value, split by whether they exited
plt.figure(figsize=(9, 6))
sns.set(style="whitegrid")
sns.set_palette("Set1")
sns.countplot(x="Complain", data=df_subset, hue='Exited')
plt.xlabel("Complain", fontsize=18)
# A countplot's y-axis is the number of rows per bar, not the Exited value itself
plt.ylabel("Count", fontsize=18)
plt.title("Complain vs Exited", fontsize=18)
plt.legend(title="Exited?", labels=["No", "Yes"], fontsize=16)
# Rotation and font size set in one call instead of two consecutive xticks calls
plt.xticks(rotation=90, fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
# Extract the columns of interest
columna1 = df_subset['Complain']
columna2 = df_subset['Exited']

# Contingency counts, computed with vectorised Series.sum() rather than the
# builtin sum(), which iterates the Series element by element in Python:
#   a = complained and exited, b = complained only, c = exited only
a = int(((columna1 == 1) & (columna2 == 1)).sum())
b = int(((columna1 == 1) & (columna2 == 0)).sum())
c = int(((columna1 == 0) & (columna2 == 1)).sum())

# Jaccard coefficient a / (a + b + c); undefined (NaN) when no row has either flag set
denominator = a + b + c
adjusted_jaccard = a / denominator if denominator else float('nan')
print(f"Adjusted Jaccard Coefficient: {adjusted_jaccard:.4f}")

In [None]:
import matplotlib.pyplot as plt
plt.hist(df_subset['CreditScore'])
plt.show()

## Encoding

In [None]:
# Create a label encoder object
le = LabelEncoder()
encoded_columns = []  # List to store columns that were label encoded

In [None]:
print(df_subset.shape)
df_subset.head()



In [None]:
# Label Encoding will be used for object columns with 2 or fewer unique values.
# The first column is skipped, as in the original loop.
encoded_columns = []  # rebuilt here so re-running the cell cannot accumulate duplicates
for col in df_subset.columns[1:]:
    is_text = df_subset[col].dtype == 'object'
    # dropna=False counts NaN as a value, matching len(unique())
    if is_text and df_subset[col].nunique(dropna=False) <= 2:
        df_subset[col] = le.fit_transform(df_subset[col])
        # Record the column so the report below lists what was actually encoded
        encoded_columns.append(col)
le_count = len(encoded_columns)
print('{} columns were label encoded.'.format(le_count))
print('Columns label encoded:', encoded_columns)

In [None]:
# Collect every column whose name contains the substring 'Unnamed'
unnamed_columns = list(filter(lambda name: 'Unnamed' in name, df_subset.columns))

# Report what was found
if unnamed_columns:
    print(f'Columns with "Unnamed": {unnamed_columns}')
else:
    print('No columns with "Unnamed" found.')


# Splitting data into training and testing sets

In [None]:
print(df_subset.columns)

In [None]:
# Split data into features (X) and target variable (Y).
# 'Exited' is the target, so it is removed from X together with the
# identifier/text columns; leaving it in would leak the label into the features.
X = df_subset.drop(['RowNumber', 'CustomerId', 'Surname', 'Geography', 'Card Type', 'Exited'], axis=1)
Y = df_subset['Exited']

X.shape

In [None]:
# Partition the working frame into text columns and numeric columns

# Column labels of each dtype family
categorical_cols = df_subset.select_dtypes(include=['object']).columns
numerical_cols = df_subset.select_dtypes(include=['int32', 'int64', 'float64']).columns

# One frame per family
df_categorical = df_subset[categorical_cols]
df_numerical = df_subset[numerical_cols]

# Preview the first rows of each frame
for title, frame in (("Categorical Dataframe:", df_categorical),
                     ("\nNumerical Dataframe:", df_numerical)):
    print(title)
    print(frame.head())

In [None]:
              
# Split data into features (X) and target variable (y).
# The target 'Exited' is dropped from X up front so that none of the three
# splits (train, validation, test) can carry the label as a feature.
X = df_subset.drop(['RowNumber', 'CustomerId', 'Surname', 'Geography', 'Card Type', 'Exited'], axis=1)
y = df_subset['Exited']

# 70% train / 30% temporary hold-out, preserving the class ratio
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Check the lengths of temporary sets
print("length of X_temp sets:", len(X_temp))
print("length of y_temp sets:", len(y_temp))

# Halve the hold-out into validation and test sets; stratified like the first
# split so both keep the same class ratio
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("X_train", X_train.dtypes)
print("X_test", X_test.dtypes)
print("X_train columns:", X_train.columns)
print("X_test columns:", X_test.columns)

# Check the shapes of the resulting sets
print("Train Set Shape:     ", X_train.shape)
print("Validation Set Shape:", X_val.shape)
print("Test Set Shape:      ", X_test.shape)
print("Temporary Set Shape: ", X_temp.shape)

In [None]:
# One-hot encoding for handling categorical features
# NOTE(review): several of these columns (e.g. CreditScore, Balance, EstimatedSalary)
# look continuous; one-hot encoding them creates one column per distinct value —
# confirm this is intended.
cat_cols = ['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
            'IsActiveMember', 'EstimatedSalary', 'Complain', 'Satisfaction Score', 'Point Earned']

# Steps 1-3: turn each split's placeholder string into a missing value
for frame, placeholder in ((X_train, 'unknown_category_train'),
                           (X_val, 'unknown_category_val'),
                           (X_test, 'unknown_category_test')):
    for col in cat_cols:
        if col in frame.columns:
            frame[col] = frame[col].replace([placeholder], np.nan)

# Step 4: One-hot encoding
X_train_encoded = pd.get_dummies(X_train, columns=cat_cols, dummy_na=True)
X_val_encoded = pd.get_dummies(X_val, columns=cat_cols, dummy_na=True)
X_test_encoded = pd.get_dummies(X_test, columns=cat_cols, dummy_na=True)

# Step 5: get_dummies derives its columns from the values present in each frame,
# so the three results generally differ. Align validation and test to the
# training layout: categories unseen in training are dropped, and training
# categories absent from a split become all-zero columns.
X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)


In [None]:
# --- Sanity checks on the working DataFrame ---

# Column labels
print("Columns in the DataFrame:")
print(df_subset.columns)

# Missing values per column
print("\nNaN Values in the DataFrame:")
print(df_subset.isna().sum())

# Distinct values of every remaining object (text) column
print("\nUnique Values in Categorical Columns:")
categorical_cols = df_subset.select_dtypes(include=['object']).columns
for col in categorical_cols:
    distinct_values = df_subset[col].unique()
    print(f"Unique values in {col}: {distinct_values}")

# First rows of the columns listed in cat_cols
print("\nVerification of Encoding in Original DataFrame:")
encoded_preview = df_subset[cat_cols].head()
print(encoded_preview)


In [None]:
from sklearn.preprocessing import LabelEncoder

# List of remaining categorical columns
remaining_cat_cols = ['RowNumber', 'CustomerId', 'Surname', 'Geography', 'Card Type']

# Steps 1-3: turn each split's placeholder string into a missing value
for frame, placeholder in ((X_train, 'unknown_category_train'),
                           (X_val, 'unknown_category_val'),
                           (X_test, 'unknown_category_test')):
    for col in remaining_cat_cols:
        if col in frame.columns:
            frame[col] = frame[col].replace([placeholder], np.nan)

# Step 4: Apply Label Encoding with one encoder PER COLUMN.
# A single shared encoder is refit on every column, so after the training loop
# it only remembers the last column's classes and would encode validation/test
# values of all other columns with the wrong mapping.
label_encoders = {}
for col in remaining_cat_cols:
    if col not in X_train.columns:
        continue
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col].astype(str))
    label_encoders[col] = encoder
    for frame in (X_val, X_test):
        if col in frame.columns:
            # LabelEncoder.transform raises ValueError for labels not seen during fit
            frame[col] = encoder.transform(frame[col].astype(str))


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Mean imputation followed by standardisation. Both transformers are fitted on
# the training split only and then applied to the validation and test splits.
imputer = SimpleImputer(strategy='mean')

# Missing values per column before imputation
print("Before imputation:", X_train.isnull().sum())

feature_cols = X_train.columns

X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
# The validation split gets the same treatment; it is restricted to the training
# columns so any extra column it still carries cannot break the transform.
X_val_imputed = imputer.transform(X_val[feature_cols])

## Scale the Imputed Data: ##
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)
X_val_scaled = scaler.transform(X_val_imputed)

# DataFrame views of the transformed training data, with the original column names
X_train_imputed_df = pd.DataFrame(X_train_imputed, columns=feature_cols)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_cols)

# Missing values must be gone after imputation (and therefore after scaling)
print("After imputation:")
print(X_train_imputed_df.isnull().sum())
print("After imputation and scaling:", X_train_scaled_df.isnull().sum())

# Shapes of the transformed splits and their targets
print("X_train scaled shape: ", X_train_scaled.shape)
print("y_train shape:        ", y_train.shape)
print("X_val scaled shape:   ", X_val_scaled.shape)
print("y_val shape:          ", y_val.shape)
print("X_test scaled shape:  ", X_test_scaled.shape)
print("y_test shape:         ", y_test.shape)


In [None]:
# Place the scaled features next to the imputed (unscaled) features, column-wise
X_train_combined = np.hstack((X_train_scaled, X_train_imputed))

# Column labels: original names for the scaled half, "<name>_imputed" for the other half
base_names = list(X_train.columns)
columns_combined = base_names + [f"{name}_imputed" for name in base_names]
X_train_combined_df = pd.DataFrame(data=X_train_combined, columns=columns_combined)

# Report the resulting shapes
print("X_train_combined shape: ", X_train_combined.shape)
print("X_train_combined_df shape: ", X_train_combined_df.shape)
print("y_train shape:           ", y_train.shape)

In [None]:
### Feature Selection - SelectKBest
# Three alternative reductions to k features are computed; each is fitted on the
# training data and then applied to the test data.
k = 10  # choose the number of top features you want
# Univariate selection scored with f_classif, on the scaled features
selector_kbest = SelectKBest(f_classif, k=k)
X_train_selected_kbest = selector_kbest.fit_transform(X_train_scaled, y_train)
X_test_selected_kbest = selector_kbest.transform(X_test_scaled)

# Feature Selection - Recursive Feature Elimination (RFE)
# Uses a random forest as the ranking estimator, on the imputed (unscaled) features.
# NOTE(review): the forest has no random_state, so the selected features may
# differ between runs — confirm whether that is acceptable.
estimator = RandomForestClassifier()
selector_rfe = RFE(estimator, n_features_to_select=k)
X_train_selected_rfe = selector_rfe.fit_transform(X_train_imputed, y_train)
X_test_selected_rfe = selector_rfe.transform(X_test_imputed)

# Dimensionality Reduction - PCA
# Projects the scaled features onto k principal components
pca = PCA(n_components=k)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Check shapes after feature selection (only the SelectKBest outputs are printed)
print("X_train_selected_kbest shape:", X_train_selected_kbest.shape)
print("X_test_selected_kbest shape:", X_test_selected_kbest.shape)

In [None]:
# Remove constant features: fit a default VarianceThreshold on a copy of the
# training frame and keep the transformed array
selector = VarianceThreshold()
X_train_selected = selector.fit_transform(X_train.copy())

BASELINE
1. Trains a set of models like Logistic Regression, Random Forest, SVM etc.
2. Applies different feature selection methods for some models (RFE, SelectKBest etc).
3. Evaluates each model on train and test sets using classification metrics and cross-validation.
4. The approach is more basic, training each model separately without ensembling

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Neural network with an explicit tolerance and a raised iteration cap
model_nn = MLPClassifier(learning_rate='adaptive', max_iter=500, tol=1e-4)

# Gaussian Naive Bayes with an adjusted smoothing parameter
model_nb = GaussianNB(var_smoothing=1e-3)

# Each entry is (display name, estimator, optional feature selector / reducer).
# The tuned model_nb / model_nn instances sit directly in the list so they are
# the models that get evaluated, rather than being fitted on the side and discarded.
models = [
    ('Logistic Regression', LogisticRegression(), None),
    ('Random Forest', RandomForestClassifier(n_estimators=100), SelectKBest(k=10)),
    ('Support Vector Machine', SVC(), RFE(estimator=RandomForestClassifier(), n_features_to_select=10)),
    ('PCA + Logistic Regression', LogisticRegression(), PCA(n_components=10)),
    ('Gradient Boosting', GradientBoostingClassifier(), None),
    ('AdaBoost', AdaBoostClassifier(), None),
    ('K-Nearest Neighbors', KNeighborsClassifier(), None),
    ('Naive Bayes', model_nb, None),
    ('Neural Network', model_nn, None),
    ('XGBoost', XGBClassifier(), None),
]

# Loop over each model
for model_name, model, feature_selector in models:
    if feature_selector is not None:
        # Fit the selector on the training data, then apply it to both splits
        X_train_selected = feature_selector.fit_transform(X_train_scaled, y_train)
        X_test_selected = feature_selector.transform(X_test_scaled)
    else:
        # No selector: use all scaled features
        X_train_selected = X_train_scaled
        X_test_selected = X_test_scaled

    # Train the model
    model.fit(X_train_selected, y_train)

    # Predictions
    y_pred_train = model.predict(X_train_selected)
    y_pred_test = model.predict(X_test_selected)

    # Evaluate the model
    print(f"Model: {model_name}")
    print("Training Set:")
    print(classification_report(y_train, y_pred_train))
    print("Test Set:")
    print(classification_report(y_test, y_pred_test))

    # 5-fold cross-validation on the training set. When a selector is used it is
    # wrapped with the model in a Pipeline, so it is re-fitted inside every fold;
    # scoring on features selected from the full training set would let the
    # held-out fold influence the selection.
    if feature_selector is not None:
        cv_estimator = Pipeline([('select', feature_selector), ('model', model)])
    else:
        cv_estimator = model
    cv_scores = cross_val_score(cv_estimator, X_train_scaled, y_train, cv=5)
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Mean CV Score: {np.mean(cv_scores)}\n")

ML modeling pipeline
1. Focuses on creating a stacked ensemble model with base models and meta model
2. Tuned Random Forest hyperparameters using GridSearchCV
3. Stacks base models like Logistic Regression, Random Forest using meta-learner
4. Logs metrics and parameters with MLflow for tracking experiments
5. Includes model explanations using LIME and SHAP
6. Handles validation set and converting probabilities to metrics
7. More productionized approach with MLflow and model diagnostics

In [None]:
import mlflow
import numpy as np
#import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score,classification_report, roc_curve, precision_recall_curve
from sklearn.model_selection import cross_val_score
import lime
import shap
from IPython.display import display

class ChurnPredictor:
    """Stacking-ensemble churn classifier that logs training and evaluation to MLflow.

    Base estimators are registered with :meth:`add_model`; :meth:`fit` stacks
    them under a logistic-regression meta-learner.

    Attributes:
        models: Base estimators, in registration order.
        stack: The fitted ``StackingClassifier`` (``None`` until :meth:`fit` runs).
        rf: Random forest tuned by grid search in :meth:`fit`. Its parameters
            are logged to MLflow; it is not part of the stack.
        feature_names_: Column labels of the training data, or ``None`` when
            the training data had no ``columns`` attribute.
    """

    def __init__(self):
        self.models = []
        self.stack = None
        self.rf = None
        self.feature_names_ = None

    def add_model(self, model):
        """Register ``model`` as a base estimator of the stack."""
        self.models.append(model)

    def fit(self, X_train, y_train):
        """Tune a random forest, fit the base models and fit the stacking ensemble.

        Args:
            X_train: Training features (DataFrame or array).
            y_train: Training labels.
        """
        # The context manager ends the run itself. An extra end_run() in a
        # `finally` clause would close the run as finished even while an
        # exception is propagating.
        with mlflow.start_run():
            # Tune RF hyperparameters
            rf_grid = {'n_estimators': [100, 200], 'max_depth': [4, 6]}
            rf_gs = GridSearchCV(RandomForestClassifier(), rf_grid, cv=5)
            rf_gs.fit(X_train, y_train)
            self.rf = rf_gs.best_estimator_

            # Log RF params with MLflow
            mlflow.log_params(self.rf.get_params())

            # Train base models
            for model in self.models:
                model.fit(X_train, y_train)

            # Create and fit the stacking ensemble
            self.stack = StackingClassifier(
                estimators=[(str(i), model) for i, model in enumerate(self.models)],
                final_estimator=LogisticRegression(),
                stack_method='predict_proba'
            )
            self.stack.fit(X_train, y_train)

            # Remember the training columns so evaluate() does not depend on a
            # global variable; plain arrays have no columns.
            self.feature_names_ = getattr(X_train, 'columns', None)
            if self.feature_names_ is not None:
                mlflow.log_param('feature_names', ', '.join(map(str, self.feature_names_)))

    def predict(self, X_test):
        """Return the positive-class probability for every row of ``X_test``.

        Raises:
            RuntimeError: If called before :meth:`fit`.
            ValueError: If the class probabilities of a row do not sum to 1.
        """
        if self.stack is None:
            raise RuntimeError("ChurnPredictor.predict() called before fit().")

        # Assuming binary classification: column 1 is the positive class
        proba_scores = self.stack.predict_proba(X_test)

        # Explicit check instead of `assert`, which is stripped under `python -O`
        if not np.allclose(np.sum(proba_scores, axis=1), 1.0):
            raise ValueError("Probabilities do not sum to 1")

        return proba_scores[:, 1]

    def _validate_input(self, X):
        """Return ``X`` as a DataFrame.

        A DataFrame is returned unchanged; a NumPy array is wrapped with
        columns named "0", "1", ...

        Raises:
            ValueError: If ``X`` is neither a DataFrame nor a NumPy array.
        """
        if isinstance(X, pd.DataFrame):
            return X
        elif isinstance(X, np.ndarray):
            # Convert NumPy array to DataFrame, assuming columns are named numerically
            return pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
        else:
            raise ValueError("Input must be a DataFrame or a NumPy array.")

    def evaluate(self, X_test, y_test):
        """Score the stack on a held-out set, log metrics and plot SHAP values.

        Args:
            X_test: Test features (DataFrame or NumPy array).
            y_test: True labels; values above 0.5 count as the positive class.

        Returns:
            The positive-class probabilities predicted for ``X_test``.

        Raises:
            ValueError: If ``y_test`` is None or ``X_test`` has an unsupported type.
        """
        if y_test is None:
            raise ValueError("y_test cannot be None.")

        with mlflow.start_run():
            X_test_df = self._validate_input(X_test)

            # Use the columns recorded by fit(); fall back to the test frame's
            # own columns when the model was trained on a plain array
            if self.feature_names_ is not None:
                feature_names = self.feature_names_
                X_test_df = pd.DataFrame(X_test, columns=feature_names)
            else:
                feature_names = X_test_df.columns

            # Get predictions
            predictions = self.predict(X_test)
            # Convert probabilities to binary predictions
            binary_predictions = (predictions > 0.5).astype(int)
            # Convert labels to a binary NumPy array (accepts Series, list or array)
            binary_labels = (np.asarray(y_test) > 0.5).astype(int)

            # Model evaluation
            print(classification_report(binary_labels, binary_predictions))

            print(type(binary_predictions))
            print(type(binary_labels))
            print(binary_predictions.shape)
            print(binary_labels.shape)

            # Log metrics with MLflow
            mlflow.log_metric("test_accuracy", accuracy_score(binary_labels, binary_predictions))
            mlflow.log_metric("test_roc_auc", roc_auc_score(binary_labels, predictions))

            # Print and display feature_names
            print("Feature Names:", feature_names)
            display(feature_names)

            # Explainability: the LIME explanation is currently disabled, only SHAP is produced.
            # NOTE(review): KernelExplainer is run over every test row, which may
            # be slow on larger test sets — consider explaining a sample.
            background = shap.sample(X_test_df, 100)
            explainer = shap.KernelExplainer(self.stack.predict, background)
            shap_values = explainer.shap_values(X_test_df)
            shap.summary_plot(shap_values, X_test_df)

            return predictions

    def evaluate_cv(self, X, y, cv=5):
        """Cross-validate the stack's accuracy and log mean and std to MLflow.

        Args:
            X: Features.
            y: Labels.
            cv: Number of folds (or any ``cv`` value ``cross_val_score`` accepts).
        """
        with mlflow.start_run():
            # Use cross_val_score for cross-validated evaluation
            scores = cross_val_score(self.stack, X, y, cv=cv, scoring='accuracy')

            # Log the average and standard deviation of accuracy across folds
            mlflow.log_metric("cv_accuracy_mean", scores.mean())
            mlflow.log_metric("cv_accuracy_std", scores.std())




In [None]:
predictor = ChurnPredictor()

# Add models
predictor.add_model(LogisticRegression(max_iter=1000))
predictor.add_model(RandomForestClassifier())

# Fit and evaluate
predictor.fit(X_train, y_train)
predictions = predictor.evaluate(X_test, y_test)
predictor.evaluate_cv(X_train, y_train)

# ROC Curve, on its own figure so it is not overlaid with the PR curve
fpr, tpr, _ = roc_curve(y_test, predictions)
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, predictions)
plt.figure()
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

# Cross-validation
# The target column is removed from the features (if still present) so the
# scores are not inflated by label leakage.
# NOTE(review): `model` is whatever estimator an earlier cell left bound to that
# name — confirm it is the one meant to be cross-validated here.
print("cross_val_score:")
print(cross_val_score(model, X.drop(columns=['Exited'], errors='ignore'), y, cv=5))

# Log metrics inside an explicit run. Logging with no active run makes MLflow
# open an implicit run that stays active and blocks the next start_run().
with mlflow.start_run():
    mlflow.log_metric("test_roc_auc", roc_auc_score(y_test, predictions))

In [None]:
print(y_test.unique())



In [None]:
# customize model

import torch
import torch.nn as nn
import torch.nn.functional as F


class RNNModule(nn.Module):
    """LSTM encoder followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of output features.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModule, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        # nn.Linear's third positional parameter is `bias`; the stray 32 that
        # used to be passed here only acted as a truthy bias flag and never
        # set an output size. The output width is `output_size`.
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Encode ``x`` (batch-first sequence) and project the final time step."""
        out, _ = self.lstm(x)
        # out[:, -1, :] selects the LSTM output at the last time step of every sequence
        return self.fc(out[:, -1, :])
    
# this create an instance of RNNModel
#input_size = 1000  # Adjust the input size based on your data
#hidden_size = 32  # Adjust the hidden size based on your preference
#output_size = 32   # Adjust the output size based on your task
#rn_module = rn_module(input_size, hidden_size, output_size)
        
class GraphConv(nn.Module):
    """Basic graph convolution: ``adj @ (features @ W)``.

    Args:
        in_features: Width of the incoming node features.
        out_features: Width of the produced node features.
    """

    def __init__(self, in_features, out_features):
        super(GraphConv, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Learnable projection from in_features to out_features. Without it the
        # layer returned in_features-wide output and ignored both size arguments.
        self.linear = nn.Linear(in_features, out_features, bias=False)

    def forward(self, adj, features):
        """Project the node features, then aggregate them over ``adj``.

        Args:
            adj: Adjacency matrix.
            features: Node features whose last dimension is ``in_features``.
        """
        return torch.matmul(adj, self.linear(features))


class GNNModule(nn.Module):
    """One graph convolution followed by a linear layer.

    Args:
        input_features: Width of the node features.
        hidden_features: Accepted for interface compatibility; currently unused.
        output_features: Width of the graph-convolution output and of the result.
    """

    def __init__(self, input_features, hidden_features, output_features):
        super(GNNModule, self).__init__()

        # GraphConv maps input_features -> output_features, which is the width
        # the following linear layer expects
        self.gc = GraphConv(input_features, output_features)
        self.fc = nn.Linear(output_features, output_features)

    def forward(self, adj, features):
        """Apply the graph convolution and then the linear layer."""
        gc_output = self.gc(adj, features)
        output = self.fc(gc_output)
        return output

# Now you can create an instance of GNNModule
#input_features = 64  # Adjust based on your graph node features
#hidden_features = 32  # Adjust based on your preference
#output_features = 16  # Adjust based on your needs
#gnn_module(input_features=64, hidden_features=32,output_features=16)
        
class NCFModule(nn.Module):
    """Embeds user and item ids, concatenates them and applies a three-layer MLP.

    Args:
        input_size: Number of rows in each embedding table.
        hidden_size: Embedding width for both users and items.
        output_size: Width of the final layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        # Lookup tables (one for users, one for items)
        self.user_embedding = nn.Embedding(input_size, hidden_size)
        self.item_embedding = nn.Embedding(input_size, hidden_size)

        # MLP over the concatenated pair: 2 * hidden_size -> 64 -> 32 -> output_size
        joint_width = 2 * hidden_size
        self.fc1 = nn.Linear(joint_width, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_size)

        # Shared non-linearity
        self.relu = nn.ReLU()

    def forward(self, user_ids, item_ids):
        """Return the MLP output for each (user id, item id) pair."""
        pair = torch.cat(
            (self.user_embedding(user_ids), self.item_embedding(item_ids)),
            dim=1,
        )
        hidden = self.relu(self.fc1(pair))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# usage
#input_size = 1000  # Adjust based on the number of unique users/items in your dataset
#hidden_size = 50  # Adjust based on your preference
#output_size = 32  # Output size, e.g., for binary classification
#ncf_module = ncf_module(input_size, hidden_size, output_size)

class TNRFinalModel(nn.Module):
    """Runs the RNN, GNN and NCF sub-modules and concatenates their outputs.

    All three sub-modules are built with the same ``hidden_size`` and
    ``output_size``; the result stacks their outputs along dimension 1.
    """

    def __init__(self, rn_input_size, gnn_input_features, ncf_input_size, hidden_size, output_size):
        super().__init__()

        self.rnn_module = RNNModule(rn_input_size, hidden_size, output_size)
        self.gnn_module = GNNModule(gnn_input_features, hidden_size, output_size)
        self.ncf_module = NCFModule(ncf_input_size, hidden_size, output_size)

    def forward(self, rnn_input, gnn_adj, gnn_features, ncf_user_ids, ncf_item_ids):
        """Return the three sub-module outputs joined along dimension 1."""
        branch_outputs = (
            self.rnn_module(rnn_input),
            self.gnn_module(gnn_adj, gnn_features),
            self.ncf_module(ncf_user_ids, ncf_item_ids),
        )
        return torch.cat(branch_outputs, dim=1)





class tnr_final_model(nn.Module):
    """RNN + GNN + NCF branches with a two-layer sigmoid classification head.

    Args:
        rn_input_size: Features per time step for the RNN branch.
        gnn_input_features: Node-feature width for the GNN branch.
        ncf_input_size: Embedding-table size for the NCF branch.
        hidden_size: Hidden size shared by the three branches.
        output_size: Output width of the RNN branch.
    """

    def __init__(self, rn_input_size, gnn_input_features, ncf_input_size, hidden_size, output_size):
        super(tnr_final_model, self).__init__()

        # The GNN and NCF branches use a fixed output width
        branch_width = 16

        self.rnn_module = RNNModule(rn_input_size, hidden_size, output_size)
        self.gnn_module = GNNModule(gnn_input_features, hidden_size, output_features=branch_width)
        self.ncf_module = NCFModule(ncf_input_size, hidden_size, output_size=branch_width)

        # The head consumes the concatenated branch OUTPUTS, so its input width
        # is the sum of the three output widths, not of the branch input sizes.
        self.layer1 = nn.Linear(output_size + 2 * branch_width, 32)
        self.layer2 = nn.Linear(32, 1)

    def forward(self, x, adj, features, user_ids, item_ids):
        """Return one sigmoid value per row from the concatenated branch outputs."""
        rnn_out = self.rnn_module(x)
        gnn_out = self.gnn_module(adj, features)
        ncf_out = self.ncf_module(user_ids, item_ids)

        concat = torch.cat([rnn_out, gnn_out, ncf_out], dim=1)
        x = F.relu(self.layer1(concat))  # Apply ReLU activation
        x = torch.sigmoid(self.layer2(x))
        return x

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Trains TNRFinalModel with Adam and binary cross-entropy over mini-batches.
# Model training hyperparameters
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 1e-3


# Sizes handed to the three sub-modules of the model
rn_input_size = 1000  
gnn_input_size = 64
ncf_input_size = 1000
hidden_size = 50
output_size = 1

# Instantiate model
# NOTE(review): TNRFinalModel concatenates three output_size-wide branch outputs
# along dim=1 and applies no sigmoid, while the labels below are unsqueezed to
# one column and the loss is nn.BCELoss — the shapes and value range look
# incompatible; confirm whether tnr_final_model (which has a sigmoid head) was intended.
model = TNRFinalModel(rn_input_size, gnn_input_size, ncf_input_size, hidden_size, output_size)

# Get model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)  
loss_fn = nn.BCELoss()

# Extract input tensors from preprocessed data
# NOTE(review): the names on the right-hand side are not defined anywhere in the
# visible notebook; they presumably come from a preprocessing step — confirm.
# This also rebinds y_train, replacing the labels produced by the earlier split.
X_seq = rnn_sequence_data
A_train = adjacency_matrix
X_feat = customer_features
user_ids = user_ids_data
item_ids = item_ids_data
y_train = churn_labels

# Bundle the tensors so each batch yields one aligned slice of every input
dataset = TensorDataset(X_seq, A_train, X_feat, user_ids, item_ids, y_train)
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE)

for epoch in range(EPOCHS):
    for seq_batch, adj_batch, feat_batch, user_batch, item_batch, label_batch in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        output = model(seq_batch, adj_batch, feat_batch, user_batch, item_batch)
        loss = loss_fn(output, label_batch.unsqueeze(1).float())  # Ensure labels are in the correct format

        # Backward pass
        loss.backward()

        optimizer.step()

    # Reports the loss of the last batch of the epoch, not an epoch average
    print(f'Epoch {epoch + 1} | Loss: {loss.item():.4f}')

# Usage example
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Re-split the data for the ColumnTransformer example.
# The target column is removed from the features if it is still present, and
# the split is stratified like the earlier train/validation/test split.
features = X.drop(columns=['Exited'], errors='ignore')
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42, stratify=y
)

# Work on explicit copies so the column assignments below do not write into
# slices of the source frame
X_train = X_train.copy()
X_test = X_test.copy()

# Handle unknown categories in training data
for col in cat_cols:
    X_train[col] = X_train[col].replace(['unknown_category_train'], np.nan)

# Handle unknown categories in test data
for col in cat_cols:
    X_test[col] = X_test[col].replace(['unknown_category_test'], np.nan)

# Create a ColumnTransformer to apply OneHotEncoder to categorical columns.
# handle_unknown='ignore' encodes values that appear only in the test data as
# all zeros; the default ('error') makes transform() raise on any unseen value.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_encoded = preprocessor.fit_transform(X_train)

# Transform the test data with the encoder fitted on the training data
X_test_encoded = preprocessor.transform(X_test)
