In [4]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import shap
import lime.lime_tabular
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sklearn.tree import DecisionTreeClassifier, plot_tree


In [5]:
def preprocess_data(df, target_variable='churn', cap_limit=0.95):
    irrelevant_cols = ['clientnum', 'customerid', 'surname', 'visitorid', 'id']
    df = df.drop([col for col in irrelevant_cols if col in df.columns], axis=1, errors='ignore')
    
    for col in df.columns:
        if df[col].dtype in ['float64', 'int64']:
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna(df[col].mode()[0], inplace=True)
    
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        upper_limit = df[col].quantile(cap_limit)
        df[col] = df[col].clip(upper=upper_limit)
    
    categorical_cols = df.select_dtypes(include=['object']).columns
    df[categorical_cols] = df[categorical_cols].astype(str)
    
    X = df.drop(target_variable, axis=1)
    y = df[target_variable]
    
    return X, y

def perform_eda(df):
    print("Dataset Information:")
    print(df.info())
    print("\nMissing Values:")
    print(df.isnull().sum())
    print("\nDescriptive Statistics:")
    print(df.describe(include='all'))
    
    plt.figure(figsize=(8, 4))
    sns.countplot(x='churn', data=df)
    plt.title('Distribution of Target Variable')
    plt.show()
    
    plt.figure(figsize=(12, 8))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap')
    plt.show()

def feature_engineering(X):
    numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
    if not numeric_cols.empty:
        X[numeric_cols] = (X[numeric_cols] - X[numeric_cols].mean()) / X[numeric_cols].std()
    
    categorical_cols = X.select_dtypes(include=['object']).columns
    if not categorical_cols.empty:
        X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
    
    return X


In [6]:
def build_pipeline(X):
    categorical_cols = X.select_dtypes(include=['object']).columns
    numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
    
    numerical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ])
    
    return preprocessor

def split_data(X, y):
    return train_test_split(X, y, test_size=0.3, random_state=42)

def train_models(X_train, X_test, y_train, y_test):
    models = {
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
        'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
        'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=42)
    }
    
    for model_name, model in models.items():
        print(f"Training {model_name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"{model_name} Accuracy:", accuracy_score(y_test, y_pred))
        print(f"{model_name} Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print(f"{model_name} Classification Report:\n", classification_report(y_test, y_pred))

def global_explanations(model, X):
    explainer = shap.Explainer(model)
    shap_values = explainer(X)
    shap.summary_plot(shap_values, X)

def local_explanations(model, X, sample_index=0):
    explainer = lime.lime_tabular.LimeTabularExplainer(
        training_data=X.values,
        feature_names=X.columns,
        class_names=['not_churn', 'churn'],
        mode='classification'
    )
    explanation = explainer.explain_instance(X.iloc[sample_index], model.predict_proba)
    explanation.show_in_notebook()

def surrogate_model(model, X):
    dt = DecisionTreeClassifier()
    dt.fit(X, y)
    plt.figure(figsize=(20, 10))
    _ = plot_tree(dt, feature_names=X.columns, filled=True)
    plt.show()


In [7]:
def churn_prediction_pipeline(file_path, target_variable='churn'):
    df = pd.read_csv(file_path)
    perform_eda(df)
    X, y = preprocess_data(df, target_variable)
    X = feature_engineering(X)
    preprocessor = build_pipeline(X)
    X = preprocessor.fit_transform(X)
    X_train, X_test, y_train, y_test = split_data(X, y)
    train_models(X_train, X_test, y_train, y_test)


In [8]:
app = FastAPI()

class Features(BaseModel):
    features: dict

models = {
    'dataset1': joblib.load('model_dataset1.pkl'),
    'dataset2': joblib.load('model_dataset2.pkl'),
}

@app.post("/predict/")
def predict(dataset_name: str, features: Features):
    if dataset_name not in models:
        raise HTTPException(status_code=404, detail="Dataset not found")
    
    model = models[dataset_name]
    df = pd.DataFrame([features.features])
    X = feature_engineering(df)
    X_transformed = preprocessor.transform(X)
    prediction = model.predict(X_transformed)
    
    return {"prediction": prediction[0]}


FileNotFoundError: [Errno 2] No such file or directory: 'model_dataset1.pkl'