<a href="https://colab.research.google.com/github/Sriram624/Analytics-app/blob/main/Ml_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-learn pandas numpy




In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (StratifiedKFold, cross_val_score,cross_validate,
                                     train_test_split)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (roc_auc_score,precision_score,accuracy_score,recall_score,f1_score,confusion_matrix,classification_report)

In [None]:
from google.colab import files
uploaded = files.upload()


Saving Telco_customer_churn.csv to Telco_customer_churn.csv


In [None]:
df = pd.read_csv("Telco_customer_churn.csv")



In [None]:
def data_audit(df):
  audit = pd.DataFrame({
      "dtype": df.dtypes,
      "missing": df.isna().sum(),
      "unique": df.nunique().values,
      "missing_percent": (df.isnull().sum() / len(df) * 100).values,
      "unique_values": df.nunique().values
  })
  audit = audit.sort_values("missing_percent", ascending=False)
  audit.reset_index(drop=True, inplace=True)
  return audit.sort_values("missing_percent", ascending=False)
audit_report =  data_audit(df)
audit_report.head(10)


Unnamed: 0,dtype,missing,unique,missing_percent,unique_values
0,object,5174,20,73.463013,20
1,object,0,7043,0.0,7043
2,int64,0,1,0.0,1
3,object,0,1,0.0,1
4,object,0,1,0.0,1
5,int64,0,1652,0.0,1652
6,object,0,1652,0.0,1652
7,float64,0,1652,0.0,1652
8,object,0,1129,0.0,1129
9,object,0,2,0.0,2


In [None]:
TARGET_COL = "Churn Value"
y = df[TARGET_COL].values


In [None]:
LEAKAGE_COLS = [
    "Churn Value",
    "Churn Label",
    "Churn Score",
    "Churn Reason",
    "CustomerID",
    "Lat Long"
]

X = df.drop(columns=LEAKAGE_COLS)

In [None]:
for col in X.select_dtypes(include=["int64","float64"]).columns:
  X[col] = X[col].fillna(X[col].mean())
for col in X.select_dtypes(include=["object"]).columns:
  X[col] = X[col].fillna(X[col].mode()[0])




In [None]:
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(include=["int64","float64"]).columns

In [None]:
preprocessor = ColumnTransformer(
    transformers = [
        ("num", StandardScaler(), numerical_cols),
        ("cat",OneHotEncoder(
            drop = "first",
            handle_unknown = "ignore",
            sparse_output = False
        ),categorical_cols)

    ]

)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [None]:
def build_model(model_type="logistic"):
    if model_type == "logistic":
        model = LogisticRegression(
            max_iter=1000,
            class_weight="balanced"
        )
    elif model_type == "random_forest":
        model = RandomForestClassifier(
            n_estimators=300,
            max_depth=14,
            min_samples_split=10,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1
        )
    else:
        raise ValueError("Unknown model type")

    return Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])


In [None]:
def evaluate_model(pipeline,X,y):
  cv = StratifiedKFold(n_splits=5, shuffle = True, random_state = 42)
  scores = cross_validate(
      pipeline,
      X,
      y,
      cv=cv,
      scoring=["roc_auc","accuracy","precision","recall","f1"],
      return_train_score=False
  )
  return {
        "ROC_AUC": scores["test_roc_auc"].mean(),
        "Precision": scores["test_precision"].mean(),
        "Recall": scores["test_recall"].mean(),
        "F1": scores["test_f1"].mean()
    }


In [None]:
logistic_pipeline = build_model(model_type="logistic")
rf_pipeline = build_model(model_type = "random_forest")
logistic_scores = evaluate_model(logistic_pipeline,X_train,y_train)
rf_scores = evaluate_model(rf_pipeline,X_train,y_train)
print("Logistic Regression:", logistic_scores)
print("Random Forest:", rf_scores)



Logistic Regression: {'ROC_AUC': np.float64(0.8475068205174388), 'Precision': np.float64(0.5473387666882392), 'Recall': np.float64(0.7454203524413179), 'F1': np.float64(0.6310454742547206)}
Random Forest: {'ROC_AUC': np.float64(0.8361017625689267), 'Precision': np.float64(0.5019396214353712), 'Recall': np.float64(0.827101762206589), 'F1': np.float64(0.6245465743082126)}


In [None]:
final_model = rf_pipeline
final_model.fit(X_train, y_train)


In [None]:
y_pred = final_model.predict(X_test)
y_prob = final_model.predict_proba(X_test)[:, 1]
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



Accuracy: 0.7260468417317246
ROC-AUC: 0.8335926660059465
[[691 318]
 [ 68 332]]
              precision    recall  f1-score   support

           0       0.91      0.68      0.78      1009
           1       0.51      0.83      0.63       400

    accuracy                           0.73      1409
   macro avg       0.71      0.76      0.71      1409
weighted avg       0.80      0.73      0.74      1409



In [None]:
def predict_with_threshold(probs, threshold):
    return (probs >= threshold).astype(int)
threshold = 0.35
custom_pred = predict_with_threshold(y_prob, threshold)
print("Custom Threshold Confusion Matrix:")
print(confusion_matrix(y_test, custom_pred))


Custom Threshold Confusion Matrix:
[[367 642]
 [ 23 377]]


In [None]:
ohe = final_model.named_steps["preprocessor"] \
                 .named_transformers_["cat"]
cat_features = ohe.get_feature_names_out(categorical_cols)
all_features = np.concatenate([numerical_cols, cat_features])
importances = final_model.named_steps["model"].feature_importances_
feature_importance = (
    pd.DataFrame({
        "feature": all_features,
        "importance": importances
    })
    .sort_values("importance", ascending=False)
)

feature_importance.head(20)


Unnamed: 0,feature,importance
1157,Contract_Two year,0.09075
4,Tenure Months,0.073744
1142,Internet Service_Fiber optic,0.055032
1138,Dependents_Yes,0.052596
5,Monthly Charges,0.047761
1160,Payment Method_Electronic check,0.045345
1143,Internet Service_No,0.042564
1144,Online Security_No internet service,0.037345
1154,Streaming Movies_No internet service,0.035637
1150,Tech Support_No internet service,0.035252
