In [None]:
import os
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Read the prepared churn dataset from disk.
TCC_df = pd.read_csv('Telco-Customer-Churn-prepared.csv')

# Separate the 'Churn' target from the remaining feature columns.
y = TCC_df['Churn']
X = TCC_df.drop(columns='Churn')

# Two-stage hold-out split (roughly 70/20/10):
#   stage 1 keeps 70% for training and sets 30% aside,
#   stage 2 sends about one third of that 30% to the test set and the
#   rest to the validation set.
# Both stages use the same seed so the split is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.3333,
    random_state=42,
)

# Scale numerical features
# The scaler is fitted on the training split only; the validation and test
# splits are transformed with the statistics learned from the training data.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'charge_per_month']
scaler = StandardScaler()

# The splits were produced by indexing another DataFrame. Take explicit
# copies before assigning columns so pandas does not emit
# SettingWithCopyWarning (version dependent) and the writes are guaranteed
# to land on these frames.
X_train = X_train.copy()
X_val = X_val.copy()
X_test = X_test.copy()

X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Save scaler for Flask deployment
# Create the output directory if it is missing, and use a context manager so
# the file handle is flushed and closed deterministically.
os.makedirs('models', exist_ok=True)
with open('models/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Cross-validated search for LogisticRegression.
# Every hyper-parameter lists exactly one value, so the "grid" holds a single
# candidate and the search amounts to a 5-fold CV score of that configuration.
lr_param_grid = dict(
    C=[0.01],
    penalty=['l2'],
    solver=['liblinear'],
    class_weight=['balanced'],
    max_iter=[1000],
)
lr = LogisticRegression(random_state=42)
lr_grid = GridSearchCV(
    estimator=lr,
    param_grid=lr_param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,  # use all available cores
)
lr_grid.fit(X_train, y_train)

# Cross-validated search for RandomForestClassifier.
# As with the logistic-regression search, each hyper-parameter has a single
# value, so one configuration is scored with 5-fold CV using F1.
rf_param_grid = dict(
    n_estimators=[197],
    max_depth=[7],
    min_samples_split=[7],
    min_samples_leaf=[2],
    class_weight=['balanced'],
    max_features=['sqrt'],
)
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,  # use all available cores
)
rf_grid.fit(X_train, y_train)

# Keep the estimator selected by each search for the evaluation steps below.
best_lr = lr_grid.best_estimator_
best_rf = rf_grid.best_estimator_

# Report the chosen parameters and the mean cross-validated F1 of each search.
for _params_label, _score_label, _search in (
    ("\nBest LogisticRegression Parameters:", "Best LogisticRegression F1-Score (CV):", lr_grid),
    ("\nBest RandomForest Parameters:", "Best RandomForest F1-Score (CV):", rf_grid),
):
    print(_params_label, _search.best_params_)
    print(_score_label, _search.best_score_)

# Predict on the validation split with both tuned models.
y_pred_lr = best_lr.predict(X_val)
y_pred_rf = best_rf.predict(X_val)

# Print the same four metrics for each model, logistic regression first.
for _header, _preds in (
    ("\nLogistic Regression (Validation):", y_pred_lr),
    ("\nRandom Forest (Validation):", y_pred_rf),
):
    print(_header)
    print("Accuracy:", accuracy_score(y_val, _preds))
    print("F1-Score:", f1_score(y_val, _preds))
    print("Recall:", recall_score(y_val, _preds))
    print("Confusion Matrix:\n", confusion_matrix(y_val, _preds))

# Predict on the held-out test split with both tuned models.
y_pred_lr_test = best_lr.predict(X_test)
y_pred_rf_test = best_rf.predict(X_test)

# Print the same four metrics for each model, logistic regression first.
for _header, _preds in (
    ("\nLogistic Regression (Test):", y_pred_lr_test),
    ("\nRandom Forest (Test):", y_pred_rf_test),
):
    print(_header)
    print("Accuracy:", accuracy_score(y_test, _preds))
    print("F1-Score:", f1_score(y_test, _preds))
    print("Recall:", recall_score(y_test, _preds))
    print("Confusion Matrix:\n", confusion_matrix(y_test, _preds))

# Save best model (based on validation F1-score)
# Model selection must not look at the test split: choosing the winner by its
# test score would make the test metrics printed above an optimistically
# biased estimate. The validation predictions are used instead, so the test
# split remains a genuine hold-out for the final report.
val_f1_lr = f1_score(y_val, y_pred_lr)
val_f1_rf = f1_score(y_val, y_pred_rf)
# Ties go to logistic regression (strict '>' as in the original comparison).
best_model = best_rf if val_f1_rf > val_f1_lr else best_lr

# Create the output directory if needed and close the file deterministically.
os.makedirs('models', exist_ok=True)
with open('models/churn_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("\nBest model saved as 'models/churn_model.pkl'")


Best LogisticRegression Parameters: {'C': 0.01, 'class_weight': 'balanced', 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
Best LogisticRegression F1-Score (CV): 0.6209600101651646

Best RandomForest Parameters: {'class_weight': 'balanced', 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 197}
Best RandomForest F1-Score (CV): 0.630201319016279

Logistic Regression (Validation):
Accuracy: 0.7805397727272727
F1-Score: 0.670926517571885
Recall: 0.8224543080939948
Confusion Matrix:
 [[784 241]
 [ 68 315]]

Random Forest (Validation):
Accuracy: 0.7734375
F1-Score: 0.6588235294117647
Recall: 0.804177545691906
Confusion Matrix:
 [[781 244]
 [ 75 308]]

Logistic Regression (Test):
Accuracy: 0.7602836879432624
F1-Score: 0.6457023060796646
Recall: 0.806282722513089
Confusion Matrix:
 [[382 132]
 [ 37 154]]

Random Forest (Test):
Accuracy: 0.7773049645390071
F1-Score: 0.6680761099365751
Recall: 0.8272251308900523
Confusion Matrix:
