In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the data
churn_data = pd.read_csv('telecom_churn_data.csv')

# Data Preprocessing Function
def preprocess_data(df):
    # Drop unnecessary columns including dates
    date_columns = [col for col in df.columns if 'date' in col.lower()]
    df = df.drop(columns=['mobile_number'] + date_columns)
    
    # Fill missing values with 0
    df = df.fillna(0)
    
    # Define churn based on 'arpu_9' as a proxy for churn
    df['churn'] = np.where(df['arpu_9'] == 0, 1, 0)
    
    # Drop all columns related to month 9 as it represents churn phase data
    df = df.drop(columns=[col for col in df.columns if col.endswith('_9')])
    
    return df

# Apply preprocessing
churn_data_processed = preprocess_data(churn_data)

# Split data into features (X) and target (y)
X = churn_data_processed.drop(columns=['churn'])
y = churn_data_processed['churn']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize models
log_reg = LogisticRegression(max_iter=1000, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)

# Train and evaluate Logistic Regression
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
y_pred_log_proba = log_reg.predict_proba(X_test)[:, 1]

# Train and evaluate Random Forest
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
y_pred_rf_proba = rf_model.predict_proba(X_test)[:, 1]

# Train and evaluate XGBoost
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
y_pred_xgb_proba = xgb_model.predict_proba(X_test)[:, 1]

# Calculate performance metrics for each model
metrics = {
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'Accuracy': [
        accuracy_score(y_test, y_pred_log),
        accuracy_score(y_test, y_pred_rf),
        accuracy_score(y_test, y_pred_xgb)
    ],
    'Precision': [
        precision_score(y_test, y_pred_log),
        precision_score(y_test, y_pred_rf),
        precision_score(y_test, y_pred_xgb)
    ],
    'Recall': [
        recall_score(y_test, y_pred_log),
        recall_score(y_test, y_pred_rf),
        recall_score(y_test, y_pred_xgb)
    ],
    'F1 Score': [
        f1_score(y_test, y_pred_log),
        f1_score(y_test, y_pred_rf),
        f1_score(y_test, y_pred_xgb)
    ],
    'AUC-ROC': [
        roc_auc_score(y_test, y_pred_log_proba),
        roc_auc_score(y_test, y_pred_rf_proba),
        roc_auc_score(y_test, y_pred_xgb_proba)
    ]
}

# Convert metrics to DataFrame and display
metrics_df = pd.DataFrame(metrics)
print(metrics_df)


AttributeError: 'str' object has no attribute 'decode'