In [1]:
from google.colab import files
uploaded = files.upload()


Saving customers_clean.csv to customers_clean.csv


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Manually define column names based on the first row of the CSV
column_names = ["customerID", "gender", "SeniorCitizen", "Partner", "Dependents", "tenure", "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod", "MonthlyCharges", "TotalCharges", "Churn", "Churn_Flag", "tenure_group", "Charge_Group", "Total_Charge_Group"]

# Load dataset with explicit column names and header=0
df = pd.read_csv("customers_clean.csv", header=0, names=column_names)


# Drop ID
df = df.drop(columns=["customerID"])

# Separate target
y = df["Churn_Flag"]
X = df.drop(columns=["Churn_Flag", "Churn"])  # exclude raw churn

# One-hot encode categorical variables
cat_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]

# Evaluation
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.62      0.49      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.71      1409
weighted avg       0.77      0.79      0.78      1409

ROC-AUC: 0.8220917616058281


In [7]:
# Save churn predictions
out = pd.DataFrame({
    "Actual_Churn": y_test.reset_index(drop=True),
    "Predicted_Churn": y_pred,
    "Churn_Probability": y_proba
})
out.to_csv("churn_predictions.csv", index=False)

# Save top 20 feature importances
fi = pd.DataFrame({
    "feature": X.columns,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False)

fi.head(20).to_csv("feature_importance_top20.csv", index=False)


In [8]:
files.download("churn_predictions.csv")
files.download("feature_importance_top20.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
df[["customerID", "Churn", "tenure", "MonthlyCharges", "TotalCharges"]].dtypes

Unnamed: 0,0
customerID,object
Churn,int64
tenure,object
MonthlyCharges,float64
TotalCharges,object
