In [1]:
import pandas as pd

data_path = "../data/cleaned_churn_data.csv"
df = pd.read_csv(data_path)

df.head()



Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,1,False,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


In [10]:
# Rename columns: remove "_Yes" suffix to get original names
df.rename(columns=lambda x: x.replace('_Yes', '') if x.endswith('_Yes') else x, inplace=True)

# Optional: also remove "No internet service" or similar if you want cleaner names
df.rename(columns=lambda x: x.replace('_No internet service', '').replace('_No phone service', ''), inplace=True)

# Check updated columns
print(df.columns)


Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'gender_Male', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'MultipleLines', 'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity', 'OnlineSecurity', 'OnlineBackup', 'OnlineBackup',
       'DeviceProtection', 'DeviceProtection', 'TechSupport', 'TechSupport',
       'StreamingTV', 'StreamingTV', 'StreamingMovies', 'StreamingMovies',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')


In [11]:
from sklearn.preprocessing import LabelEncoder

# Copy dataset
df_encoded = df.copy()

# Binary columns (Yes/No)
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
le = LabelEncoder()
for col in binary_cols:
    df_encoded[col] = le.fit_transform(df_encoded[col])

# One-hot encoding for multi-category columns
df_encoded = pd.get_dummies(df_encoded, drop_first=True)

df_encoded.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner,Dependents,PhoneService,MultipleLines,...,StreamingTV,StreamingTV.1,StreamingMovies,StreamingMovies.1,Contract_One year,Contract_Two year,PaperlessBilling,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,1,0,0,True,...,False,False,False,False,False,False,1,False,True,False
1,0,34,56.95,1889.5,0,True,0,0,1,False,...,False,False,False,False,True,False,0,False,False,True
2,0,2,53.85,108.15,1,True,0,0,1,False,...,False,False,False,False,False,False,1,False,False,True
3,0,45,42.3,1840.75,0,True,0,0,0,True,...,False,False,False,False,True,False,0,False,False,False
4,0,2,70.7,151.65,1,False,0,0,1,False,...,False,False,False,False,False,False,1,False,True,False


In [16]:
from sklearn.model_selection import train_test_split

# Separate features and target
X = df.drop("Churn", axis=1).values
y = df["Churn"].values

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print(type(y))
print(type(X))

Training set shape: (5634, 30)
Test set shape: (1409, 30)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit scaler on training data only
X_train_scaled = scaler.fit_transform(X_train)

# Apply the same transformation to test data
X_test_scaled = scaler.transform(X_test)

print("Features scaled.")


Features scaled.


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the model
model = LogisticRegression(random_state=42, max_iter=1000)

# Train the model on scaled training data
model.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = model.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Test Accuracy: 0.8070

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.57      0.61       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409


Confusion Matrix:
[[925 110]
 [162 212]]
