Loading Telco customer churn dataset

In [2]:
import pandas as pd

file_path = 'sample-data.csv' 
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Data Pre-processing - checking missing values

In [5]:
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

df['TotalCharges'] = df['TotalCharges'].replace(' ', pd.NA)  
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce') 

# Fill missing values
df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)  
df['tenure'].fillna(df['tenure'].median(), inplace=True) 
df['MonthlyCharges'].fillna(df['MonthlyCharges'].median(), inplace=True)


Series([], dtype: int64)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['tenure'].fillna(df['tenure'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

Feature Engineering

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


df_encoded = pd.get_dummies(df, drop_first=True)

# Separate features and target variable
X = df_encoded.drop('Churn_Yes', axis=1)
y = df_encoded['Churn_Yes']  # Target variable

# Identify the numerical columns to scale
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Apply scaling only to numerical features
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])

X.head()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)



(5634, 7072) (1409, 7072) (5634,) (1409,)


Model Selection and Training

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import joblib


# model = LogisticRegression()
# model.fit(X_train, y_train)

model = LogisticRegression(max_iter=200)  # Increase the number of iterations
model.fit(X_train, y_train)


# Save model and scaler
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')

y_pred = model.predict(X_test)

# Evaluate the model
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 4)*100}%")
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 82.19%
Classification Report:
               precision    recall  f1-score   support

       False       0.86      0.90      0.88      1036
        True       0.69      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.82      0.82      0.82      1409

