In [1]:
import pandas as pd

# Telco customer-churn CSV, read straight from the raw file in IBM's
# telco-customer-churn-on-icp4d GitHub repository (master branch).
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first 5 rows
df.head(n=5)



Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
# Dataset dimensions as a (rows, columns) tuple
print("Shape of dataset:", df.shape)


Shape of dataset: (7043, 21)


In [3]:
# Per-column overview: name, non-null count and dtype
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
# Class balance of the target column (how many customers churned vs. stayed)
df['Churn'].value_counts()


Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [6]:
# TotalCharges is not numeric after loading. errors='coerce' turns every value
# that cannot be parsed as a number (e.g. a blank entry) into NaN.
df['TotalCharges'] = pd.to_numeric(arg=df['TotalCharges'], errors='coerce')

# Report how many entries ended up as NaN after the conversion
print("Missing values in TotalCharges:", df['TotalCharges'].isna().sum())


Missing values in TotalCharges: 11


In [7]:
# Keep only the rows where TotalCharges is present (NaN rows are removed)
df = df.dropna(axis=0, subset=['TotalCharges'])
print("New shape:", df.shape)


New shape: (7032, 21)


In [8]:
# Convert target column: Yes = 1, No = 0.
# Guarded so the cell is safe to re-run: pushing an already-encoded 0/1 column
# through the Yes/No dictionary would match no keys and turn every value into
# NaN, silently destroying the target.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

print(df['Churn'].value_counts())


Churn
0    5163
1    1869
Name: count, dtype: int64


In [9]:
# One-hot encode the categorical columns, dropping the first level of each.
# NOTE: customerID is still a column of df at this point, so every distinct ID
# becomes its own dummy column and the encoded frame gets extremely wide.
df_encoded = pd.get_dummies(data=df, drop_first=True)

print("Original shape:", df.shape)
print("Encoded shape:", df_encoded.shape)

# Preview the first 5 rows of the encoded frame
df_encoded.head(n=5)


Original shape: (7032, 21)
Encoded shape: (7032, 7062)


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,customerID_0003-MKNFE,customerID_0004-TLHLJ,customerID_0011-IGKFF,customerID_0013-EXCHZ,customerID_0013-MHZWF,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,1,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False


In [10]:
# customerID is an identifier, not a feature: remove it before encoding
df_no_id = df.drop(columns='customerID')

# One-hot encode the remaining categorical columns (first level dropped)
df_encoded = pd.get_dummies(data=df_no_id, drop_first=True)

print("Original shape without ID:", df_no_id.shape)
print("Encoded shape:", df_encoded.shape)

# Preview the first 5 rows
df_encoded.head(n=5)


Original shape without ID: (7032, 20)
Encoded shape: (7032, 31)


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,1,False,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


In [11]:
# 1. Remove the customerID identifier column
df_no_id = df.drop(columns='customerID')

# 2. One-hot encode what is left, dropping the first level of each category
df_encoded = pd.get_dummies(data=df_no_id, drop_first=True)

# 3. Compare the shapes before and after encoding
print("Original shape (without ID):", df_no_id.shape)
print("Encoded shape:", df_encoded.shape)

# 4. Preview the first 3 rows to inspect the generated dummy columns
df_encoded.head(n=3)

Original shape (without ID): (7032, 20)
Encoded shape: (7032, 31)


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True


In [12]:
from sklearn.model_selection import train_test_split

# 1. Separate the features (X) from the target (y)
X = df_encoded.drop(columns='Churn')  # every column except the target
y = df_encoded['Churn']  # the label we want to predict

# 2. Hold out 20% of the rows for testing; random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# 3. Confirm the sizes of the four pieces
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training target shape:", y_train.shape)
print("Testing target shape:", y_test.shape)

Training features shape: (5625, 30)
Testing features shape: (1407, 30)
Training target shape: (5625,)
Testing target shape: (1407,)


In [13]:
# 1. Import the model
from sklearn.linear_model import LogisticRegression

# 2. Create the model instance.
# max_iter=1000 only raises the solver's iteration cap; it does not by itself
# guarantee convergence when the features are left unscaled.
model = LogisticRegression(random_state=42, max_iter=1000)

# 3. Fit the model on the training split
model.fit(X_train, y_train)

print("Model training complete! ✅")

Model training complete! ✅


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
from sklearn.metrics import accuracy_score

# 1. Predict churn for the held-out test features
y_pred = model.predict(X_test)

# 2. Share of test rows that were predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2%}")

# 3. Cross-tabulate actual vs. predicted labels, with row/column totals
print("\nBreakdown of predictions (Confusion Matrix):")
print(
    pd.crosstab(
        index=y_test,
        columns=y_pred,
        rownames=['Actual'],
        colnames=['Predicted'],
        margins=True,
    )
)

Model Accuracy: 78.75%

Breakdown of predictions (Confusion Matrix):
Predicted     0    1   All
Actual                    
0           915  118  1033
1           181  193   374
All        1096  311  1407


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# 1. Standardize the features. The scaler is fitted on the training split only
#    and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Fit a second logistic regression, this time on the scaled features
model_scaled = LogisticRegression(random_state=42, max_iter=1000)
model_scaled.fit(X_train_scaled, y_train)

# 3. Predict on the scaled test features and score the result
y_pred_scaled = model_scaled.predict(X_test_scaled)
accuracy_scaled = accuracy_score(y_true=y_test, y_pred=y_pred_scaled)

print(f"New Model Accuracy (with scaled data): {accuracy_scaled:.2%}")

# 4. Cross-tabulate actual vs. predicted labels for the scaled model
# NOTE(review): the recorded output is identical to the unscaled model's
# confusion matrix - worth confirming this cell was run against the scaled data.
print("\nNew Breakdown of predictions (Confusion Matrix):")
print(
    pd.crosstab(
        index=y_test,
        columns=y_pred_scaled,
        rownames=['Actual'],
        colnames=['Predicted'],
        margins=True,
    )
)

New Model Accuracy (with scaled data): 78.75%

New Breakdown of predictions (Confusion Matrix):
Predicted     0    1   All
Actual                    
0           915  118  1033
1           181  193   374
All        1096  311  1407


In [16]:
from sklearn.ensemble import RandomForestClassifier

# 1. Build a Random Forest of 100 trees with a fixed seed
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# 2. Fit on the original, unscaled training split (scaling is not applied for this model)
rf_model.fit(X_train, y_train)

# 3. Predict on the test split and compute accuracy
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)

print(f"Random Forest Model Accuracy: {accuracy_rf:.2%}")

# 4. Cross-tabulate actual vs. predicted labels, with row/column totals
print("\nRandom Forest Breakdown (Confusion Matrix):")
print(
    pd.crosstab(
        index=y_test,
        columns=y_pred_rf,
        rownames=['Actual'],
        colnames=['Predicted'],
        margins=True,
    )
)

Random Forest Model Accuracy: 78.54%

Random Forest Breakdown (Confusion Matrix):
Predicted     0    1   All
Actual                    
0           927  106  1033
1           196  178   374
All        1123  284  1407


In [17]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the first logistic-regression
# predictions (y_pred, from the model trained on unscaled features)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['Did Not Churn (0)', 'Churned (1)'],
    )
)

# To report on the Random Forest model instead, pass y_pred_rf in place of y_pred.

                   precision    recall  f1-score   support

Did Not Churn (0)       0.83      0.89      0.86      1033
      Churned (1)       0.62      0.52      0.56       374

         accuracy                           0.79      1407
        macro avg       0.73      0.70      0.71      1407
     weighted avg       0.78      0.79      0.78      1407

