In [1]:
# Cell 1: Import libraries and load the data
import pandas as pd
import numpy as np

# --- Load the raw churn data ---------------------------------------------
print("Loading the dataset...")
df = pd.read_csv('Churn.csv')
print("Dataset loaded successfully.")

# --- Clean the 'TotalCharges' column -------------------------------------
print("Starting data cleaning...")
# errors='coerce' makes pd.to_numeric produce NaN for every entry it cannot
# parse as a number (this column is known to contain blank strings).
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Impute the resulting NaNs with the median of the successfully parsed values,
# then write the cleaned column back in a single assignment.
df['TotalCharges'] = total_charges.fillna(total_charges.median())
print("Data cleaning complete! The data is now ready.")

# --- Sanity check ---------------------------------------------------------
# Show the first 5 rows plus the dtype / non-null summary.
print(df.head())
df.info()

Loading the dataset...
Dataset loaded successfully.
Starting data cleaning...
Data cleaning complete! The data is now ready.
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4             

In [6]:
# Cell 1 (Corrected): Import libraries and prepare data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

print("Preparing data for modeling...")

# Build the modeling frame in one chain. `DataFrame.drop` returns a new
# frame, so the cleaned `df` itself is left untouched.
df_model = (
    df
    # customerID is an identifier, not a predictive feature
    .drop(columns='customerID')
    # Encode the target: 'Yes' -> 1, anything else -> 0
    .assign(Churn=lambda frame: frame['Churn'].eq('Yes').astype('int64'))
    # One-hot encode the remaining categorical columns, dropping the first
    # level of each to avoid a redundant indicator
    .pipe(pd.get_dummies, drop_first=True)
)

print("Data is now ready for modeling.")
df_model.head()

Preparing data for modeling...
Data is now ready for modeling.


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,1,False,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


In [7]:
# Cell 2 (Corrected)
# Separate the target column from the feature matrix.
target_column = 'Churn'
y = df_model[target_column]
X = df_model.drop(columns=target_column)

In [8]:
# Cell 3: Split the data
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
# NOTE(review): the split is not stratified on y -- consider `stratify=y`
# if the class balance of the two partitions matters.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training data size: {X_train.shape[0]} samples")
print(f"Testing data size: {X_test.shape[0]} samples")

Training data size: 5634 samples
Testing data size: 1409 samples


In [10]:
# Cell 4: Create and train the model
# Upper bound on solver iterations passed to LogisticRegression.
max_solver_iterations = 5000

print("Training the model...")
# `fit` returns the estimator itself, so construction and training can be
# chained while still binding the fitted model to `model`.
model = LogisticRegression(max_iter=max_solver_iterations).fit(X_train, y_train)
print("Model training complete.")

Training the model...
Model training complete.


In [11]:
# Cell 5: Make predictions and evaluate the model
# Predict churn labels for the held-out test rows.
y_pred = model.predict(X_test)

# Overall fraction of test rows classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1; display names are given in label order
# (0 -> 'No Churn', 1 -> 'Churn').
class_labels = ['No Churn', 'Churn']
report_text = classification_report(y_test, y_pred, target_names=class_labels)
print("\nClassification Report:")
print(report_text)

Model Accuracy: 0.8219

Classification Report:
              precision    recall  f1-score   support

    No Churn       0.86      0.90      0.88      1036
       Churn       0.69      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.82      0.82      0.82      1409

