In [4]:
import pandas as pd

data = pd.read_csv('archive.zip', compression='zip')

print("First 5 rows:")
print(data.head())

print("\nShape:", data.shape)

print("\nColumns:")
print(data.columns)

First 5 rows:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies       

In [5]:
print("\nTarget column distribution (Churn):")
print(data['Churn'].value_counts())

print(data.dtypes)

print(data.isnull().sum())


Target column distribution (Churn):
Churn
No     5174
Yes    1869
Name: count, dtype: int64
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV        

In [7]:
data.drop('customerID', axis=1, inplace=True)

In [6]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
print(data.dtypes)

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object


In [8]:
data['Churn']=data['Churn'].map({"Yes": 1, "No":0})

In [10]:
data.isnull().sum()

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0
OnlineBackup,0


In [13]:
data["TotalCharges"].fillna(data["TotalCharges"].median(), inplace=True)

In [23]:
categorical_cols=data.select_dtypes(include=['object']).columns

data=pd.get_dummies(data, columns=categorical_cols, drop_first=True)
print(data.head())

   SeniorCitizen  tenure  MonthlyCharges  TotalCharges  Churn  gender_Male  \
0              0       1           29.85         29.85      0        False   
1              0      34           56.95       1889.50      0         True   
2              0       2           53.85        108.15      1         True   
3              0      45           42.30       1840.75      0         True   
4              0       2           70.70        151.65      1        False   

   Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0         True           False             False   
1        False           False              True   
2        False           False              True   
3        False           False             False   
4        False           False              True   

   MultipleLines_No phone service  ...  StreamingTV_No internet service  \
0                            True  ...                            False   
1                           False  ...                            Fa

In [25]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data[["tenure", "MonthlyCharges", "TotalCharges"]] = scaler.fit_transform(data[["tenure", "MonthlyCharges", "TotalCharges"]])
print(data.head())

   SeniorCitizen    tenure  MonthlyCharges  TotalCharges  Churn  gender_Male  \
0              0 -1.277445       -1.160323     -0.994242      0        False   
1              0  0.066327       -0.259629     -0.173244      0         True   
2              0 -1.236724       -0.362660     -0.959674      1         True   
3              0  0.514251       -0.746535     -0.194766      0         True   
4              0 -1.236724        0.197365     -0.940470      1        False   

   Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0         True           False             False   
1        False           False              True   
2        False           False              True   
3        False           False             False   
4        False           False              True   

   MultipleLines_No phone service  ...  StreamingTV_No internet service  \
0                            True  ...                            False   
1                           False  ...                  

In [26]:
x=data.drop('Churn', axis=1)
y=data['Churn']

print(x.shape)
print(y.shape)

(7043, 30)
(7043,)


In [29]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42,stratify=y)

y_train.value_counts(normalize=True)
y_test.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Churn,Unnamed: 1_level_1
0,0.734564
1,0.265436


In [34]:
from sklearn.linear_model import LogisticRegression
model= LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)
y_pred=model.predict(x_test)
y_prob=model.predict_proba(x_test)
y_pred[:5]
y_prob[:5]

array([[0.9550013 , 0.0449987 ],
       [0.31628668, 0.68371332],
       [0.94019879, 0.05980121],
       [0.59992082, 0.40007918],
       [0.9785383 , 0.0214617 ]])

In [35]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8048261178140526


In [36]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)


[[925 110]
 [165 209]]


In [37]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.80      0.80      1409

