In [208]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


In [2]:
# Load the raw customer table.
data = pd.read_csv("custommers.csv")

# info() prints its summary as a side effect; head() is placed last so that
# the preview is actually rendered as the cell's output (a bare expression
# in the middle of a cell is evaluated and silently discarded).
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [3]:
# Bucket MonthlyCharges into six right-closed bands: (0, 20], (20, 40], ...,
# (100, inf).  The band label is used as the stratification key for the
# train/test split and is dropped again before modelling.
# np.inf is used instead of the np.infty alias, which NumPy 2.0 removed.
data["charges_type"] = pd.cut(
    data["MonthlyCharges"],
    bins=[0, 20, 40, 60, 80, 100, np.inf],
    labels=[1, 2, 3, 4, 5, 6],
)

In [15]:
# plt.hist(data["charges_type"], bins=[1,2,3,4,5,6])

In [14]:
# split = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.2)

In [6]:
# Stratify on the binned monthly charge so that the train and test sets keep
# the same proportion of each charge band.  The splitter is built in this
# cell because its only other definition in the notebook is commented out,
# which leaves `split` undefined on a fresh kernel.
split = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.2)

# n_splits=1, so there is exactly one (train, test) pair to take.
train_ix, test_ix = next(split.split(data, data["charges_type"]))

# split() yields positional indices, so select rows by position (.iloc)
# rather than by label (.loc); the two only coincide for a default RangeIndex.
trainmixdata = data.iloc[train_ix]
testmixdata = data.iloc[test_ix]
    

In [7]:
# Display the training portion of the stratified split.
trainmixdata

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,charges_type
2884,5248-RPYWW,Female,1,Yes,Yes,72,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),90.15,6716.45,No,5
4404,1307-ATKGB,Male,0,No,No,24,Yes,No,Fiber optic,No,...,No,No,Yes,Month-to-month,Yes,Electronic check,89.55,2187.15,No,5
6642,4747-LCAQL,Male,0,No,No,25,Yes,No,DSL,Yes,...,Yes,Yes,Yes,Month-to-month,Yes,Mailed check,79.00,1902,No,4
3662,6410-LEFEN,Female,0,No,No,9,Yes,No,DSL,No,...,No,No,No,Month-to-month,No,Electronic check,45.15,416.45,Yes,3
3063,1666-JXLKU,Female,0,No,No,37,Yes,Yes,Fiber optic,No,...,No,Yes,Yes,Month-to-month,Yes,Electronic check,100.05,3810.55,No,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3701,3812-LRZIR,Female,0,Yes,Yes,27,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,Two year,No,Electronic check,24.50,761.95,No,2
3237,6734-JDTTV,Male,0,Yes,Yes,65,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,Two year,Yes,Credit card (automatic),19.85,1267.05,No,1
3942,6959-UWKHF,Male,0,No,No,1,Yes,No,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,42.90,42.9,Yes,3
2734,2371-JUNGC,Male,0,No,No,11,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,20.25,208,No,2


In [13]:
# testmixdata

In [33]:
# TotalCharges is read from the CSV as strings (object dtype), so it has to be
# converted before it can be treated as a numeric feature.
trainmixdata["TotalCharges"].dtype

dtype('O')

In [98]:
# Numeric columns only.  TotalCharges does not appear here because it is
# still stored with object dtype at this point.
trainmixdata.select_dtypes(include="number")

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
2884,1,72,90.15
4404,0,24,89.55
6642,0,25,79.00
3662,0,9,45.15
3063,0,37,100.05
...,...,...,...
3701,0,27,24.50
3237,0,65,19.85
3942,0,1,42.90
2734,0,11,20.25


In [167]:
def _coerce_numeric(frame):
    """Return a new frame with every column parsed as numbers.

    Values that cannot be parsed (for example blank strings) become NaN so
    that the imputer that follows can fill them.  The input is not modified.
    """
    return pd.DataFrame(frame).apply(pd.to_numeric, errors="coerce")


def preprocessing(data):
    """Build the unfitted ColumnTransformer used to prepare the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame.  Only its column names are read, to decide which
        columns are treated as categorical; the frame itself is left
        untouched.

    Returns
    -------
    ColumnTransformer
        * ``tenure``, ``MonthlyCharges`` and ``TotalCharges`` are coerced to
          numbers, mean-imputed and standardised.
        * Every other column is imputed with its most frequent value and
          one-hot encoded; categories unseen during fit are ignored.

    Notes
    -----
    The string-to-number coercion of ``TotalCharges`` is a step of the
    numeric pipeline rather than an in-place edit of ``data``.  That way the
    same conversion is applied to any frame later passed to ``transform``
    (for example the test set), and the caller's frame is never mutated.
    """
    # Local import: FunctionTransformer is not part of the notebook's import cell.
    from sklearn.preprocessing import FunctionTransformer

    num_att = ['tenure', 'MonthlyCharges', 'TotalCharges']
    # Include all other features as categorical
    cat_Att = [col for col in data.columns if col not in num_att]

    num = Pipeline([
        # "one-to-one" keeps get_feature_names_out() working on the result.
        ("to_numeric", FunctionTransformer(_coerce_numeric, feature_names_out="one-to-one")),
        ("imputer", SimpleImputer(strategy="mean")),
        ("scalling", StandardScaler())
    ])
    cat = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoding", OneHotEncoder(handle_unknown="ignore"))
    ])

    combine = ColumnTransformer([
        ("num", num, num_att),
        ("cat", cat, cat_Att)
    ])

    return combine
    

In [195]:
# Features: everything except the row identifier, the target and the
# stratification helper column.
x_train = trainmixdata.drop(["customerID", "Churn", "charges_type"], axis="columns")
x_test = testmixdata.drop(["customerID", "Churn", "charges_type"], axis="columns")

# Targets.
y_train = trainmixdata["Churn"]
y_test = testmixdata["Churn"]

# Build the preprocessing transformer and fit it on the training features only.
pip = preprocessing(x_train)
x_processtraing = pip.fit_transform(x_train)

# Encode the Churn labels as floats.  The encoder learns its categories from
# the training labels and reuses them for the test labels.
encoder = OrdinalEncoder()
y_train_encoded = encoder.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()
y_test_encoded = encoder.transform(y_test.to_numpy().reshape(-1, 1)).ravel()

# Peek at the first few encoded training labels.
y_train_encoded[:7]


array([0., 0., 0., 1., 0., 1., 0.])

In [197]:
# Baseline classifier: fit a logistic regression on the processed training
# features (fit() returns the estimator itself), then predict on the same rows.
lin_reg = LogisticRegression(max_iter=1000).fit(x_processtraing, y_train_encoded)
pred = lin_reg.predict(x_processtraing)

In [199]:
# First ten predictions made on the training rows.
pred[:10]

array([0., 1., 0., 0., 1., 1., 0., 0., 0., 0.])

In [201]:
# First ten encoded training labels.
y_train_encoded[:10]

array([0., 0., 0., 1., 0., 1., 0., 0., 0., 0.])

In [202]:
# Accuracy on the same training rows the model was fitted on (not a held-out score).
accuracy_score(y_train_encoded, pred)

0.8067092651757188

In [203]:
# classification_report returns a pre-formatted multi-line string; print it so
# the table is rendered with real line breaks instead of as an escaped repr.
print(classification_report(y_train_encoded, pred))

'              precision    recall  f1-score   support\n\n         0.0       0.85      0.90      0.87      4133\n         1.0       0.66      0.56      0.61      1501\n\n    accuracy                           0.81      5634\n   macro avg       0.76      0.73      0.74      5634\nweighted avg       0.80      0.81      0.80      5634\n'

In [215]:
# Candidate classifiers, keyed by the display name used when reporting.
# Estimators with internal randomness are seeded so that re-running the
# notebook reproduces the same scores.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
}

# Per-model summary: display name -> {"mean_acc": ..., "std_dev": ...}.
results = {}

def accuracy(x, y, model, cv=10, scoring="accuracy"):
    """Cross-validate ``model`` on ``(x, y)`` and return the per-fold scores.

    Parameters
    ----------
    x : array-like or sparse matrix
        Feature matrix passed straight to ``cross_val_score``.
    y : array-like
        Target values aligned with ``x``.
    model : estimator
        Unfitted scikit-learn estimator; ``cross_val_score`` clones and fits
        it once per fold.
    cv : int or cross-validation splitter, default 10
        Forwarded to ``cross_val_score``.
    scoring : str or callable, default "accuracy"
        Forwarded to ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One score per fold.
    """
    # n_jobs=-1 evaluates the folds in parallel on all available cores.
    return cross_val_score(model, x, y, scoring=scoring, cv=cv, n_jobs=-1)

# Score every candidate with 10-fold cross-validation on the processed
# training data and record the mean and spread of the fold accuracies.
# NOTE(review): the features were transformed by a preprocessor fitted on the
# whole training set before cross-validation, so the fold scores may be
# slightly optimistic.
for name, model in models.items():
    # Reuse the shared helper instead of repeating the cross_val_score call.
    acc = accuracy(x_processtraing, y_train_encoded, model)

    results[name] = {
        "mean_acc": acc.mean(),
        "std_dev": acc.std()
    }
    print(f"{name}: Mean Accuracy = {results[name]['mean_acc']:.4f} (+/- {results[name]['std_dev']:.4f})")


Logistic Regression: Mean Accuracy = 0.8023 (+/- 0.0134)
Decision Tree: Mean Accuracy = 0.7355 (+/- 0.0163)
Random Forest: Mean Accuracy = 0.7868 (+/- 0.0128)
Gradient Boosting: Mean Accuracy = 0.8030 (+/- 0.0135)
SVM: Mean Accuracy = 0.8003 (+/- 0.0120)
KNN: Mean Accuracy = 0.7602 (+/- 0.0137)
Naive Bayes: Mean Accuracy = 0.7022 (+/- 0.0168)
