In [42]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 

df = pd.read_csv("../data/processed/data.csv")

X = df.drop("Churn", axis=1)
y = df["Churn"]

X.head(), y.head()


(   tenure  MonthlyCharges  TotalCharges        Contract  \
 0       1           29.85         29.85  Month-to-month   
 1      34           56.95       1889.50        One year   
 2       2           53.85        108.15  Month-to-month   
 3      45           42.30       1840.75        One year   
 4       2           70.70        151.65  Month-to-month   
 
                PaymentMethod InternetService  SeniorCitizen Partner  \
 0           Electronic check             DSL              0     Yes   
 1               Mailed check             DSL              0      No   
 2               Mailed check             DSL              0      No   
 3  Bank transfer (automatic)             DSL              0      No   
 4           Electronic check     Fiber optic              0      No   
 
   Dependents PaperlessBilling  
 0         No              Yes  
 1         No               No  
 2         No              Yes  
 3         No               No  
 4         No              Yes  ,
 0   

In [43]:
numerical_features = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges"
]

categorical_features = ["Contract","PaymentMethod","InternetService"]


binary_features = ["Partner","Dependents","PaperlessBilling"]


numerical_features, categorical_features, binary_features


(['tenure', 'MonthlyCharges', 'TotalCharges'],
 ['Contract', 'PaymentMethod', 'InternetService'],
 ['Partner', 'Dependents', 'PaperlessBilling'])

In [44]:
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])
binary_transformer = Pipeline(steps=[("onehot", OneHotEncoder(drop="if_binary"))])
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, numerical_features),("cat", categorical_transformer, categorical_features),("bin", binary_transformer, binary_features),])


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=42,stratify=y)

In [46]:

model = LogisticRegression(max_iter=1000)
clf = Pipeline(steps=[("preprocessor", preprocessor),("model", model)])


In [47]:
clf.fit(X_train, y_train)



In [48]:
clf.score(X_test, y_test)


0.8017057569296375

In [49]:
feature_names = clf.named_steps["preprocessor"] \
    .named_transformers_["cat"] \
    .named_steps["onehot"] \
    .get_feature_names_out(categorical_features)

feature_names


array(['Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No'], dtype=object)