In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
import os
import pickle
import joblib

In [2]:
# Location of the Telco churn CSV.
# Set the TELCO_CHURN_CSV environment variable to point at the file on another
# machine; when it is unset, the original hard-coded location is used as before.
DATA_PATH = os.environ.get(
    "TELCO_CHURN_CSV",
    "C:\\Users\\nurs\\OneDrive\\Рабочий стол\\WA_Fn-UseC_-Telco-Customer-Churn.csv",
)

# Load the raw data and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Convert TotalCharges to float and drop rows where it is blank or unparseable.
#
# - `astype(str)` keeps the cell re-runnable: on a second run the column is
#   already float, and the `.str` accessor would otherwise raise.
# - `errors='coerce'` turns blanks and any non-numeric text into NaN, and those
#   rows are then removed (previously only blank strings were removed, so other
#   unparseable values would have stayed in the frame as NaN).
# - `.loc[...]` + `.assign(...)` builds a new frame instead of writing into a
#   filtered slice, which avoids pandas' SettingWithCopyWarning.
total_charges = pd.to_numeric(
    df['TotalCharges'].astype(str).str.strip(),
    errors='coerce'
)
df = df.loc[total_charges.notna()].assign(TotalCharges=total_charges)
df['TotalCharges'].info()

<class 'pandas.core.series.Series'>
Index: 7032 entries, 0 to 7042
Series name: TotalCharges
Non-Null Count  Dtype  
--------------  -----  
7032 non-null   float64
dtypes: float64(1)
memory usage: 109.9 KB


In [4]:
# Drop the customerID column (it is not part of the feature list used below),
# then show the resulting frame dimensions.
df = df.drop(columns=['customerID'])
df.shape

(7032, 20)

In [5]:
# Raw input columns considered as model features (order matters: later cells
# derive column lists from this selection).
features = [
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'tenure',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'MonthlyCharges',
    'TotalCharges',
]

In [9]:
# Summarise dtypes and non-null counts for the selected feature columns.
df.loc[:, features].info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCitizen     7032 non-null   int64  
 1   Partner           7032 non-null   object 
 2   Dependents        7032 non-null   object 
 3   tenure            7032 non-null   int64  
 4   PhoneService      7032 non-null   object 
 5   MultipleLines     7032 non-null   object 
 6   InternetService   7032 non-null   object 
 7   OnlineSecurity    7032 non-null   object 
 8   OnlineBackup      7032 non-null   object 
 9   DeviceProtection  7032 non-null   object 
 10  TechSupport       7032 non-null   object 
 11  StreamingTV       7032 non-null   object 
 12  StreamingMovies   7032 non-null   object 
 13  Contract          7032 non-null   object 
 14  PaperlessBilling  7032 non-null   object 
 15  PaymentMethod     7032 non-null   object 
 16  MonthlyCharges    7032 non-null   float64
 17  TotalCharges      7032 non-null   float64

In [10]:
# Split the selected features by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical. Both results are plain Python lists.
numeric_cols = list(
    df[features].select_dtypes(include=['int64', 'float64']).columns
)
categorical_cols = list(
    df[features].select_dtypes(include='object').columns
)
(numeric_cols, categorical_cols)

(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'],
 ['Partner',
  'Dependents',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod'])

In [6]:
# Bucket tenure into four labelled ranges: (-1, 12], (12, 24], (24, 48], (48, inf).
#
# The last edge is open-ended (np.inf) rather than the observed maximum tenure:
# using df['tenure'].max() makes the edges non-increasing whenever the maximum
# is <= 48 (pd.cut then raises), and it ties the '49+' bucket to whatever data
# happens to be loaded.
df['tenure_group'] = pd.cut(
    df['tenure'],
    bins=[-1, 12, 24, 48, np.inf],
    labels=['0-12', '13-24', '25-48', '49+']
)
df['tenure_group'].value_counts()


tenure_group
49+      2239
0-12     2175
25-48    1594
13-24    1024
Name: count, dtype: int64

In [None]:
# Average spend per month: total charges divided by (tenure + 1).
# NOTE(review): the +1 presumably guards against division by zero for
# zero-tenure rows; it also lowers the average for every customer - confirm
# that this is intended.
df['avg_monthly_spend'] = df['TotalCharges'].div(df['tenure'].add(1))
df['avg_monthly_spend'].describe()

count    7032.000000
mean       59.083067
std        30.514438
min         9.183333
25%        26.225944
50%        61.070387
75%        84.877538
max       118.969863
Name: avg_monthly_spend, dtype: float64

In [8]:
# Add-on service columns that are counted per customer.
service_cols = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]

# Number of add-on services whose value is exactly 'Yes' (any other value
# counts as not subscribed).
df['num_services'] = df[service_cols].eq('Yes').sum(axis=1)
df['num_services'].value_counts().sort_index()


num_services
0    2213
1     966
2    1033
3    1117
4     850
5     569
6     284
Name: count, dtype: int64

In [9]:
# Binary flag: 1 when the contract type is 'Month-to-month', otherwise 0.
df['is_month_to_month'] = df['Contract'].eq('Month-to-month').astype(int)
df['is_month_to_month'].value_counts()


is_month_to_month
1    3875
0    3157
Name: count, dtype: int64

In [10]:
# Binary flag: 1 when InternetService is anything other than 'No', otherwise 0.
df['has_internet'] = df['InternetService'].ne('No').astype(int)
df['has_internet'].value_counts()


has_internet
1    5512
0    1520
Name: count, dtype: int64

In [12]:
# Encode the target as 0/1 ('Yes' -> 1, anything else -> 0).
#
# The dtype guard makes the cell safe to re-run: once Churn is already numeric,
# comparing it with 'Yes' again would silently set every label to 0.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = (df['Churn'] == 'Yes').astype(int)
df['Churn'].value_counts()


Churn
0    5163
1    1869
Name: count, dtype: int64

In [None]:
# Churn rate per tenure bucket, highest first.
# tenure_group is categorical (pd.cut output), so `observed` is passed
# explicitly: this keeps the current behaviour (all categories are reported)
# and silences the pandas FutureWarning about the changing default.
df.groupby('tenure_group', observed=False)['Churn'].mean().sort_values(ascending=False)

  df.groupby('tenure_group')['Churn'].mean().sort_values(ascending=False)


tenure_group
0-12     0.476782
13-24    0.287109
25-48    0.203890
49+      0.095132
Name: Churn, dtype: float64

In [14]:
# Churn rate for each count of subscribed add-on services.
df['Churn'].groupby(df['num_services']).mean()


num_services
0    0.214641
1    0.457557
2    0.358180
3    0.273948
4    0.223529
5    0.124780
6    0.052817
Name: Churn, dtype: float64

In [15]:
# Churn rate for month-to-month contracts (1) versus all other contracts (0).
df['Churn'].groupby(df['is_month_to_month']).mean()

is_month_to_month
0    0.067786
1    0.427097
Name: Churn, dtype: float64

In [None]:
# Final numeric feature set: the raw numeric columns followed by the
# engineered ones. This replaces the dtype-derived list built earlier.
numeric_cols = [
    # raw columns
    'SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
    # engineered columns
    'avg_monthly_spend', 'num_services', 'is_month_to_month', 'has_internet',
]

In [17]:
# Final categorical feature set: the raw categorical columns plus the
# engineered tenure bucket. This replaces the dtype-derived list built earlier.
categorical_cols = [
    'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
    'tenure_group',
]


In [18]:
# Sanity check: how many numeric and categorical features were selected.
tuple(len(cols) for cols in (numeric_cols, categorical_cols))


(8, 15)

In [19]:
# Feature matrix (numeric columns first, then categorical) and the 0/1 target.
X = df[[*numeric_cols, *categorical_cols]]
y = df.loc[:, 'Churn']

In [21]:
# 80/20 split with a fixed seed, stratified on the target so both parts keep
# the same churn rate; the displayed means confirm that.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
(y_train.mean(), y_test.mean())

(np.float64(0.2657777777777778), np.float64(0.2658137882018479))

In [23]:
# Numeric features are standardised; categorical features are one-hot encoded
# into a dense array, with categories unseen during fit ignored at transform time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Apply each transformer to its own column group.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [24]:
# Fit the preprocessor on the training part only, then apply it to both parts.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
tuple(matrix.shape for matrix in (X_train_processed, X_test_processed))

((5625, 51), (1407, 51))

In [27]:
# Logistic-regression pipeline: preprocessing -> SMOTE oversampling -> classifier.
# imblearn's Pipeline is used because it accepts a sampler step.
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)),
])
lr_pipeline

0,1,2
,steps,"[('preprocessor', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [28]:
# Column-ordered views of the train/test features for the logistic-regression track.
X_lr_train = X_train[[*numeric_cols, *categorical_cols]]
X_lr_test = X_test[[*numeric_cols, *categorical_cols]]
tuple(part.shape for part in (X_lr_train, X_lr_test))

((5625, 23), (1407, 23))

In [None]:
# Fit the preprocessor on the LR training features and transform both parts.
X_lr_train_processed = preprocessor.fit_transform(X_lr_train)
X_lr_test_processed = preprocessor.transform(X_lr_test)
tuple(matrix.shape for matrix in (X_lr_train_processed, X_lr_test_processed))

((5625, 51), (1407, 51))

In [31]:
# The preceding cell already fits `preprocessor` on X_lr_train and produces both
# processed matrices; fitting it a second time here only repeated identical work.
# Instead, verify that the processed matrices line up with their inputs and
# show their shapes.
assert X_lr_train_processed.shape[0] == len(X_lr_train), "train row count mismatch"
assert X_lr_test_processed.shape[0] == len(X_lr_test), "test row count mismatch"
assert X_lr_train_processed.shape[1] == X_lr_test_processed.shape[1], "feature count mismatch"
X_lr_train_processed.shape, X_lr_test_processed.shape

((5625, 51), (1407, 51))

In [33]:
# Persist the preprocessor and the untransformed train/test split for the
# logistic-regression track.
joblib.dump(preprocessor, filename="lr_preprocessor.joblib")
joblib.dump(
    (X_lr_train, X_lr_test, y_train, y_test),
    filename="lr_data_split.joblib",
)

['lr_data_split.joblib']

In [34]:
# Inputs for the tree-model track: the same feature columns and 0/1 target.
tree_features = [*numeric_cols, *categorical_cols]
X_dt_rf = df.loc[:, tree_features]
y_dt_rf = df.loc[:, "Churn"]

In [35]:
# Shapes of the tree-model feature matrix and target.
tuple(obj.shape for obj in (X_dt_rf, y_dt_rf))

((7032, 23), (7032,))

In [None]:
# Class proportions of the target, most frequent class first.
y_dt_rf.value_counts(normalize=True, sort=True, ascending=False)

Churn
0    0.734215
1    0.265785
Name: proportion, dtype: float64

In [37]:
# 80/20 stratified split for the tree models, using the same seed as the
# logistic-regression split.
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(
    X_dt_rf, y_dt_rf, stratify=y_dt_rf, test_size=0.2, random_state=42
)
tuple(part.shape for part in (X_train_dt, X_test_dt))

((5625, 23), (1407, 23))

In [38]:
# Churn rate in the train and test parts (should match thanks to stratification).
tuple(target.mean() for target in (y_train_dt, y_test_dt))

(np.float64(0.2657777777777778), np.float64(0.2658137882018479))

In [39]:
# Tree-model preprocessing: one-hot encode the categorical columns (dense
# output, unknown categories ignored) and pass every other column through
# unchanged - no scaling step here.
tree_preprocessor = ColumnTransformer(
    [("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_cols)],
    remainder="passthrough",
)

In [40]:
# Fit the tree preprocessor on the training part only, then transform both parts.
X_train_tree = tree_preprocessor.fit_transform(X_train_dt)
X_test_tree = tree_preprocessor.transform(X_test_dt)
tuple(matrix.shape for matrix in (X_train_tree, X_test_tree))

((5625, 51), (1407, 51))

In [41]:
# Persist the tree preprocessor and the untransformed train/test split.
joblib.dump(tree_preprocessor, filename="tree_preprocessor.joblib")
joblib.dump(
    (X_train_dt, X_test_dt, y_train_dt, y_test_dt),
    filename="tree_data_split.joblib",
)

['tree_data_split.joblib']

In [42]:
# Inputs for the CatBoost track: all feature columns (numeric first, then
# categorical) and the 0/1 target.
cat_features_all = [*numeric_cols, *categorical_cols]
X_cb = df.loc[:, cat_features_all]
y_cb = df.loc[:, "Churn"]


In [None]:
# Build the CatBoost feature frame with every categorical column cast to str.
#
# IMPORTANT: `astype` returns a new frame, so `df` itself is left untouched.
# Casting through `astype` replaces each listed column with a string column.
# The earlier `X_cb.loc[:, categorical_cols] = ...` form writes into the
# existing columns instead, which can leave the pandas `category` dtype of
# `tenure_group` (produced by pd.cut) in place. The same logic was also
# written twice in this cell; it is now done once.
X_cb = df[cat_features_all].astype({col: str for col in categorical_cols})
y_cb = df["Churn"]


In [47]:
# Positional indices of the categorical columns within X_cb.
cat_feature_indices = list(map(X_cb.columns.get_loc, categorical_cols))
cat_feature_indices

[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

In [48]:
# 80/20 stratified split for CatBoost, using the same seed as the other tracks.
X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(
    X_cb, y_cb, stratify=y_cb, test_size=0.2, random_state=42
)

tuple(part.shape for part in (X_train_cb, X_test_cb))

((5625, 23), (1407, 23))

In [49]:
# Churn rate in the CatBoost train and test parts.
tuple(target.mean() for target in (y_train_cb, y_test_cb))

(np.float64(0.2657777777777778), np.float64(0.2658137882018479))

In [50]:
# Persist the CatBoost split together with the categorical column indices,
# bundled in a single dict.
joblib.dump(
    dict(
        X_train=X_train_cb,
        X_test=X_test_cb,
        y_train=y_train_cb,
        y_test=y_test_cb,
        cat_features=cat_feature_indices,
    ),
    "catboost_data.joblib",
)

['catboost_data.joblib']