In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import os
import pickle

In [4]:
# Location of the raw Telco churn CSV. The path can be overridden through the
# TELCO_CHURN_CSV environment variable so the notebook is not tied to a single
# machine; when the variable is unset the original local path is used.
RAW_DATA_PATH = os.environ.get(
    "TELCO_CHURN_CSV",
    "C:\\Users\\nurs\\OneDrive\\Рабочий стол\\WA_Fn-UseC_-Telco-Customer-Churn.csv",
)

# Load the raw data and preview the first rows.
df = pd.read_csv(RAW_DATA_PATH)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
# TotalCharges is read as text because some rows hold blank strings.
# Casting to str first keeps this cell re-runnable: on a second run the column
# is already float64 and a bare `.str` accessor would raise.
total_charges_text = df['TotalCharges'].astype(str).str.strip()
has_total_charges = total_charges_text != ""

# Drop the blank rows and take an explicit copy, so the assignment below writes
# to an independent frame rather than to a slice of the original
# (avoids pandas' SettingWithCopyWarning).
df = df.loc[has_total_charges].copy()

# Convert the remaining values to numbers; anything unparsable becomes NaN.
df['TotalCharges'] = pd.to_numeric(total_charges_text[has_total_charges], errors='coerce')
df['TotalCharges'].info()

<class 'pandas.core.series.Series'>
Index: 7032 entries, 0 to 7042
Series name: TotalCharges
Non-Null Count  Dtype  
--------------  -----  
7032 non-null   float64
dtypes: float64(1)
memory usage: 109.9 KB


In [7]:
# Remove the customerID column, then show the resulting (rows, columns).
df = df.drop(columns=['customerID'])
df.shape

(7032, 20)

In [8]:
# Model input columns, grouped by theme; the concatenation order below is the
# column order used for X.
profile_features = ['SeniorCitizen', 'Partner', 'Dependents', 'tenure']
service_features = [
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
billing_features = [
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'MonthlyCharges',
    'TotalCharges',
]
features = profile_features + service_features + billing_features

In [9]:
# Summarise dtypes and non-null counts for the selected feature columns.
selected_features = df[features]
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCitizen     7032 non-null   int64  
 1   Partner           7032 non-null   object 
 2   Dependents        7032 non-null   object 
 3   tenure            7032 non-null   int64  
 4   PhoneService      7032 non-null   object 
 5   MultipleLines     7032 non-null   object 
 6   InternetService   7032 non-null   object 
 7   OnlineSecurity    7032 non-null   object 
 8   OnlineBackup      7032 non-null   object 
 9   DeviceProtection  7032 non-null   object 
 10  TechSupport       7032 non-null   object 
 11  StreamingTV       7032 non-null   object 
 12  StreamingMovies   7032 non-null   object 
 13  Contract          7032 non-null   object 
 14  PaperlessBilling  7032 non-null   object 
 15  PaymentMethod     7032 non-null   object 
 16  MonthlyCharges    7032 non-null   float64
 17  

In [10]:
# Split the feature columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical.
feature_frame = df[features]
numeric_cols = list(feature_frame.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(feature_frame.select_dtypes(include='object').columns)
numeric_cols, categorical_cols

(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'],
 ['Partner',
  'Dependents',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod'])

In [12]:
# Feature matrix and target column.
X, y = df[features], df['Churn']

In [13]:
# 80/20 split with a fixed seed, stratified on the target so both parts keep
# the same class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [18]:
# One-hot encode the categorical columns. The encoder is fitted on the
# training split only and then applied to both splits; categories unseen
# during fitting are ignored at transform time.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
train_categoricals = X_train[categorical_cols]
test_categoricals = X_test[categorical_cols]
X_train_ohe = ohe.fit_transform(train_categoricals)
X_test_ohe = ohe.transform(test_categoricals)

In [19]:
def _stack_numeric_and_encoded(frame, encoded):
    """Place the numeric columns of `frame` side by side with `encoded`."""
    return np.hstack([frame[numeric_cols].values, encoded])


# Numeric columns come first, followed by the one-hot encoded columns.
X_train_combined = _stack_numeric_and_encoded(X_train, X_train_ohe)
X_test_combined = _stack_numeric_and_encoded(X_test, X_test_ohe)

In [None]:
# Standardise every column of the combined matrix. The scaler is fitted on the
# training matrix only; the test matrix reuses the fitted statistics.
scaler = StandardScaler()
scaler.fit(X_train_combined)
X_train_scaled, X_test_scaled = (
    scaler.transform(matrix) for matrix in (X_train_combined, X_test_combined)
)

In [27]:
# Oversample the training split twice with the same seeded SMOTE instance:
# once on the scaled matrix (logistic regression) and once on the unscaled
# matrix (decision tree / random forest).
# NOTE(review): plain SMOTE interpolates between neighbours, so synthetic rows
# may carry fractional values in the one-hot columns — consider SMOTENC; confirm.
smote = SMOTE(random_state=42)
resampled_scaled = smote.fit_resample(X_train_scaled, y_train)
resampled_unscaled = smote.fit_resample(X_train_combined, y_train)
X_train_lr, y_train_lr = resampled_scaled
X_train_dt_rf, y_train_dt_rf = resampled_unscaled

# Class counts after resampling the scaled data.
pd.Series(y_train_lr).value_counts()

Churn
No     4130
Yes    4130
Name: count, dtype: int64

In [28]:
# Class counts after resampling the unscaled data.
dt_rf_labels = pd.Series(y_train_dt_rf)
dt_rf_labels.value_counts()

Churn
No     4130
Yes    4130
Name: count, dtype: int64

In [29]:
# CatBoost inputs: independent copies of the un-encoded splits, plus the
# positional indices of the categorical columns within those frames.
X_train_cat, X_test_cat = X_train.copy(), X_test.copy()
cat_features = list(map(X_train_cat.columns.get_loc, categorical_cols))

In [31]:
# Make sure the output directories exist; directories already present are left as-is.
for output_dir in ("data/processed", "models"):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Logistic regression inputs: resampled scaled training data, scaled test
# data, and the matching labels. Files are written in the order listed.
lr_exports = {
    "data/processed/X_train_lr.csv": X_train_lr,
    "data/processed/X_test_lr.csv": X_test_scaled,
    "data/processed/y_train_lr.csv": y_train_lr,
    "data/processed/y_test.csv": y_test,
}
for csv_path, payload in lr_exports.items():
    pd.DataFrame(payload).to_csv(csv_path, index=False)


In [None]:
# Decision tree / random forest inputs: resampled unscaled training data,
# unscaled test data, and the resampled training labels.
dt_rf_exports = [
    (X_train_dt_rf, "data/processed/X_train_dt_rf.csv"),
    (X_test_combined, "data/processed/X_test_dt_rf.csv"),
    (y_train_dt_rf, "data/processed/y_train_dt_rf.csv"),
]
for payload, csv_path in dt_rf_exports:
    pd.DataFrame(payload).to_csv(csv_path, index=False)

In [35]:
# CatBoost inputs: the un-encoded feature frames and the original
# (not resampled) labels.
catboost_exports = {
    "data/processed/X_train_cat.csv": X_train_cat,
    "data/processed/X_test_cat.csv": X_test_cat,
    "data/processed/y_train.csv": pd.DataFrame(y_train),
    "data/processed/y_test.csv": pd.DataFrame(y_test),
}
for csv_path, frame in catboost_exports.items():
    frame.to_csv(csv_path, index=False)

In [37]:
# Persist the positional indices of the categorical columns.
with open("models/cat_features.pkl", "wb") as cat_features_file:
    pickle.dump(cat_features, cat_features_file)

In [38]:
# Pickle the three training feature sets, one file each.
# NOTE(review): the fitted `ohe` and `scaler` are not saved in the cells shown
# here; they would be needed to transform new data — confirm they are persisted elsewhere.
training_sets = {
    "models/X_train_lr.pkl": X_train_lr,
    "models/X_train_dt_rf.pkl": X_train_dt_rf,
    "models/X_train_cat.pkl": X_train_cat,
}
for pkl_path, training_set in training_sets.items():
    with open(pkl_path, "wb") as pkl_file:
        pickle.dump(training_set, pkl_file)