In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
pd.set_option('future.no_silent_downcasting', True)

In [4]:
# Load the raw churn dataset.
# The location can be overridden with the CHURN_DATA_PATH environment variable
# so the notebook also runs on machines that do not share the original
# directory layout. When the variable is unset or empty, the original absolute
# path is used unchanged.
DATA_PATH = os.environ.get('CHURN_DATA_PATH') or (
    r'C:\Projects\Project 2\data_science_team_project\DataScience_team_project\datasets\internet_service_churn.csv'
)
df = pd.read_csv(DATA_PATH)

# Keep `df` as the untouched raw frame; the working copy is `df_churn`.
df_churn = df.copy()
df_churn.head()

Unnamed: 0,id,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,reamining_contract,service_failure_count,download_avg,upload_avg,download_over_limit,churn
0,15,1,0,11.95,25,0.14,0,8.4,2.3,0,0
1,18,0,0,8.22,0,,0,0.0,0.0,0,1
2,23,1,0,8.91,16,0.0,0,13.7,0.9,0,1
3,27,0,0,6.87,21,,1,0.0,0.0,0,1
4,34,0,0,6.39,0,,0,0.0,0.0,0,1


In [5]:
# Count the missing values in every column.
df_churn.isnull().sum()

id                                 0
is_tv_subscriber                   0
is_movie_package_subscriber        0
subscription_age                   0
bill_avg                           0
reamining_contract             21572
service_failure_count              0
download_avg                     381
upload_avg                       381
download_over_limit                0
churn                              0
dtype: int64

In [6]:
# Missing-value handling: every NaN in the columns listed below is replaced
# with 0. ('reamining_contract' is the column name as spelled in the dataset.)
for column in ('reamining_contract', 'download_avg', 'upload_avg'):
    df_churn[column] = df_churn[column].fillna(0)

In [7]:
# Re-check the per-column missing-value counts.
df_churn.isnull().sum()

id                             0
is_tv_subscriber               0
is_movie_package_subscriber    0
subscription_age               0
bill_avg                       0
reamining_contract             0
service_failure_count          0
download_avg                   0
upload_avg                     0
download_over_limit            0
churn                          0
dtype: int64

In [8]:
# Replace negative subscription_age values with the median of the
# non-negative ages.
negative_age = df_churn['subscription_age'] < 0
if negative_age.any():
    median_age = df_churn.loc[df_churn['subscription_age'] >= 0, 'subscription_age'].median()
    df_churn.loc[negative_age, 'subscription_age'] = median_age

# Cap download_avg outliers using the IQR rule: values beyond
# 1.5 * IQR from the quartiles are pulled back to the nearest fence.
Q1, Q3 = df_churn['download_avg'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
df_churn['download_avg'] = df_churn['download_avg'].clip(lower=lower, upper=upper)

# Remove extreme bill_avg values: keep only rows at or below the 99th percentile.
bill_upper = df_churn['bill_avg'].quantile(0.99)
df_churn = df_churn[df_churn['bill_avg'].le(bill_upper)]

In [9]:
# One-hot encoding: swap the download_over_limit column for one indicator
# column per distinct value, appended at the end of the frame.
encoded_column = 'download_over_limit'
download_dummies = pd.get_dummies(df_churn[encoded_column], prefix=encoded_column)
df_churn = pd.concat(
    [df_churn.drop(columns=[encoded_column]), download_dummies],
    axis='columns',
)

In [10]:
# Normalize the numeric features with StandardScaler.
# The scaler is fitted on the same rows it then transforms.
numeric_cols = [
    'subscription_age',
    'bill_avg',
    'reamining_contract',
    'service_failure_count',
    'download_avg',
    'upload_avg',
]
scaler = StandardScaler().fit(df_churn[numeric_cols])
df_churn[numeric_cols] = scaler.transform(df_churn[numeric_cols])

In [11]:
# Preview the first five rows of the working frame.
df_churn.head(n=5)

Unnamed: 0,id,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,reamining_contract,service_failure_count,download_avg,upload_avg,churn,download_over_limit_0,download_over_limit_1,download_over_limit_2,download_over_limit_3,download_over_limit_4,download_over_limit_5,download_over_limit_6,download_over_limit_7
0,15,1,0,4.669609,0.757921,-0.544562,-0.334731,-0.775141,-0.189118,0,True,False,False,False,False,False,False,False
1,18,0,0,2.835973,-2.00743,-0.753378,-0.334731,-0.986643,-0.455279,1,True,False,False,False,False,False,False,False
2,23,1,0,3.175171,-0.237605,-0.753378,-0.334731,-0.641693,-0.351129,1,True,False,False,False,False,False,False,False
3,27,0,0,2.172325,0.315465,-0.753378,0.91074,-0.986643,-0.455279,1,True,False,False,False,False,False,False,False
4,34,0,0,1.936361,-2.00743,-0.753378,-0.334731,-0.986643,-0.455279,1,True,False,False,False,False,False,False,False


In [12]:
# Exclude id and bill_avg (weak correlation).
# Reassign instead of mutating in place, so the result is an explicit new frame.
df_churn = df_churn.drop(columns=['id', 'bill_avg'])

In [13]:
# Preview the first five rows of the working frame.
df_churn.head(n=5)

Unnamed: 0,is_tv_subscriber,is_movie_package_subscriber,subscription_age,reamining_contract,service_failure_count,download_avg,upload_avg,churn,download_over_limit_0,download_over_limit_1,download_over_limit_2,download_over_limit_3,download_over_limit_4,download_over_limit_5,download_over_limit_6,download_over_limit_7
0,1,0,4.669609,-0.544562,-0.334731,-0.775141,-0.189118,0,True,False,False,False,False,False,False,False
1,0,0,2.835973,-0.753378,-0.334731,-0.986643,-0.455279,1,True,False,False,False,False,False,False,False
2,1,0,3.175171,-0.753378,-0.334731,-0.641693,-0.351129,1,True,False,False,False,False,False,False,False
3,0,0,2.172325,-0.753378,0.91074,-0.986643,-0.455279,1,True,False,False,False,False,False,False,False
4,0,0,1.936361,-0.753378,-0.334731,-0.986643,-0.455279,1,True,False,False,False,False,False,False,False
