In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# --- Preprocessing of the internet-service churn dataset ---------------------
# Steps: load CSV -> mean-impute missing numerics -> one-hot encode the two
# subscriber flags -> standardize three numeric features -> preview the result.
#
# NOTE(review): every transformer below is fit on the full DataFrame. If a
# train/test split happens later, fit on the training portion only to avoid
# leaking test statistics into the features.

# Absolute, machine-specific location of the dataset.
DATA_PATH = r'C:\Project\myProject\Prognozowanie-rezygnacji-klientów-dla-firmy-telekomunikacyjnej\internet_service_churn.csv'

# Column groups used by the individual steps. 'reamining_contract' is spelled
# this way in the dataset itself, so the name must not be "corrected" here.
contract_col = 'reamining_contract'
speed_cols = ['download_avg', 'upload_avg']
categorical_cols = ['is_tv_subscriber', 'is_movie_package_subscriber']
scaled_cols = ['bill_avg', 'subscription_age', 'service_failure_count']

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Handling missing values: replace NaNs with the per-column mean.
# A dedicated imputer is used for the contract column so its fitted mean is
# not overwritten by the second fit and stays available for new data.
# NOTE(review): a missing remaining-contract value may carry meaning of its
# own (e.g. "no contract") - confirm that mean imputation is appropriate.
contract_imputer = SimpleImputer(strategy='mean')
# fit_transform returns shape (n, 1); flatten it for a single-column assignment.
df[contract_col] = contract_imputer.fit_transform(df[[contract_col]]).ravel()

imputer = SimpleImputer(strategy='mean')
df[speed_cols] = imputer.fit_transform(df[speed_cols])

# Encoding categorical variables; drop='first' keeps one indicator column per
# feature, sparse_output=False returns a dense ndarray.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_columns = encoder.fit_transform(df[categorical_cols])

# Wrap the encoded array in a DataFrame that reuses df's index: pd.concat with
# axis=1 aligns on index labels, so a fresh RangeIndex would misalign rows
# (and introduce NaNs) whenever df does not have the default 0..n-1 index.
df_encoded = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)
df = pd.concat([df, df_encoded], axis=1)

# Drop original categorical columns after encoding
df = df.drop(columns=categorical_cols)

# Standardizing numerical features (zero mean, unit variance per column)
scaler = StandardScaler()
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

# Display the updated DataFrame
print(df.head())


   id  subscription_age  bill_avg  reamining_contract  service_failure_count  \
0  15          4.668335  0.458372            0.140000              -0.335818   
1  18          2.835389 -1.433376            0.716039              -0.335818   
2  23          3.174460 -0.222657            0.000000              -0.335818   
3  27          2.171991  0.155692            0.716039               0.888749   
4  34          1.936116 -1.433376            0.716039              -0.335818   

   download_avg  upload_avg  download_over_limit  churn  is_tv_subscriber_1  \
0           8.4         2.3                    0      0                 1.0   
1           0.0         0.0                    0      1                 0.0   
2          13.7         0.9                    0      1                 1.0   
3           0.0         0.0                    0      1                 0.0   
4           0.0         0.0                    0      1                 0.0   

   is_movie_package_subscriber_1  
0        