In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [64]:
# Read the CSV of not-yet-preprocessed data, then preview its first rows.
raw_data_path = '../data/data_before_preprocessing.csv'
df = pd.read_csv(raw_data_path)
df.head()

Unnamed: 0,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,...,passenger_number_in_group,total_luxury_amenities,deck,num,side,first name,last name,group_size,HomePlanet,cabin_number_distribution_chunk
0,False,TRAPPIST-1e,39,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,...,1,0.0,B,0,P,Maham,Ofracculy,1,Europa,chunk1
1,False,TRAPPIST-1e,24,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,...,1,736.0,F,0,S,Juanna,Vines,1,Earth,chunk1
2,False,TRAPPIST-1e,58,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,...,1,10383.0,A,0,S,Altark,Susent,2,Europa,chunk1
3,False,TRAPPIST-1e,33,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,...,2,5176.0,A,0,S,Solam,Susent,2,Europa,chunk1
4,False,TRAPPIST-1e,16,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,...,1,1091.0,F,1,S,Willy,Santantines,1,Earth,chunk1


In [65]:
# Discard the columns that are not carried forward as features.
# `df` is rebound to the result rather than modified in place.
columns_to_drop = ['VIP', 'Name', 'last name', 'first name', 'num', 'passenger_group']
df = df.drop(columns=columns_to_drop)

In [66]:
# Cast the CryoSleep column to integers.
df = df.assign(CryoSleep=df['CryoSleep'].astype(int))


In [67]:
# Display the column labels that remain after the drop above.
df.columns

Index(['CryoSleep', 'Destination', 'Age', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'Transported',
       'passenger_number_in_group', 'total_luxury_amenities', 'deck', 'side',
       'group_size', 'HomePlanet', 'cabin_number_distribution_chunk'],
      dtype='object')

In [68]:
# Separate the 'Transported' target from the feature columns.
y = df['Transported']
X = df.drop(columns='Transported')

In [69]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [70]:
# One-hot encoder for the categorical features: the first level of each
# feature is dropped, the output is a dense array, and categories not seen
# during fit are ignored at transform time instead of raising.
encoder = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
    drop='first',
)

In [71]:
# Fit the encoder on the training rows only, then wrap the encoded array in
# a DataFrame that keeps the training index so it can be concatenated back.
categ_cols = ['Destination', 'HomePlanet', 'deck', 'side', 'cabin_number_distribution_chunk']
train_categoricals = X_train[categ_cols]
categories_enc = encoder.fit_transform(train_categoricals)
categories_enc_df = pd.DataFrame(
    data=categories_enc,
    index=X_train.index,
    columns=encoder.get_feature_names_out(categ_cols),
)

In [72]:
# Swap the raw categorical columns of the training set for their
# one-hot encoded counterparts.
X_train = pd.concat(
    [X_train.drop(columns=categ_cols), categories_enc_df],
    axis=1,
)


In [73]:
# Encode the test rows with the encoder already fitted on the training data.
categories_enc_test = encoder.transform(X_test[categ_cols])

# Wrap the encoded array in a DataFrame aligned to the test index.
encoded_feature_names = encoder.get_feature_names_out(categ_cols)
categories_enc_test_df = pd.DataFrame(
    data=categories_enc_test,
    index=X_test.index,
    columns=encoded_feature_names,
)

In [74]:
# Swap the raw categorical columns of the test set for their
# one-hot encoded counterparts.
X_test = pd.concat(
    [X_test.drop(columns=categ_cols), categories_enc_test_df],
    axis=1,
)

In [75]:
# Spending columns are compressed with log1p, i.e. log(1 + x), which maps 0 to 0.
expenses = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'total_luxury_amenities']
X_train[expenses] = np.log1p(X_train[expenses])
# Each split is transformed from its own values. Assigning the (already
# logged) training frame to X_test would be aligned on the row index, which
# the two splits do not share, and would leave the test columns as NaN.
X_test[expenses] = np.log1p(X_test[expenses])

# Scaling

In [76]:
# Scaler used below for the numeric feature columns; it is fitted on the
# training split and then applied to both splits.
scaler = StandardScaler()

In [77]:
# Columns that are passed through the scaler.
numerical_columns = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'passenger_number_in_group',
    'total_luxury_amenities',
    'group_size',
]

In [78]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to the test rows.
train_numeric = X_train[numerical_columns]
test_numeric = X_test[numerical_columns]
X_train_scaled = scaler.fit_transform(train_numeric)
X_test_scaled = scaler.transform(test_numeric)

In [79]:
# Restore the column labels and each split's row index on the scaled values.
X_train_scaled = pd.DataFrame(
    data=X_train_scaled,
    index=X_train.index,
    columns=numerical_columns,
)
X_test_scaled = pd.DataFrame(
    data=X_test_scaled,
    index=X_test.index,
    columns=numerical_columns,
)

In [80]:
# Replace the unscaled numeric columns with the scaled ones, matched on the row index.
X_train = X_train.drop(columns=numerical_columns).join(X_train_scaled, how='left')
X_test = X_test.drop(columns=numerical_columns).join(X_test_scaled, how='left')

In [81]:
# Persist the prepared splits with IPython's %store magic so that another
# notebook can restore them with `%store -r`.
%store X_train
%store X_test

%store y_train
%store y_test

Stored 'X_train' (DataFrame)
Stored 'X_test' (DataFrame)
Stored 'y_train' (Series)
Stored 'y_test' (Series)
