In [229]:
import numpy as np
import pandas as pd
import scipy.stats
import warnings
import graphviz
from tqdm import tqdm_notebook

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
import statistics as st

In [230]:
df = pd.read_csv('c://data/train__.csv')

In [231]:
# Обзор датасета
df.shape

(8693, 14)

In [232]:
df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [233]:
df.tail(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False
8692,9280_02,Europa,False,E/608/S,TRAPPIST-1e,44.0,False,126.0,4688.0,0.0,0.0,12.0,Propsh Hontichre,True


In [234]:
# предсказывать будем Transported

In [235]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [236]:
print(df['Age'].max())
print(df['Age'].min())

79.0
0.0


In [237]:
# Посмотрим что с уникальностью данных
df.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64

In [238]:
# Удалим столбцы PassengerId, Name, Cabin
df.drop(['PassengerId', 'Cabin', 'Name'], axis=1, inplace=True)

In [239]:
# Проверим датасет на пропуски
df.isnull().sum()

HomePlanet      201
CryoSleep       217
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

In [240]:
# Количественные признаки заполним медианой
num = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck", 'Age']
for i in num:
    df[i] = df[i].fillna(df[i].median())

df.isnull().sum()

HomePlanet      201
CryoSleep       217
Destination     182
Age               0
VIP             203
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
dtype: int64

In [241]:
# Теперь обработаем пропуски в категориальных признаках.
cat = ["HomePlanet","CryoSleep","Destination","VIP"]
for i in cat:
    df[i] = df[i].fillna(st.mode(df[i]))

df.isnull().sum()

# Пропусков нет

HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64

In [242]:
# В количественных признаках кроме возраста уберем выбросы
num = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck", 'Age']

for i in num:
    Q1 = np.percentile(df[i],25,interpolation="midpoint")
    Q3 = np.percentile(df[i],75,interpolation="midpoint")
    IQR = Q3-Q1
    upper = Q3 + 1.5*IQR
    lower = Q1 - 1.5*IQR
    df[i] = np.where(df[i] > upper,df[i].median(), np.where(df[i] < lower,df[i].median(),df[i]))

Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  Q1 = np.percentile(df[i],25,interpolation="midpoint")
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  Q3 = np.percentile(df[i],75,interpolation="midpoint")
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  Q1 = np.percentile(df[i],25,interpolation="midpoint")
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  Q3 = np.percentile(df[i],75,interpolation="midpoint")
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  Q1 = np.percentile(df[i],25,interpolation="midpoint")
Users of the modes 'nearest', 'lower', 'higher', o

In [243]:
# Сменим тип количественных признаков на int
df[num] = df[num].astype(dtype='int')

In [244]:
# Категориальные призныки обработаем методом LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in cat:
    df[i] = le.fit_transform(df[i])

df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,1,0,2,39,0,0,0,0,0,0,False
1,0,0,2,24,0,0,9,25,0,44,True
2,1,0,2,58,1,43,0,0,0,49,False
3,1,0,2,33,0,0,0,0,0,0,False
4,0,0,2,16,0,0,70,0,0,2,True


In [245]:
# разделим данные на train и test
X = df.drop('Transported', axis =1 )
y = df['Transported']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=33)

In [246]:
# Попробцем обучить по умолчанию и проверим на тесте
clf = DecisionTreeClassifier(random_state=55)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy_score(predictions, y_test)

0.6963772282921219

In [247]:
# Поизменяем значение глубины max_depth
for x in range(10):
    clf = DecisionTreeClassifier(random_state=42, max_depth = x+1)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print( x+1, accuracy_score(predictions, y_test))


clf = DecisionTreeClassifier(random_state=42, max_depth = 100)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print( 100, accuracy_score(predictions, y_test))

# Видими что вначале результат растёт, а затем падает. Берём максимум = 6

1 0.7291546866014951
2 0.7389304197814837
3 0.7389304197814837
4 0.7389304197814837
5 0.7320299022426682
6 0.7446808510638298
7 0.7446808510638298
8 0.7297297297297297
9 0.7331799884991375
10 0.730879815986199
100 0.6969522714203565


In [248]:
# Добавим параметр min_samples_leaf

for x in range(10):
    clf = DecisionTreeClassifier(random_state=42, max_depth = 6, min_samples_leaf = x+1)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print( x+1, accuracy_score(predictions, y_test))

clf = DecisionTreeClassifier(random_state=42, max_depth = 6, min_samples_leaf = 100)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print( 100, accuracy_score(predictions, y_test))

# значение плавно начинает уменьшаться. Берём 3

1 0.7446808510638298
2 0.7446808510638298
3 0.7446808510638298
4 0.7429557216791259
5 0.7429557216791259
6 0.7441058079355952
7 0.7441058079355952
8 0.7441058079355952
9 0.7435307648073606
10 0.7435307648073606
100 0.7418056354226567


In [249]:
# Потестируем с параметром min_samples_split

for x in range(10):
    clf = DecisionTreeClassifier(random_state=42, max_depth = 6, min_samples_leaf = 3, min_samples_split = x+2)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print( x+2, accuracy_score(predictions, y_test))

clf = DecisionTreeClassifier(random_state=42, max_depth = 6, min_samples_leaf = 3, min_samples_split = 100)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print( 100, accuracy_score(predictions, y_test))

# Влияние очень незначительно. Можно пренебречь

2 0.7446808510638298
3 0.7446808510638298
4 0.7446808510638298
5 0.7446808510638298
6 0.7446808510638298
7 0.7446808510638298
8 0.7446808510638298
9 0.7446808510638298
10 0.7446808510638298
11 0.7446808510638298
100 0.7452558941920644


Результат: наивысший 0.74468 с параметрами max_depth = 6, min_samples_leaf = 3 и достаточно простой предобработкой начальных данных 