<a href="https://colab.research.google.com/github/W-Hailey/python_ML/blob/main/project_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import os
import seaborn as sn


In [3]:
train = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/project_2/train.csv")
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
test = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/project_2/test.csv")
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [5]:
#分類
cat_features = train.columns[train.dtypes == 'object']
num_features = train.columns[train.dtypes != 'object']

train['Transported'] = train['Transported'].map({True:1, False:0})
df = pd.concat([train, test], axis=0).copy() #合併train和test的第0行

df1 = df.copy()
df1 = df1.drop(['Cabin','Name'], axis=1)


In [6]:
df2 = df1.copy()

df2['group'] = df2.apply(lambda row: row['PassengerId'][0:4], axis=1)
df2['number'] = df2.apply(lambda row: int(row['PassengerId'][5:7]), axis=1)
df2['FamSize'] = df2.groupby(['group'])['number'].transform(np.sum)

df2 = df2.drop(['PassengerId','group','number'], axis=1)

features = df2.columns.tolist()
features.remove('Transported')
features.append('Transported')
df2 = df2[features].copy()

In [7]:
#標準化
num_missing_features = df2.columns[5:-1].to_list()
num_missing_features.append('Age')

cat_missing_features = df2.columns[:5].to_list()
cat_missing_features.remove('Age')

In [8]:
df2.columns[:5]

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP'], dtype='object')

In [9]:
from sklearn.preprocessing import StandardScaler

df3 = df2.copy()

scaler = StandardScaler()
df3[num_missing_features] = scaler.fit_transform(df3[num_missing_features])

In [10]:
#處理NA
df3.isnull().sum()

HomePlanet       288
CryoSleep        310
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
FamSize            0
Transported     4277
dtype: int64

In [11]:
from sklearn.impute import KNNImputer

df4 = df3.copy()

imputer = KNNImputer(n_neighbors=3) #用kNN，用距離NA最近的3個值計算替代NA的值
df4[num_missing_features] = imputer.fit_transform(df4[num_missing_features])

In [12]:
df5 = df4.copy()

for feature in cat_missing_features:
    df5[feature] = df5[feature].fillna(value=df5[feature].mode()[0])

In [13]:
df5.isnull().sum()

HomePlanet         0
CryoSleep          0
Destination        0
Age                0
VIP                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
FamSize            0
Transported     4277
dtype: int64

In [14]:
df6 = df5.copy()

for feature in ['HomePlanet','Destination']:
    df6 = pd.get_dummies(df6, columns=[feature])
    df6 = df6.drop([df6.columns[-1]], axis=1)

for feature in ['CryoSleep','VIP']:
    df6[feature] = df6[feature].map({True:1, False:0})

In [15]:
#Split Data
X_train = df6.iloc[:train.shape[0],:].copy()
X_train = X_train.drop(['Transported'], axis=1).copy()
y_train = train['Transported'].copy()

X_test = df6.iloc[train.shape[0]:,:]


In [16]:
#Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

RF = RandomForestClassifier(n_estimators = 200,
                            max_depth = 30,
                            max_features = 4,
                            min_samples_split = 10,
                            min_samples_leaf = 10,
                            random_state = 0)

RF_cv_scores = cross_validate(RF, X_train, y_train, scoring='accuracy', cv=5)
round(RF_cv_scores['test_score'].mean(),5)

0.79697

In [17]:
RF.fit(X_train, y_train)

RandomForestClassifier(max_depth=30, max_features=4, min_samples_leaf=10,
                       min_samples_split=10, n_estimators=200, random_state=0)

In [19]:
submission = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/project_2/sample_submission.csv')

In [20]:
y_pred = RF.predict(X_test.drop(['Transported'],axis=1))
submission['Transported'] = y_pred
submission['Transported'] = submission['Transported'].map({1:True, 0:False})
submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [21]:
submission.to_csv('submission.csv', index=False)