In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read both splits, using the first CSV column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)
# Display (rows, columns) of each frame as the cell output.
train.shape, test.shape


((1253, 9), (2924, 8))

In [2]:
# Rename the misspelled 'Lenght' column (when present) to 'Length' in both
# frames; the rename is done in place so `train` / `test` keep their identity.
column_fixes = {'Lenght': 'Length'}
for frame in (train, test):
    frame.rename(columns=column_fixes, inplace=True)
train.head()

Unnamed: 0_level_0,Gender,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


# 1. Preprocessing

In [3]:
# Categorical: the model is tree-based, so one-hot encoding is not needed;
# each Gender label is replaced by an integer code instead.
gender_codes = {'M': 1, 'F': 2, 'I': 3}
for frame in (train, test):
    # Map through a single lookup instead of six per-label .loc writes, so the
    # column comes out numeric rather than as an object column of mixed values.
    # Values without a code (including already-encoded 1/2/3 when the cell is
    # re-run) are passed through unchanged, which keeps the cell idempotent.
    frame['Gender'] = frame['Gender'].map(lambda label: gender_codes.get(label, label))

In [5]:
# PCA: compress the four weight measurements into one derived feature.
from sklearn.decomposition import PCA
features = ['Whole Weight', 'Shucked Weight', 'Viscra Weight', 'Shell Weight']

# Fit the projection on the training weights only; the same fitted PCA is
# reused for the test set below.
train_weights = train[features]
pca = PCA(n_components=2)
pca.fit(train_weights)

# Two components are fitted, but only the first one is kept as a feature
# (the second, 'PCA2', is deliberately not added). The raw weight columns
# are then removed from the frame.
train['PCA1'] = pca.transform(train_weights)[:, 0]
train.drop(columns=features, inplace=True)

test_weights = test[features]
test['PCA1'] = pca.transform(test_weights)[:, 0]
test.drop(columns=features, inplace=True)

# Share of variance captured by each fitted component, then the new shapes.
print(pca.explained_variance_ratio_)
print(train.shape, test.shape)

[0.98310174 0.01243965]
(1253, 6) (2924, 5)


In [6]:
# normalize
# numerical_features = [x for x in train.keys() if x not in ['S_F','S_I','S_M','Target']]
# train[numerical_features]=(train[numerical_features] -train[numerical_features].mean())/train[numerical_features].std()
# test[numerical_features]=(test[numerical_features] -test[numerical_features].mean())/test[numerical_features].std()

In [6]:
# Split off the regression target. `pop` also removes 'Target' from `train`,
# so whatever columns remain are the model inputs.
y_train = train.pop('Target').to_numpy()
# Convert the remaining feature columns of both frames to plain arrays.
x_train, x_test = train.to_numpy(), test.to_numpy()

# 2. Train a Random forest

In [9]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
random_seed = 123

# Hyper-parameter grid: 5 x 3 x 5 = 75 candidates, each cross-validated below.
params={
    'n_estimators':[100,102,104,106,108],
    'max_depth':[3,4,5],
    'min_samples_leaf':[2,4,6,8,10],
    # 'min_samples_split':[2,4,6,8,10],
}

# modeling
# 'absolute_error' is the current spelling of the former 'mae' criterion:
# 'mae' was deprecated in scikit-learn 1.0 and removed in 1.2, so the old
# value makes this cell fail on current releases. Requires scikit-learn >= 1.0.
RF = RandomForestRegressor(random_state=random_seed, criterion='absolute_error', n_jobs=-1)
print(RF)
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than by the NMAE metric used elsewhere in
# this notebook -- confirm that this is the intended selection criterion.
gridRF = GridSearchCV(RF, param_grid=params, cv=5)
gridRF.fit(x_train, y_train)

RandomForestRegressor(criterion='mae', n_jobs=-1, random_state=123)


GridSearchCV(cv=5,
             estimator=RandomForestRegressor(criterion='mae', n_jobs=-1,
                                             random_state=123),
             param_grid={'max_depth': [3, 4, 5],
                         'min_samples_leaf': [2, 4, 6, 8, 10],
                         'n_estimators': [100, 102, 104, 106, 108]})

In [10]:
def NMAE(true, pred):
    """Normalized mean absolute error: MAE divided by the mean absolute target.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, same shape as ``true``.

    Returns
    -------
    float
        ``mean(|true - pred|) / mean(|true|)``; lower is better.
        The result is not finite when ``mean(|true|)`` is zero.
    """
    # Coerce to float arrays so plain Python lists work and pandas inputs are
    # compared position by position instead of being aligned on index labels.
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    mae = np.mean(np.abs(true - pred))
    return mae / np.mean(np.abs(true))

In [11]:
# Estimator refitted by the grid search with the winning parameter combination.
bestRF = gridRF.best_estimator_

# Winning parameters and their mean cross-validated score (computed with the
# estimator's default scorer, since the search was built without `scoring`).
print('Best parameters:', gridRF.best_params_)
print('Best score:', gridRF.best_score_)

# NMAE on the training data itself -- these rows were seen during fitting, so
# this is a fit check rather than a held-out estimate.
y_pred = bestRF.predict(x_train)
NMAE(y_train, y_pred)

Best parameters: {'max_depth': 5, 'min_samples_leaf': 8, 'n_estimators': 108}
Best score: 0.3432878619161478


0.16214215423152625

# 3. Inference

In [12]:
import os

# Predict the target for every test row with the tuned forest.
predictions = bestRF.predict(x_test)

# The sample submission provides the file layout; only 'Target' is overwritten.
sub = pd.read_csv('dataset/sample_submission.csv')
# Replace the whole column instead of writing into it through .loc: the
# predictions are floats, and an in-place write into an existing column of a
# different dtype (e.g. an integer placeholder -- not visible from here) runs
# into pandas' incompatible-dtype upcast warning/error.
sub['Target'] = predictions

# to_csv does not create missing directories, so make sure 'result/' exists.
os.makedirs('result', exist_ok=True)
sub.to_csv('result/RF_.csv', index=False)
