# 데이터 확인

### 1. train.csv : 학습 데이터
- id : 샘플 아이디
- Species: 펭귄의 종을 나타내는 문자열
- Island : 샘플들이 수집된 Palmer Station 근처 섬 이름
- Clutch Completion : 관찰된 펭귄 둥지의 알이 2개인 경우 Full Clutch이며 Yes로 표기
- Culmen Length (mm) : 펭귄 옆모습 기준 부리의 가로 길이
- Culmen Depth (mm) : 펭귄 옆모습 기준 부리의 세로 길이
- Flipper Length (mm) : 펭귄의 팔(날개) 길이
- Sex : 펭귄의 성별
- Delta 15 N (o/oo)  : 토양에 따라 변화하는 안정 동위원소 15N:14N의 비율
- Delta 13 C (o/oo) : 먹이에 따라 변화하는 안정 동위원소 13C:12C의 비율
- Body Mass (g): 펭귄의 몸무게를 나타내는 숫자 (g)


### 2. test.csv : 테스트 데이터
- id : 샘플 아이디
- Species: 펭귄의 종을 나타내는 문자열
- Island : 샘플들이 수집된 Palmer Station 근처 섬 이름
- Clutch Completion : 관찰된 펭귄 둥지의 알이 2개인 경우 Full Clutch이며 Yes로 표기
- Culmen Length (mm) : 펭귄 옆모습 기준 부리의 가로 길이
- Culmen Depth (mm) : 펭귄 옆모습 기준 부리의 세로 길이
- Flipper Length (mm) : 펭귄의 팔(날개) 길이
- Sex : 펭귄의 성별
- Delta 15 N (o/oo)  : 토양에 따라 변화하는 안정 동위원소 15N:14N의 비율
- Delta 13 C (o/oo) : 먹이에 따라 변화하는 안정 동위원소 13C:12C의 비율

# 데이터 불러오기

In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


df= pd.read_csv('dataset/train.csv')
tst_df= pd.read_csv('dataset/test.csv')

# 데이터 인코딩

In [145]:
df['Sex']=np.where(df['Sex'].values=='MALE',1,np.where(df['Sex'].values=='FEMALE',0,np.nan))
tst_df['Sex']=np.where(tst_df['Sex'].values=='MALE',1,np.where(tst_df['Sex'].values=='FEMALE',0,np.nan))
df['Clutch Completion']=np.where(df['Clutch Completion'].values=='Yes',1,0)
tst_df['Clutch Completion']=np.where(tst_df['Clutch Completion'].values=='Yes',1,0)
df = pd.concat([df,pd.get_dummies(df[['Island','Species']])],axis=1)
tst_df = pd.concat([tst_df,pd.get_dummies(tst_df[['Island','Species']])],axis=1)
df = df.drop(['Island','Species'],axis=1)
tst_df = tst_df.drop(['Island','Species'],axis=1)

# 결측값 삭제 및 대체

In [146]:
sex_features = ['Culmen Length (mm)', 'Culmen Depth (mm)',
       'Flipper Length (mm)','Species_Adelie Penguin (Pygoscelis adeliae)',
       'Species_Chinstrap penguin (Pygoscelis antarctica)',
       'Species_Gentoo penguin (Pygoscelis papua)', 'Island_Biscoe', 'Island_Dream', 'Island_Torgersen'
       ]


In [147]:
# sex
from sklearn.ensemble import AdaBoostClassifier

sex_model = AdaBoostClassifier()
sex_model.fit(df[sex_features].iloc[df['Sex'].dropna().index],df['Sex'].iloc[df['Sex'].dropna().index])
df['Sex'].iloc[np.where(df['Sex'].isnull()==True)] =  sex_model.predict(df[df['Sex'].isnull()][sex_features])
tst_df['Sex'].iloc[np.where(tst_df['Sex'].isnull()==True)] =  sex_model.predict(tst_df[tst_df['Sex'].isnull()][sex_features])

In [122]:
Delta_features = ['Culmen Length (mm)', 'Culmen Depth (mm)',
       'Flipper Length (mm)','Species_Adelie Penguin (Pygoscelis adeliae)',
       'Species_Chinstrap penguin (Pygoscelis antarctica)',
       'Species_Gentoo penguin (Pygoscelis papua)', 'Island_Biscoe', 'Island_Dream', 'Island_Torgersen','Sex'
       ]

In [148]:
# delta
from sklearn.ensemble import AdaBoostRegressor

ada_model = AdaBoostRegressor()
ada_model.fit(df[Delta_features].iloc[df['Delta 15 N (o/oo)'].dropna().index]
                               ,df['Delta 15 N (o/oo)'].iloc[df['Delta 15 N (o/oo)'].dropna().index])
df['Delta 15 N (o/oo)'].iloc[np.where(df['Delta 15 N (o/oo)'].isnull()==True)] =  ada_model.predict(df[df['Delta 15 N (o/oo)'].isnull()][Delta_features])
tst_df['Delta 15 N (o/oo)'].iloc[np.where(tst_df['Delta 15 N (o/oo)'].isnull()==True)] =  ada_model.predict(tst_df[tst_df['Delta 15 N (o/oo)'].isnull()][Delta_features])

In [149]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()
lr_model.fit(df[Delta_features].iloc[df['Delta 13 C (o/oo)'].dropna().index]
                               ,df['Delta 13 C (o/oo)'].iloc[df['Delta 13 C (o/oo)'].dropna().index])
df['Delta 13 C (o/oo)'].iloc[np.where(df['Delta 13 C (o/oo)'].isnull()==True)] =  lr_model.predict(df[df['Delta 13 C (o/oo)'].isnull()][Delta_features])
tst_df['Delta 13 C (o/oo)'].iloc[np.where(tst_df['Delta 13 C (o/oo)'].isnull()==True)] =  lr_model.predict(tst_df[tst_df['Delta 13 C (o/oo)'].isnull()][Delta_features])

In [150]:
tst_df.isnull().sum()

id                                                   0
Clutch Completion                                    0
Culmen Length (mm)                                   0
Culmen Depth (mm)                                    0
Flipper Length (mm)                                  0
Sex                                                  0
Delta 15 N (o/oo)                                    0
Delta 13 C (o/oo)                                    0
Island_Biscoe                                        0
Island_Dream                                         0
Island_Torgersen                                     0
Species_Adelie Penguin (Pygoscelis adeliae)          0
Species_Chinstrap penguin (Pygoscelis antarctica)    0
Species_Gentoo penguin (Pygoscelis papua)            0
dtype: int64

In [151]:
df.drop(['Clutch Completion'],axis=1,inplace=True)
tst_df.drop(['Clutch Completion'],axis=1,inplace=True)

In [152]:
tst_df.columns

Index(['id', 'Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)',
       'Sex', 'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)', 'Island_Biscoe',
       'Island_Dream', 'Island_Torgersen',
       'Species_Adelie Penguin (Pygoscelis adeliae)',
       'Species_Chinstrap penguin (Pygoscelis antarctica)',
       'Species_Gentoo penguin (Pygoscelis papua)'],
      dtype='object')

In [153]:
df.columns = ["id",'clength','cdepth','flength','sex','nrate','crate','g',
                 'i_biscoe','i_dream','i_torgersen','s_adelie','s_chin','s_gentoo',]

tst_df.columns= ["id",'clength','cdepth','flength','sex','nrate','crate',
                 'i_biscoe','i_dream','i_torgersen','s_adelie','s_chin','s_gentoo',]

# predict

In [154]:
X =df[['clength','cdepth','flength','sex','nrate','crate',
                 'i_biscoe','i_dream','i_torgersen','s_adelie','s_chin','s_gentoo',]]
Y= df[['g']]

In [155]:
from sklearn.model_selection import train_test_split
seed=0
x_train, x_val, y_train, y_val = train_test_split(X,Y,test_size=0.2,random_state=seed)

In [156]:
def get_model_cv_prediction(model, X_data, y_target):
    neg_mse_scores = cross_val_score(model, X_data, y_target, scoring="neg_mean_squared_error", cv = 5)
    rmse_scores  = np.sqrt(-1 * neg_mse_scores)
    avg_rmse = np.mean(rmse_scores)
    print('##### ',model.__class__.__name__ , ' #####')
    print(' 5 교차 검증의 평균 RMSE : {0:.3f} '.format(avg_rmse))

### LinearRegression

In [157]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score


reg = LinearRegression()
reg.fit(x_train,y_train)

print(reg.score(x_val,y_val))
print()
print(get_model_cv_prediction(reg,x_val,y_val))

0.8308896122735631

#####  LinearRegression  #####
 5 교차 검증의 평균 RMSE : 329.127 
None


### GradientBoostingRegressor

In [158]:
from sklearn import ensemble
clf = ensemble.GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,
                                         learning_rate = 0.1, loss = 'ls')

clf.fit(x_train, y_train)

print(clf.score(x_val,y_val))
print()
print(get_model_cv_prediction(clf,x_val,y_val))

0.7318649402209413

#####  GradientBoostingRegressor  #####
 5 교차 검증의 평균 RMSE : 523.372 
None


### Ridge/ Lasso


In [159]:
from sklearn.linear_model import Ridge , Lasso

ridge_reg = Ridge(alpha=10)
ridge_reg.fit(x_train, y_train)

lasso_reg = Lasso(alpha=0.01)
lasso_reg.fit(x_train, y_train)

print(ridge_reg.score(x_val,y_val))
print(lasso_reg.score(x_val,y_val))

print()
print(get_model_cv_prediction(ridge_reg,x_val,y_val))
print(get_model_cv_prediction(lasso_reg,x_val,y_val))

0.8422696036016932
0.8309268798080788

#####  Ridge  #####
 5 교차 검증의 평균 RMSE : 340.120 
None
#####  Lasso  #####
 5 교차 검증의 평균 RMSE : 345.034 
None


# Elasticnet

In [160]:
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=1.0, l1_ratio=0.5)
enet.fit(x_train, y_train)

print(enet.score(x_val,y_val))
print()
print(get_model_cv_prediction(enet,x_val,y_val))

0.7987146983311875

#####  ElasticNet  #####
 5 교차 검증의 평균 RMSE : 338.175 
None


# tst_df 예측

In [161]:
from sklearn.metrics import mean_squared_error

lr_pred = reg.predict(x_val)
ridge_pred = ridge_reg.predict(x_val)
lasso_pred = lasso_reg.predict(x_val)


print('Linear - RMSE for test data: ', np.sqrt(mean_squared_error(y_val, lr_pred)))
print('Ridge - RMSE for test data: ', np.sqrt(mean_squared_error(y_val, ridge_pred)))
print('Lasso - RMSE for test data: ', np.sqrt(mean_squared_error(y_val, lasso_pred)))




Linear - RMSE for test data:  306.66671879806097
Ridge - RMSE for test data:  296.1687287370829
Lasso - RMSE for test data:  306.6329262487135


In [162]:
from sklearn.model_selection import GridSearchCV

params = {'alpha':[0.5,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2,10]}
params_2 = {'alpha':[0.5,1,2,2.5,3,3.5,4]}

ridge_model = Ridge()
lasso_model = Lasso()

# define the grid search
Ridge_reg= GridSearchCV(ridge_model, params, scoring='neg_mean_squared_error',cv=5)
Lasso_reg= GridSearchCV(lasso_model, params, scoring='neg_mean_squared_error',cv=5)

#fit the grid search
Ridge_reg.fit(x_val,y_val)
Lasso_reg.fit(x_val,y_val)

# best estimator
print(Ridge_reg.best_estimator_)
print(Lasso_reg.best_estimator_)

Ridge(alpha=0.5)
Lasso(alpha=10)


In [163]:
from sklearn.linear_model import Ridge , Lasso

ridge_reg = Ridge(alpha=10)
ridge_reg.fit(x_train, y_train)


Ridge(alpha=10)

In [165]:
X =tst_df[['clength','cdepth','flength','sex','nrate','crate',
                 'i_biscoe','i_dream','i_torgersen','s_adelie','s_chin','s_gentoo',]]

In [166]:
pred = ridge_reg.predict(X)

tst_df['Body Mass (g)'] = pred

In [167]:
test= tst_df[['id','Body Mass (g)']]

In [168]:
test.head()

Unnamed: 0,id,Body Mass (g)
0,0,4455.996803
1,1,5572.55855
2,2,3662.054305
3,3,3469.317357
4,4,3231.869958


In [144]:
test.to_csv('dataset/predicted_test_5.csv',index=False)