In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read both splits, taking the first CSV column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)
train.shape, test.shape


((1253, 9), (2924, 8))

In [15]:
# The raw files spell the column 'Lenght'; correct the header in both frames.
header_fix = {'Lenght': 'Length'}
for frame in (train, test):
    frame.rename(columns=header_fix, inplace=True)
train.head()

Unnamed: 0_level_0,Gender,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


# 1. Preprocessing

In [16]:
# sex -> one hot encoding
# The dummy columns are derived from train and test is forced onto exactly the
# same set (same names, same order). Encoding each split independently would
# give mismatched feature matrices whenever a category is absent from one of
# them. A category that appears only in test is dropped (all indicators 0).
gender_train = pd.get_dummies(train['Gender'], prefix='S')
gender_test = (
    pd.get_dummies(test['Gender'], prefix='S')
    .reindex(columns=gender_train.columns, fill_value=0)
)
train = train.join(gender_train).drop('Gender', axis=1)
test = test.join(gender_test).drop('Gender', axis=1)

In [17]:
# PCA
from sklearn.decomposition import PCA
# Replace the four weight measurements with their first two principal
# components. The projection is learned on train only and then applied
# unchanged to test.
features = ['Whole Weight','Shucked Weight','Viscra Weight', 'Shell Weight']
pca = PCA(n_components=2).fit(train[features])

for frame in (train, test):
    components = pca.transform(frame[features])
    frame['PCA1'] = components[:, 0]
    frame['PCA2'] = components[:, 1]
    frame.drop(columns=features, inplace=True)

print(pca.explained_variance_ratio_)
print(train.shape, test.shape)

[0.98310174 0.01243965]
(1253, 9) (2924, 8)


In [18]:
# normalize
# Standardise every non-indicator feature column (the one-hot columns and the
# target are left untouched). The mean and standard deviation are computed on
# train ONLY and reused for test, so both splits live in the same feature space
# -- the same convention the PCA step follows by fitting on train only.
numerical_features = [col for col in train.columns if col not in ['S_F','S_I','S_M','Target']]
train_mean = train[numerical_features].mean()
train_std = train[numerical_features].std()
train[numerical_features] = (train[numerical_features] - train_mean) / train_std
test[numerical_features] = (test[numerical_features] - train_mean) / train_std

In [19]:
# Expose the design matrices under their modelling names, then detach the
# label column from the training frame (x_train and train are the same object).
x_train, x_test = train, test
y_train = x_train.pop('Target')

# 2. Model selection

In [20]:
# LR, DT, RF, SVM
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2; it is not used in
    # this notebook, so keep the name bound instead of failing the whole cell.
    plot_confusion_matrix = None
import warnings
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# define models
# One candidate per model family; the tree-based estimators are seeded so their
# fits are repeatable.
LR = LinearRegression()
DT = DecisionTreeRegressor(random_state=123)
RF = RandomForestRegressor(random_state=123)
SVM = SVR()


In [21]:
# Hold out 10% of the row positions for validation. The split is seeded so the
# validation scores below are reproducible across kernel restarts.
# NOTE: from here on `train` is a list of positional indices, no longer the
# DataFrame; the DataFrame stays reachable through `x_train`.
train, val = train_test_split(range(x_train.shape[0]), test_size=0.1, random_state=123)

In [22]:
# Materialise the fit / hold-out partitions from the positional index lists.
x_train_, y_train_ = x_train.iloc[train], y_train.iloc[train]
x_val_, y_val_ = x_train.iloc[val], y_train.iloc[val]

In [23]:
# Fit every candidate on the training partition and keep its hold-out
# predictions, in the same order as `names`.
names = ["LR", "DT", "RF", "SVM"]
models = [LR, DT, RF, SVM]
preds = [
    estimator.fit(x_train_, y_train_).predict(x_val_)
    for estimator in models
]
    

In [24]:
import numpy as np

def NMAE(true, pred):
    """Normalised mean absolute error: MAE divided by the mean of |true|.

    Parameters
    ----------
    true : array-like
        Ground-truth values.
    pred : array-like
        Predicted values, in the same order as ``true``.

    Returns
    -------
    float
        ``mean(|true - pred|) / mean(|true|)``. Lower is better. If the mean
        absolute true value is zero the division yields ``inf``/``nan``.

    Notes
    -----
    Both inputs are converted to plain arrays first, so the comparison is
    positional. Subtracting two pandas Series directly aligns them on their
    index labels instead, which silently produces NaN when the labels differ.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score
# Score each model's hold-out predictions. The number of models is taken from
# `names` / `preds` rather than hard-coded, so adding or removing a candidate
# needs no change here.
scores = [NMAE(y_val_, pred) for pred in preds]
for name, score in zip(names, scores):
    print(name, score)

LR 0.15638796507221026
DT 0.21342925659472423
RF 0.15872102318145484
SVM 0.1451653599871068


# 3. Inference

In [25]:
# Re-create every candidate from scratch, refit it on the full training data
# and collect its predictions for the test set (same order as `names`).
names = ["LR", "DT", "RF", "SVM"]

LR, SVM = LinearRegression(), SVR()
DT = DecisionTreeRegressor(random_state=123)
RF = RandomForestRegressor(random_state=123)
models = [LR, DT, RF, SVM]

preds = []
for estimator in models:
    estimator.fit(x_train, y_train)
    preds.append(estimator.predict(x_test))

In [28]:
from pathlib import Path

# Write one submission file per model into result/PCA/.
out_dir = Path('result/PCA')
out_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing folders

# The template is loop-invariant, so it is read once rather than per model.
template = pd.read_csv('dataset/sample_submission.csv')
for name, pred in zip(names, preds):
    # assign() returns a fresh frame with the Target column replaced wholesale,
    # so the template is never mutated and the column takes the dtype of the
    # predictions.
    sub = template.assign(Target=pred)
    sub.to_csv(out_dir / f'{name}.csv', index=False)