# Модель машинного обучения соревнования на kaggle при помощи catboost
Regression with an Abalone Dataset
## Цели
- Построить простейшую модель
  1. с данными train
  2. с данными abalne
  3. с совместными данными
  4. сравнить результаты

In [1]:
import pandas as pd

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

In [2]:
# загрузка даных
df = pd.read_csv('data/train.csv')
df_pred = pd.read_csv('data/test.csv')

df_abalone = pd.read_csv('data/abalone.csv')

In [3]:
df

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11
1,1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11
2,2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6
3,3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10
4,4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...,...
90610,90610,M,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,6
90611,90611,M,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,9
90612,90612,I,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,6
90613,90613,I,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,6


In [4]:
df_pred

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,90615,M,0.645,0.475,0.155,1.2380,0.6185,0.3125,0.3005
1,90616,M,0.580,0.460,0.160,0.9830,0.4785,0.2195,0.2750
2,90617,M,0.560,0.420,0.140,0.8395,0.3525,0.1845,0.2405
3,90618,M,0.570,0.490,0.145,0.8740,0.3525,0.1865,0.2350
4,90619,I,0.415,0.325,0.110,0.3580,0.1575,0.0670,0.1050
...,...,...,...,...,...,...,...,...,...
60406,151021,I,0.345,0.260,0.085,0.1775,0.0735,0.0265,0.0500
60407,151022,F,0.525,0.410,0.145,0.8445,0.3885,0.1670,0.2050
60408,151023,I,0.590,0.440,0.155,1.1220,0.3930,0.2000,0.2650
60409,151024,F,0.660,0.525,0.190,1.4935,0.5885,0.3575,0.4350


In [5]:
# привожу название признаков к единому виду
df_abalone.rename(columns={'Shucked weight': 'Whole weight.1',
                           'Viscera weight': 'Whole weight.2'}, inplace=True)
df_abalone

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


## Подготовка данных

In [6]:
# удаляю id
df.drop(columns='id', inplace=True)
df_pred.drop(columns='id', inplace=True)

In [7]:
# привожу признак Sex к категориальному типу
df['Sex'] = df['Sex'].astype('category')
df_pred['Sex'] = df_pred['Sex'].astype('category')
df_abalone['Sex'] = df_abalone['Sex'].astype('category')

## Модель catboost с train данными
- параметры по умолчанию
- kaggle public score = 0.14857

In [8]:
X = df.drop(columns='Rings')
y = df['Rings']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
cat_model = CatBoostRegressor(verbose=0, eval_metric='MSLE', cat_features=['Sex'], random_seed=42)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))
y_pred = cat_model.predict(X_test)
rmsle = mean_squared_log_error(y_test, y_pred)
pred = cat_model.predict(df_pred)
rmsle

0.02271990852061915

In [10]:
df_pred_duble = pd.read_csv('data/test.csv')
df_pred_Id = df_pred_duble['id']

output = pd.DataFrame({'id': df_pred_Id, 'Rings': pred})
output.to_csv('data/cat_model.csv', index=False)

## Модель catboost с abalon данными
- параметры по умолчанию
- kaggle public score = 0.15585

In [11]:
X = df_abalone.drop(columns='Rings')
y = df_abalone['Rings']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [12]:
cat_model = CatBoostRegressor(verbose=0, eval_metric='MSLE', cat_features=['Sex'], random_seed=42)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))
y_pred = cat_model.predict(X_test)
rmsle = mean_squared_log_error(y_test, y_pred)
pred = cat_model.predict(df_pred)
rmsle

0.030146924450527034

In [13]:
df_pred_duble = pd.read_csv('data/test.csv')
df_pred_Id = df_pred_duble['id']

output = pd.DataFrame({'id': df_pred_Id, 'Rings': pred})
output.to_csv('data/cat_model.csv', index=False)

## Модель catboost с совместными данными
- параметры по умолчанию
- kaggle public score = 0.14823

In [14]:
df_all = pd.concat([df, df_abalone], ignore_index=True)

In [15]:
X = df_all.drop(columns='Rings')
y = df_all['Rings']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [16]:
cat_model = CatBoostRegressor(verbose=0, eval_metric='MSLE', cat_features=['Sex'], random_seed=42)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))
y_pred = cat_model.predict(X_test)
rmsle = mean_squared_log_error(y_test, y_pred)
pred = cat_model.predict(df_pred)
rmsle

0.02240209280976738

In [17]:
df_pred_duble = pd.read_csv('data/test.csv')
df_pred_Id = df_pred_duble['id']

output = pd.DataFrame({'id': df_pred_Id, 'Rings': pred})
output.to_csv('data/cat_model.csv', index=False)