In [134]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [135]:
df = pd.read_csv("train_dataset.csv")
df.head()

Unnamed: 0,length,diameter,height,whole-weight,shucked-weight,viscera-weight,shell-weight,sex_F,sex_I,sex_M,rings
0,124,97,41,243.8,77.5,50.1,77.0,0,0,1,14
1,109,88,27,183.7,85.8,40.3,47.5,1,0,0,10
2,136,108,38,324.6,143.3,70.8,94.3,0,0,1,12
3,110,88,33,172.1,62.4,33.8,60.0,0,1,0,17
4,78,60,20,53.3,22.1,11.8,16.8,0,1,0,7


In [136]:
df.isnull().sum()

length            0
diameter          0
height            0
whole-weight      0
shucked-weight    0
viscera-weight    0
shell-weight      0
sex_F             0
sex_I             0
sex_M             0
rings             0
dtype: int64

In [137]:
df.dtypes

length              int64
diameter            int64
height              int64
whole-weight      float64
shucked-weight    float64
viscera-weight    float64
shell-weight      float64
sex_F               int64
sex_I               int64
sex_M               int64
rings               int64
dtype: object

In [138]:
X = df.drop(['rings'], axis = 1)
y = df.rings

In [139]:
#Split the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state= 0)
print("X_train:", X_train.shape)
print("X_test:", X_test.shape) 
print("y_train:", y_train.shape) 
print("y_test:", y_test.shape)

X_train: (1775, 10)
X_test: (1775, 10)
y_train: (1775,)
y_test: (1775,)


In [140]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from math import sqrt

In [141]:
def score(model, title = "Default"):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("RMSE estimate: {}, std: {}".format(rmse.mean(), rmse.std()))

In [142]:
from sklearn.linear_model import LinearRegression
lir = LinearRegression()
score(lir, "Linear Regression")

RMSE estimate: 2.21168461595196, std: 0.0


In [143]:
from sklearn.linear_model import Ridge
rl = Ridge(alpha = 50, random_state=42)
score(rl, "Ridge Model")

RMSE estimate: 2.2124526041742114, std: 0.0


In [144]:
from sklearn.linear_model import Lasso
lm = Lasso(alpha = 0.1, random_state= 42)
score(lm, "Lasso Model")

RMSE estimate: 2.2293979679238483, std: 0.0


In [145]:
from sklearn.ensemble import RandomForestRegressor
fl = RandomForestRegressor()
score(fl, "Random Forest")

RMSE estimate: 2.2097593903817407, std: 0.0


In [146]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
score(dt, "Decision Tree")

RMSE estimate: 2.962583569867811, std: 0.0


In [147]:
from sklearn.svm import SVR
sv = SVR()
score(sv, "SVR")

RMSE estimate: 2.2795888256845998, std: 0.0


In [148]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor()
score(gbr, "Gradient Boosting")

RMSE estimate: 2.221171723533025, std: 0.0


In [149]:
from sklearn.ensemble import HistGradientBoostingRegressor
hgbr =HistGradientBoostingRegressor()
score(hgbr, "Histogram GB")

RMSE estimate: 2.2915336234293306, std: 0.0


In [150]:
from xgboost import XGBRegressor
xgbr = XGBRegressor()
score(xgbr, "XG Boosting")

RMSE estimate: 2.379368757298989, std: 0.0


In [151]:
from catboost import CatBoostRegressor
catbr = CatBoostRegressor(verbose= 0, n_estimators= 100)
score(catbr, "Cat Boosting")

RMSE estimate: 2.1927184041291587, std: 0.0


In [152]:
df_t = pd.read_csv("test_dataset.csv")
df_t.head()

Unnamed: 0,length,diameter,height,whole-weight,shucked-weight,viscera-weight,shell-weight,sex_F,sex_I,sex_M
0,111,87,29,139.5,52.4,31.5,48.0,0,1,0
1,70,53,19,39.8,14.6,9.8,12.0,0,1,0
2,77,57,17,48.8,24.3,8.9,13.6,0,1,0
3,122,99,37,230.6,107.2,58.1,49.0,1,0,0
4,115,95,32,222.8,99.1,54.9,58.0,0,0,1


In [153]:
df_t.isnull().sum()

length            0
diameter          0
height            0
whole-weight      0
shucked-weight    0
viscera-weight    0
shell-weight      0
sex_F             0
sex_I             0
sex_M             0
dtype: int64

In [154]:
df_t.dtypes

length              int64
diameter            int64
height              int64
whole-weight      float64
shucked-weight    float64
viscera-weight    float64
shell-weight      float64
sex_F               int64
sex_I               int64
sex_M               int64
dtype: object

In [155]:
target = catbr.predict(df_t)
d = pd.DataFrame(target)
#d.index = df_t.id
d.columns = ['rings']
d.to_csv('submission.csv', index= False)

In [156]:
d.head()

Unnamed: 0,rings
0,10.217614
1,7.525539
2,6.902094
3,9.313872
4,9.514016
