In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
def remove_label(df, label):
    """Split a DataFrame into its feature columns and its label column.

    Args:
        df: Source DataFrame; it is not modified.
        label: Name of the column to separate out.

    Returns:
        A ``(features, target)`` tuple: ``features`` is ``df`` without the
        ``label`` column, ``target`` is an independent copy of that column.
    """
    features = df.drop(columns=label)
    # Copy so later edits to the target never touch the source frame.
    target = df[label].copy()
    return features, target

In [3]:
# Load the training set, using the first CSV column as the row index,
# and preview the first ten rows.
train_csv = "/kaggle/input/playground-series-s4e4/train.csv"
train_data = pd.read_csv(train_csv, index_col=0)
train_data.head(n=10)

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9
5,F,0.61,0.48,0.17,1.201,0.5335,0.3135,0.3085,10
6,M,0.415,0.325,0.11,0.3315,0.1655,0.0715,0.13,9
7,F,0.61,0.49,0.15,1.1165,0.4955,0.2945,0.295,9
8,I,0.205,0.15,0.04,0.046,0.0145,0.0105,0.01,4
9,I,0.565,0.425,0.125,0.651,0.3795,0.142,0.18,8


In [4]:
# Per-column check for missing values: True means the column has at least one NaN.
train_data.isna().any()

Sex               False
Length            False
Diameter          False
Height            False
Whole weight      False
Whole weight.1    False
Whole weight.2    False
Shell weight      False
Rings             False
dtype: bool

In [5]:
# Print column dtypes, non-null counts and memory usage of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 90615 entries, 0 to 90614
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             90615 non-null  object 
 1   Length          90615 non-null  float64
 2   Diameter        90615 non-null  float64
 3   Height          90615 non-null  float64
 4   Whole weight    90615 non-null  float64
 5   Whole weight.1  90615 non-null  float64
 6   Whole weight.2  90615 non-null  float64
 7   Shell weight    90615 non-null  float64
 8   Rings           90615 non-null  int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 6.9+ MB


In [6]:
# One-hot encode the categorical 'Sex' column. The encoder is fitted on the
# training data and its learned categories are printed for inspection.
oh = OneHotEncoder()
sex = train_data.loc[:, ['Sex']]
sex_oh = oh.fit(sex).transform(sex)
print(oh.categories_)

[array(['F', 'I', 'M'], dtype=object)]


In [7]:
# Convert the encoder's sparse output to a dense NumPy array.
# The guard keeps this cell re-runnable: once `sex_oh` is already an ndarray
# it has no `.toarray()` method, and an unguarded call would raise.
if hasattr(sex_oh, "toarray"):
    sex_oh = sex_oh.toarray()
print(sex_oh)

[[1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [8]:
# Wrap the dense one-hot array in a DataFrame that shares the training index,
# with one column per category learned by the encoder.
category_names = oh.categories_[0]
encoded_oh = pd.DataFrame(
    data=sex_oh,
    columns=category_names,
    index=train_data.index,
)
encoded_oh

Unnamed: 0_level_0,F,I,M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0
...,...,...,...
90610,0.0,0.0,1.0
90611,0.0,0.0,1.0
90612,0.0,1.0,0.0
90613,0.0,1.0,0.0


In [9]:
# Replace the raw 'Sex' column with its one-hot columns.
# Dropping 'Sex' and any previously attached one-hot columns first (missing
# ones are ignored) makes this cell idempotent: re-running it neither raises
# a KeyError on 'Sex' nor duplicates the encoded columns. The resulting
# column order is unchanged: original numeric columns, then the one-hot ones.
train_data = (
    train_data
    .drop(columns=["Sex", *encoded_oh.columns], errors="ignore")
    .join(encoded_oh)
)
train_data

Unnamed: 0_level_0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings,F,I,M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11,1.0,0.0,0.0
1,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11,1.0,0.0,0.0
2,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6,0.0,1.0,0.0
3,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10,0.0,0.0,1.0
4,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
90610,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,6,0.0,0.0,1.0
90611,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,9,0.0,0.0,1.0
90612,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,6,0.0,1.0,0.0
90613,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,6,0.0,1.0,0.0


In [10]:
# Separate the "Rings" column (used as the regression target) from the features.
x_train, y_train = remove_label(train_data, "Rings")

In [11]:
# Display the feature matrix that is passed to the model.
x_train

Unnamed: 0_level_0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,F,I,M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,1.0,0.0,0.0
1,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,1.0,0.0,0.0
2,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,0.0,1.0,0.0
3,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,0.0,0.0,1.0
4,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
90610,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,0.0,0.0,1.0
90611,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,0.0,0.0,1.0
90612,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,0.0,1.0,0.0
90613,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,0.0,1.0,0.0


In [12]:
# Fit a random-forest regressor on the training features.
# `random_state` pins the bootstrap/feature sampling so a Restart & Run All
# reproduces the same model and predictions; `n_jobs=-1` builds the trees on
# all available cores without affecting the result.
# NOTE: despite its name, `clf` holds a regressor, not a classifier.
clf = RandomForestRegressor(n_estimators=250, random_state=42, n_jobs=-1)
clf.fit(x_train, y_train)

In [13]:
# Load the test set, using the first CSV column as the row index,
# and display it.
test_csv = "/kaggle/input/playground-series-s4e4/test.csv"
test_data = pd.read_csv(test_csv, index_col=0)
test_data

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
90615,M,0.645,0.475,0.155,1.2380,0.6185,0.3125,0.3005
90616,M,0.580,0.460,0.160,0.9830,0.4785,0.2195,0.2750
90617,M,0.560,0.420,0.140,0.8395,0.3525,0.1845,0.2405
90618,M,0.570,0.490,0.145,0.8740,0.3525,0.1865,0.2350
90619,I,0.415,0.325,0.110,0.3580,0.1575,0.0670,0.1050
...,...,...,...,...,...,...,...,...
151021,I,0.345,0.260,0.085,0.1775,0.0735,0.0265,0.0500
151022,F,0.525,0.410,0.145,0.8445,0.3885,0.1670,0.2050
151023,I,0.590,0.440,0.155,1.1220,0.3930,0.2000,0.2650
151024,F,0.660,0.525,0.190,1.4935,0.5885,0.3575,0.4350


In [14]:
# Per-column check for missing values in the test set (True = at least one NaN).
test_data.isna().any()

Sex               False
Length            False
Diameter          False
Height            False
Whole weight      False
Whole weight.1    False
Whole weight.2    False
Shell weight      False
dtype: bool

In [15]:
# Encode the test-set 'Sex' column with the encoder that was fitted on the
# training data (`oh`) instead of fitting a fresh encoder on the test set.
# Re-fitting on test data can silently yield a different set or order of
# columns than the model was trained on; with `transform`, the columns always
# match training, and an unseen category raises an error instead of going
# unnoticed.
sex = test_data[['Sex']]
sex_oh = oh.transform(sex)
print(oh.categories_)

[array(['F', 'I', 'M'], dtype=object)]


In [16]:
# Convert the sparse one-hot output for the test set to a dense NumPy array.
# The guard keeps this cell re-runnable: once `sex_oh` is already an ndarray
# it has no `.toarray()` method, and an unguarded call would raise.
if hasattr(sex_oh, "toarray"):
    sex_oh = sex_oh.toarray()
print(sex_oh)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [17]:
# Wrap the dense one-hot array in a DataFrame that shares the test index,
# with one column per category known to the encoder.
category_names = oh.categories_[0]
encoded_oh = pd.DataFrame(
    data=sex_oh,
    columns=category_names,
    index=test_data.index,
)
encoded_oh

Unnamed: 0_level_0,F,I,M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
90615,0.0,0.0,1.0
90616,0.0,0.0,1.0
90617,0.0,0.0,1.0
90618,0.0,0.0,1.0
90619,0.0,1.0,0.0
...,...,...,...
151021,0.0,1.0,0.0
151022,1.0,0.0,0.0
151023,0.0,1.0,0.0
151024,1.0,0.0,0.0


In [18]:
# Replace the raw 'Sex' column of the test set with its one-hot columns.
# Dropping 'Sex' and any previously attached one-hot columns first (missing
# ones are ignored) makes this cell idempotent: re-running it neither raises
# a KeyError on 'Sex' nor duplicates the encoded columns. The resulting
# column order is unchanged: original numeric columns, then the one-hot ones.
test_data = (
    test_data
    .drop(columns=["Sex", *encoded_oh.columns], errors="ignore")
    .join(encoded_oh)
)
test_data

Unnamed: 0_level_0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,F,I,M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
90615,0.645,0.475,0.155,1.2380,0.6185,0.3125,0.3005,0.0,0.0,1.0
90616,0.580,0.460,0.160,0.9830,0.4785,0.2195,0.2750,0.0,0.0,1.0
90617,0.560,0.420,0.140,0.8395,0.3525,0.1845,0.2405,0.0,0.0,1.0
90618,0.570,0.490,0.145,0.8740,0.3525,0.1865,0.2350,0.0,0.0,1.0
90619,0.415,0.325,0.110,0.3580,0.1575,0.0670,0.1050,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
151021,0.345,0.260,0.085,0.1775,0.0735,0.0265,0.0500,0.0,1.0,0.0
151022,0.525,0.410,0.145,0.8445,0.3885,0.1670,0.2050,1.0,0.0,0.0
151023,0.590,0.440,0.155,1.1220,0.3930,0.2000,0.2650,0.0,1.0,0.0
151024,0.660,0.525,0.190,1.4935,0.5885,0.3575,0.4350,1.0,0.0,0.0


In [19]:
# Predict ring counts for the test set and round each prediction to the
# nearest integer value (the array stays float-typed at this point).
y_pred = np.rint(clf.predict(test_data))
y_pred

array([10., 10., 10., ..., 12., 13.,  9.])

In [20]:
# Build the submission table: one row per test id with its integer ring count.
# The frame is built from an explicit dict so the 'id' column is filled
# directly from the test index values and does not depend on the index
# happening to be named 'id'.
submission = pd.DataFrame({
    'id': test_data.index.to_numpy(),
    'Rings': y_pred.astype(int),
})
submission

Unnamed: 0,id,Rings
0,90615,10
1,90616,10
2,90617,10
3,90618,11
4,90619,8
...,...,...
60406,151021,6
60407,151022,9
60408,151023,12
60409,151024,13


In [21]:
# Write the submission to the Kaggle working directory, omitting the row index.
submission.to_csv("/kaggle/working/submission.csv", index=False)