# Importing required modules

In [1]:
# importing required modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the competition CSVs; training rows holding any missing value are discarded on load
train_data = pd.read_csv("../input/30-days-of-ml/train.csv").dropna(axis=0)
test_data = pd.read_csv("../input/30-days-of-ml/test.csv")

# Report the remaining dimensions, then preview the first rows
print(f"Shape: {train_data.shape}")
train_data.head()

Shape: (300000, 26)


Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,B,B,B,C,B,B,A,E,C,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
1,2,B,B,A,A,B,D,A,F,A,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,3,A,A,A,C,B,D,A,D,A,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,4,B,B,A,C,B,D,A,E,C,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
4,6,A,A,A,C,B,D,A,E,A,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


***Columns***

In [3]:
# Show each column label next to its positional index
[f"{position} - {label}" for position, label in enumerate(train_data.columns)]

['0 - id',
 '1 - cat0',
 '2 - cat1',
 '3 - cat2',
 '4 - cat3',
 '5 - cat4',
 '6 - cat5',
 '7 - cat6',
 '8 - cat7',
 '9 - cat8',
 '10 - cat9',
 '11 - cont0',
 '12 - cont1',
 '13 - cont2',
 '14 - cont3',
 '15 - cont4',
 '16 - cont5',
 '17 - cont6',
 '18 - cont7',
 '19 - cont8',
 '20 - cont9',
 '21 - cont10',
 '22 - cont11',
 '23 - cont12',
 '24 - cont13',
 '25 - target']

# Prediction target and features

In [4]:
# Column groups: categorical inputs and continuous inputs
categorical_features = [
    'cat0', 'cat1', 'cat2', 'cat3', 'cat4',
    'cat5', 'cat6', 'cat7', 'cat8', 'cat9',
]
numerical_features = [
    'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
    'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13',
]

# Target vector, and the feature matrix with the row identifier and target removed
y = train_data["target"]
X = train_data.drop(columns=['id', 'target'])

# Continuous-only subset of the features
num_X = X[numerical_features]

In [5]:
# Dimensions of the feature matrix, followed by a preview of its first rows
print(X.shape)
X.head(n=5)

(300000, 24)


Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,B,B,B,C,B,B,A,E,C,N,...,0.610706,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985
1,B,B,A,A,B,D,A,F,A,O,...,0.276853,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083
2,A,A,A,C,B,D,A,D,A,F,...,0.285074,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846
3,B,B,A,C,B,D,A,E,C,K,...,0.284667,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682
4,A,A,A,C,B,D,A,E,A,N,...,0.287595,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823


In [6]:
# Dimensions of the numerical-only frame. Printed explicitly: a bare expression
# that is not the last statement of a cell produces no output.
print(num_X.shape)
num_X.head()

Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,0.20147,-0.014822,0.669699,0.136278,0.610706,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985
1,0.743068,0.367411,1.021605,0.365798,0.276853,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083
2,0.742708,0.310383,-0.012673,0.576957,0.285074,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846
3,0.429551,0.620998,0.577942,0.28061,0.284667,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682
4,1.058291,0.367492,-0.052389,0.232407,0.287595,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823


# Splitting the data into training and validation sets before preprocessing

In [7]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# score_dataset function to get the MAE and the predictions

In [8]:
def score_dataset(X_train, X_valid, y_train, y_valid, *, n_estimators=100, random_state=0, n_jobs=None):
    """Fit a random forest on the training split and score it on the validation split.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features and target used to evaluate the fitted model.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : default 0
        Seed forwarded to ``RandomForestRegressor``; the default reproduces the
        previously hard-coded value.
    n_jobs : default None
        Forwarded to ``RandomForestRegressor``; ``None`` keeps the estimator's
        own default, ``-1`` uses all available cores.

    Returns
    -------
    tuple
        ``(mae, preds)``: the mean absolute error of the predictions against
        ``y_valid``, and the predictions for ``X_valid``.
    """
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds), preds

# Preprocessing the Data
***(Using One Hot Encoding)***

In [9]:
# Apply one-hot encoder to each column with categorical data.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new keyword
# first and fall back to the old one on releases that do not know it.
try:
    oh_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    oh_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# The encoder is fitted on the training split only, then applied to both splits
oh_encoder.fit(train_X[categorical_features])
oh_train_values = oh_encoder.transform(train_X[categorical_features])
oh_valid_values = oh_encoder.transform(valid_X[categorical_features])

# Label the encoded columns "0", "1", ... as strings: scikit-learn >= 1.2 raises a
# TypeError for frames whose column names mix integers with strings.
oh_column_labels = [str(i) for i in range(oh_train_values.shape[1])]

# Build the encoded frames on the original row index so they align on concat
oh_train_cols = pd.DataFrame(oh_train_values, index=train_X.index, columns=oh_column_labels)
oh_valid_cols = pd.DataFrame(oh_valid_values, index=valid_X.index, columns=oh_column_labels)

# Removing categorical columns
num_X_train = train_X.drop(categorical_features, axis=1)
num_X_valid = valid_X.drop(categorical_features, axis=1)

# Adding one hot encoded columns with numerical columns
OH_X_train = pd.concat([num_X_train, oh_train_cols], axis=1)
OH_X_valid = pd.concat([num_X_valid, oh_valid_cols], axis=1)

***Whole data after One Hot Encoding***

In [10]:
print(OH_X_train.shape)
OH_X_train.head()

(240000, 70)


Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,46,47,48,49,50,51,52,53,54,55
154452,0.402449,0.718484,0.914274,0.398746,0.277709,0.398071,0.487289,0.869034,0.439229,0.352714,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
297220,0.554185,-0.04293,0.673048,0.47019,0.281611,0.388998,0.176158,1.000809,0.27151,0.440423,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
177,0.734878,0.415996,0.485986,0.554634,0.278146,0.403402,0.502907,0.324734,0.511943,0.328226,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
44603,1.005455,0.482119,0.073901,0.70036,0.686815,0.821791,0.538206,0.631913,0.405548,0.753464,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
158177,1.033922,0.35434,0.258197,0.185321,0.257302,0.748708,0.520747,0.828223,0.996511,0.926008,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Random Forest Regressor Model 1
**on whole data after preprocessing using One Hot Encoding**

In [11]:
# Fit and evaluate the first model on the one-hot encoded split
mae_score_1, predictions_1 = score_dataset(
    X_train=OH_X_train,
    X_valid=OH_X_valid,
    y_train=train_y,
    y_valid=valid_y,
)

# Report the predictions and the mean absolute error, one item per line
print(
    "Training Dataset results based on model_1 (whole data)",
    f"Predictions: {predictions_1}",
    f"MAE: {mae_score_1}",
    sep="\n",
)

Training Dataset results based on model_1 (whole data)
Predictions: [8.39288293 8.04871903 8.28682213 ... 8.04482317 8.09938769 8.16729721]
MAE: 0.5845162985059503


# Test data

In [12]:
# Discard test rows containing any missing value, then show the size and first rows
test_data = test_data.dropna(axis="index")
print(test_data.shape)
test_data.head(n=5)

(200000, 25)


Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,0,B,B,B,C,B,B,A,E,E,...,0.476739,0.37635,0.337884,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702
1,5,A,B,A,C,B,C,A,E,C,...,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.69394
2,15,B,A,A,A,B,B,A,E,D,...,0.697272,0.6836,0.404089,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099
3,16,B,B,A,C,B,D,A,E,A,...,0.719306,0.77789,0.730954,0.644315,1.024017,0.39109,0.98834,0.411828,0.393585,0.461372
4,17,B,B,A,C,B,C,A,E,C,...,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412


In [13]:
# Features for the final predictions: the test data without its identifier column
final_X = test_data.drop(columns=["id"])
final_X.head(n=5)

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,B,B,B,C,B,B,A,E,E,I,...,0.476739,0.37635,0.337884,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702
1,A,B,A,C,B,C,A,E,C,H,...,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.69394
2,B,A,A,A,B,B,A,E,D,K,...,0.697272,0.6836,0.404089,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099
3,B,B,A,C,B,D,A,E,A,N,...,0.719306,0.77789,0.730954,0.644315,1.024017,0.39109,0.98834,0.411828,0.393585,0.461372
4,B,B,A,C,B,C,A,E,C,F,...,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412


# Applying One Hot Encoding to the test data

In [14]:
# Apply the already-fitted one-hot encoder to the categorical columns of the final data (test data)
oh_final_values = oh_encoder.transform(final_X[categorical_features])

# Keep the test rows' index and label the encoded columns "0", "1", ... as strings:
# scikit-learn >= 1.2 raises a TypeError for frames whose column names mix
# integers with strings.
oh_final_cols = pd.DataFrame(
    oh_final_values,
    index=final_X.index,
    columns=[str(i) for i in range(oh_final_values.shape[1])],
)

# Removing categorical columns
num_X_final = final_X.drop(categorical_features, axis=1)

# Adding one hot encoded columns with numerical columns
OH_X_final = pd.concat([num_X_final, oh_final_cols], axis=1)

In [15]:
# Dimensions and first rows of the encoded test features
print(OH_X_final.shape)
OH_X_final.head(n=5)

(200000, 70)


Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,46,47,48,49,50,51,52,53,54,55
0,0.296227,0.686757,0.587731,0.392753,0.476739,0.37635,0.337884,0.321832,0.445212,0.290258,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.543707,0.364761,0.452967,0.929645,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.408961,0.296129,0.690999,0.740027,0.697272,0.6836,0.404089,0.879379,0.275549,0.427871,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.031239,0.356062,0.303651,0.895591,0.719306,0.77789,0.730954,0.644315,1.024017,0.39109,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.530447,0.729004,0.281723,0.444698,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Random Forest Regressor final model

In [16]:
# Random forest with 100 trees and a fixed seed, fitted on the encoded training split
final_model = RandomForestRegressor(random_state=0, n_estimators=100)
final_model.fit(X=OH_X_train, y=train_y)

RandomForestRegressor(random_state=0)

***Predicting the target using final_model***

In [17]:
# Predict the target for every row of the encoded test features
final_preds = final_model.predict(X=OH_X_final)

In [18]:
# Display the array of predicted target values
final_preds

array([7.88605983, 8.29970937, 8.42781575, ..., 8.45297864, 8.12792172,
       8.09799195])

# Submitting the final predictions

In [19]:
# Pair each test id with its predicted target and write the result to
# `submission.csv`, leaving the DataFrame index out of the file
output = pd.DataFrame({'Id': test_data["id"], 'target': final_preds})
output.to_csv('submission.csv', index=False)