# Import libraries

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import dump
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load data

In [8]:
dataset_train=pd.read_csv('internship_train.csv')
dataset_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,target
0,236,488,16,221,382,97,-4.472136,0.107472,0,132,...,13.340874,0.870542,1.962937,7.466666,11.547794,8.822916,9.046424,7.895535,11.010677,20.107472
1,386,206,357,232,1,198,7.81025,0.763713,1,143,...,12.484882,7.16868,2.885415,12.413973,10.260494,10.091351,9.270888,3.173994,13.921871,61.763713
2,429,49,481,111,111,146,8.602325,0.651162,1,430,...,14.030257,0.39497,8.160625,12.592059,8.937577,2.265191,11.255721,12.794841,12.080951,74.651162
3,414,350,481,370,208,158,8.306624,0.424645,1,340,...,2.789577,6.416708,10.549814,11.456437,6.468099,2.519049,0.258284,9.317696,5.383098,69.424645
4,318,359,20,218,317,301,8.124038,0.767304,1,212,...,1.88656,1.919999,2.268203,0.149421,4.105907,10.416291,6.816217,8.58696,4.512419,66.767304


In [10]:
dataset_test=pd.read_csv('internship_hidden_test.csv')
dataset_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,51,52
0,259,388,402,340,156,382,3.316625,0.21876,1,164,...,7.163474,10.227235,0.889111,9.190222,4.670908,0.931525,7.634177,0.158196,13.432551,2.511191
1,441,192,381,421,452,123,-8.888194,0.700228,0,349,...,10.619064,9.339161,4.221861,0.595563,13.865748,0.328453,11.507599,9.107966,1.302407,11.105604
2,83,129,107,156,247,191,2.645751,0.494556,1,297,...,1.81989,10.670237,10.350867,5.134417,5.898995,8.374986,4.638049,3.160023,2.243799,5.07303
3,183,438,191,116,491,403,-6.164414,0.511117,0,272,...,4.168135,12.782579,0.513072,0.321295,11.334062,11.735511,1.91152,8.365676,4.877288,11.601819
4,449,156,310,188,279,465,8.0,0.756416,1,90,...,7.908122,1.140498,14.165074,7.688796,5.079241,8.152186,1.680403,13.215111,5.823109,1.038015


Check for missing data

In [11]:
print("Presence of missing data in the training sample (True/False):", dataset_train.isnull().values.any())

Presence of missing data in the training sample (True/False): False


Data normalization using StandardScaler

In [12]:
scaler = StandardScaler()

In [13]:
X_data_train, y_train = dataset_train.loc[:, "0":"52"], dataset_train.loc[:, "target"].values
X_scaled_train = pd.DataFrame(scaler.fit_transform(X_data_train))
X_scaled_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,51,52
0,-0.093005,1.650892,-1.614340,-0.199167,0.920961,-1.057700,-0.633795,-1.354700,-0.998379,-0.814885,...,0.603259,1.354549,-1.539604,-1.280193,-0.001535,0.936529,0.311898,0.355316,0.094426,0.807135
1,0.946240,-0.307151,0.751961,-0.122928,-1.725961,-0.357907,1.111325,0.918543,1.001624,-0.738819,...,0.709447,1.156878,-0.082212,-1.067235,1.139537,0.639375,0.605313,0.407199,-0.997298,1.478992
2,1.244156,-1.397266,1.612434,-0.961551,-0.961758,-0.718197,1.223865,0.528661,1.001624,1.245803,...,0.637462,1.513746,-1.649652,0.150570,1.180611,0.334000,-1.205037,0.865978,1.227252,1.054137
3,1.140232,0.692701,1.612434,0.833518,-0.287870,-0.635053,1.181851,-0.256000,1.001624,0.623448,...,-0.144989,-1.082023,-0.256219,0.702124,0.918686,-0.236042,-1.146314,-1.675994,0.423260,-0.491620
4,0.475115,0.755191,-1.586582,-0.219959,0.469386,0.355743,1.155909,0.930981,1.001624,-0.261680,...,0.842910,-1.290554,-1.296760,-1.209721,-1.689221,-0.781319,0.680478,-0.160179,0.254298,-0.692559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,0.544398,-1.431983,0.495207,0.791933,-1.684277,-1.223988,0.697682,0.226305,1.001624,0.450571,...,-0.522134,-1.664700,-0.309631,0.048119,0.616132,-1.333998,-0.838507,1.130909,1.551124,-1.186716
89996,-0.224643,1.539797,-0.517930,0.535495,0.038654,0.965464,-0.510668,1.257937,-0.998379,-0.731904,...,1.668388,-0.537619,-0.766931,1.610064,-1.406540,-0.254933,0.609067,1.185458,-0.210603,-0.150028
89997,-0.418635,1.081532,-0.247298,-1.335813,1.699059,-0.413336,0.877479,0.468187,1.001624,-0.199444,...,-0.050014,1.715240,-1.073103,0.271201,-0.680629,-1.369830,0.808836,0.065418,0.216300,-1.032045
89998,1.167945,1.581458,-0.594263,-1.488290,0.976539,-1.196273,-0.510668,0.653874,-0.998379,0.270780,...,1.485218,1.216491,-1.156621,0.817628,-0.203747,1.184538,0.722242,-1.586011,0.291156,-1.446239


In [14]:
X_data_test = dataset_test.loc[:, "0":"52"]
X_scaled_test = pd.DataFrame(scaler.fit_transform(X_data_test))
X_scaled_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,51,52
0,0.073639,0.951431,1.063198,0.625019,-0.660795,0.914640,0.482925,-1.002767,1.015724,-0.572814,...,-0.059678,0.633722,-1.529334,0.402090,-0.642416,-1.525417,0.028520,-1.674813,1.366861,-1.163416
1,1.336499,-0.401757,0.918155,1.187487,1.384278,-0.875636,-1.252706,0.682667,-0.984520,0.722297,...,0.737456,0.428223,-0.761469,-1.583440,1.491567,-1.664136,0.918263,0.376481,-1.429519,0.839376
2,-1.147589,-0.836711,-0.974306,-0.652687,-0.032073,-0.405602,0.387521,-0.037312,1.015724,0.358265,...,-1.292334,0.736233,0.650652,-0.534879,-0.357396,0.186742,-0.659704,-0.986792,-1.212498,-0.566420
3,-0.453710,1.296632,-0.394135,-0.930449,1.653731,1.059798,-0.865360,0.020661,-0.984520,0.183250,...,-0.750642,1.225026,-1.615973,-1.646802,0.904001,0.959736,-1.286000,0.206348,-0.605396,0.955011
4,1.392009,-0.650302,0.427773,-0.430477,0.189016,1.488358,1.148942,0.879359,1.015724,-1.090859,...,0.112097,-1.468937,1.529444,0.055232,-0.547648,0.135493,-1.339089,1.317842,-0.387355,-1.506717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.190035,-0.028940,0.455400,1.083326,1.467187,0.465343,0.965237,-0.239898,1.015724,-0.285790,...,-1.418639,1.114865,0.057912,-0.404566,0.599773,-0.598915,-0.793398,-0.647035,-0.684594,-0.174567
9996,-0.668812,-0.643398,-1.492315,-0.812400,0.810829,0.078256,0.815726,1.683773,1.015724,1.478361,...,-1.174386,1.316193,-0.191113,-0.769406,-1.218161,1.608510,0.063577,0.446654,-0.481861,-0.785225
9997,0.698130,-0.070364,0.738579,-1.062386,-1.096064,-0.509286,0.678290,1.138848,1.015724,0.288259,...,1.614955,-1.514980,-0.915046,1.183358,-0.088323,1.250470,-1.288227,-0.542141,-1.384329,0.137523
9998,1.239356,-1.244048,-1.278205,0.104215,-1.310244,-0.689005,0.580107,-0.010075,1.015724,-0.852838,...,-1.445794,-1.364045,0.547270,-1.132236,0.905196,1.524854,1.523445,0.573355,0.238739,-0.108896


# Create classifier

Creating training and validation sets (80:20 split).

In [15]:
train_X, val_X, train_y, val_y = train_test_split(X_scaled_train, y_train, test_size=0.2, random_state=0)

In [16]:
def check_metric_rmse(y_true, y_pred):
  return np.sqrt(mean_squared_error(y_true, y_pred))

# Use a regression classifier (Decision Tree Regressor)

In [17]:
decision_tree_regressor = DecisionTreeRegressor(random_state=0)
decision_tree_regressor.fit(train_X, train_y)

val_predictions = decision_tree_regressor.predict(val_X)
val_rmse_DTR =  '{:.6f}'.format(check_metric_rmse(val_y, val_predictions))
print("Validation RMSE:", val_rmse_DTR)

Validation RMSE: 0.007868


# Use a regression classifier (Random Forest Regressor)

In [18]:
random_forest_regressor = RandomForestRegressor()
random_forest_regressor.fit(train_X, train_y)

val_predictions = random_forest_regressor.predict(val_X)
val_rmse_RFR =  '{:.6f}'.format(check_metric_rmse(val_y, val_predictions))
print("Validation RMSE:", val_rmse_RFR)

Validation RMSE: 0.003822


In [24]:
dump(random_forest_regressor, 'random_forest_regressor_model.joblib')

['random_forest_regressor_model.joblib']

# Сonclusion

In [19]:
df = pd.DataFrame({'val_rmse': [val_rmse_DTR, val_rmse_RFR]}, 
                  index=['Decision Tree Regressor', 'Random Forest Regressor'])

header_style = {
    'selector': 'th',
    'props': [('background-color', 'lightgray'), ('border', '1px solid white')]
}
cell_style = {
    'selector': 'td',
    'props': [('border', '2px solid white')]
}

styled_table = df.style.set_table_styles([header_style, cell_style])
styled_table

Unnamed: 0,val_rmse
Decision Tree Regressor,0.007868
Random Forest Regressor,0.003822


**Decision Tree Regressor** builds a decision tree that splits the data into several levels according to certain rules.

**Random Forest Regressor** is an ensemble model that uses multiple Decision Tree Regressor models to make better predictions.

Since we have a large dataset, it is ***better to use Random Forest Regressor***. This can be seen from the results of the *RMSE* metric.