## Importing required modules

In [1]:
# importing required modules
import math
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score

## Loading data

In [2]:
# Read the train, test, and sample-submission CSV files from the
# competition input directory in a single tuple assignment.
train_data, test_data, sample_subm_data = (
    pd.read_csv("../input/30-days-of-ml/train.csv"),
    pd.read_csv("../input/30-days-of-ml/test.csv"),
    pd.read_csv("../input/30-days-of-ml/sample_submission.csv"),
)

### Columns

In [3]:
# Show the column labels of the training frame.
train_data.columns

Index(['id', 'cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
       'cat8', 'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5',
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
       'cont13', 'target'],
      dtype='object')

## Prediction target and features

In [4]:
# Column groups used by the preprocessing step further down.
categorical_features = [
    'cat0', 'cat1', 'cat2', 'cat3', 'cat4',
    'cat5', 'cat6', 'cat7', 'cat8', 'cat9',
]
numerical_features = [
    'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
    'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13',
]

# Prediction target: the `target` column of the training frame.
y = train_data['target']

# Feature matrix: every training column except the row identifier and the target.
X = train_data.drop(columns=['id', 'target'])

In [5]:
# Print the feature matrix's (rows, columns) shape, then preview it with head().
print(X.shape)
X.head()

(300000, 24)


Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,B,B,B,C,B,B,A,E,C,N,...,0.610706,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985
1,B,B,A,A,B,D,A,F,A,O,...,0.276853,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083
2,A,A,A,C,B,D,A,D,A,F,...,0.285074,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846
3,B,B,A,C,B,D,A,E,C,K,...,0.284667,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682
4,A,A,A,C,B,D,A,E,A,N,...,0.287595,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823


## Splitting the data into training and validation sets

In [6]:
# Keep 95% of the rows for training and hold out 5% for validation;
# the fixed random_state makes the split repeatable across runs.
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    train_size=0.95,
    test_size=0.05,
    random_state=0,
)

## Pipeline of Random Forest Regression model

In [7]:
# Preprocessing for numerical columns: impute missing values with a constant
# (no fill_value is passed, so the imputer's default constant is used).
numerical_transformer = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="constant")),
    ]
)

# Preprocessing for categorical columns: one-hot encode into a dense array;
# handle_unknown="ignore" avoids errors on categories not seen during fit.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before upgrading.
categorical_transformer = Pipeline(
    steps=[
        ("oh_encoder", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)

# Route each feature group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [8]:
# Random forest regressor with 200 trees; the fixed seed keeps runs repeatable.
training_model_1 = RandomForestRegressor(n_estimators=200, random_state=0)

# Chain the preprocessing step and the regressor into a single estimator so the
# same transformations are applied at fit time and at predict time.
my_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("training_model_1", training_model_1),
    ]
)

# Fit on the training split. As the cell's last expression, the fitted
# pipeline is also displayed.
my_pipeline.fit(train_X, train_y)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('num_imputer',
                                                                   SimpleImputer(strategy='constant'))]),
                                                  ['cont0', 'cont1', 'cont2',
                                                   'cont3', 'cont4', 'cont5',
                                                   'cont6', 'cont7', 'cont8',
                                                   'cont9', 'cont10', 'cont11',
                                                   'cont12', 'cont13']),
                                                 ('cat',
                                                  Pipeline(steps=[('oh_encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=Fal

### Predicting the target of validation data & finding MAE, RMSE

In [9]:
# Predict on the held-out validation rows; the pipeline applies its own
# preprocessing step before the regressor runs.
predictions_1 = my_pipeline.predict(valid_X)

# Mean absolute error on the validation split.
mae_1 = mean_absolute_error(y_true=valid_y, y_pred=predictions_1)

# Root mean squared error: square root of the validation MSE.
rmse_1 = math.sqrt(
    mean_squared_error(y_true=valid_y, y_pred=predictions_1)
)

In [10]:
# Emit the validation report as four lines from a single print call.
print(
    "Following scores using above Pipeline",
    f"Predictions: {predictions_1}",
    f"MAE: {mae_1}",
    f"RMSE: {rmse_1}",
    sep="\n",
)

Following scores using above Pipeline
Predictions: [8.14181008 8.04930962 8.34919442 ... 8.26953067 8.2554711  8.28503139]
MAE: 0.5802834364932775
RMSE: 0.7351896036194409


## Test data

In [11]:
# Print the test frame's (rows, columns) shape, then preview it with head().
print(test_data.shape)
test_data.head()

(200000, 25)


Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,0,B,B,B,C,B,B,A,E,E,...,0.476739,0.37635,0.337884,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702
1,5,A,B,A,C,B,C,A,E,C,...,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.69394
2,15,B,A,A,A,B,B,A,E,D,...,0.697272,0.6836,0.404089,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099
3,16,B,B,A,C,B,D,A,E,A,...,0.719306,0.77789,0.730954,0.644315,1.024017,0.39109,0.98834,0.411828,0.393585,0.461372
4,17,B,B,A,C,B,C,A,E,C,...,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412


In [12]:
# Test features: the test frame without its row-identifier column.
final_X = test_data.drop(columns=["id"])

# Preview the resulting frame.
final_X.head()

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,B,B,B,C,B,B,A,E,E,I,...,0.476739,0.37635,0.337884,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702
1,A,B,A,C,B,C,A,E,C,H,...,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.69394
2,B,A,A,A,B,B,A,E,D,K,...,0.697272,0.6836,0.404089,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099
3,B,B,A,C,B,D,A,E,A,N,...,0.719306,0.77789,0.730954,0.644315,1.024017,0.39109,0.98834,0.411828,0.393585,0.461372
4,B,B,A,C,B,C,A,E,C,F,...,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412


### Predicting the target on final data

In [13]:
# Run the fitted pipeline on the test features; the pipeline applies its own
# preprocessing step before the regressor predicts.
final_predictions = my_pipeline.predict(final_X)

In [14]:
# Display the predictions array for a quick visual check.
final_predictions

array([7.94429968, 8.30837759, 8.38403064, ..., 8.38763922, 8.06077985,
       8.15546867])

## Submitting the final predictions

In [15]:
# Build the submission frame: one row per test record, pairing its identifier
# with the predicted target.
# The identifier header is written as lowercase `id`, the label the loaded data
# uses for that column; the previous 'Id' header did not match it.
# NOTE(review): confirm the expected header against sample_submission.csv.
output = pd.DataFrame({
    'id': test_data['id'],
    'target': final_predictions,
})

# Write `submission.csv` without the DataFrame index so the file contains only
# the two submission columns.
output.to_csv('submission.csv', index=False)