## Importing required modules

In [1]:
# importing required modules
import math
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

## Loading data

In [2]:
# Directory holding the competition CSV files (Kaggle-style ../input layout).
# Defined once so the notebook can be pointed at another copy of the data in one place.
DATA_DIR = "../input/30-days-of-ml"

# train.csv carries the `target` column to learn from; test.csv holds the rows to predict.
train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")
# sample_submission.csv (same directory) is not loaded: the submission frame is built
# directly from the test ids and the predictions at the end of the notebook.

## Training data Columns

## Setting features and target

In [3]:
# Column groups: ten categorical columns (cat0..cat9) and fourteen continuous
# columns (cont0..cont13).
cat_features = [f"cat{i}" for i in range(10)]
numerical_features = [f"cont{i}" for i in range(14)]

# Prediction target, and the feature matrix with `id` and `target` left out.
y = train_data["target"]
X = train_data.drop(columns=["id", "target"])

## Splitting the data into training and validation sets

In [4]:
# Hold out 5% of the rows for validation; the fixed random_state makes the
# split identical on every run.
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    train_size=0.95,
    test_size=0.05,
    random_state=0,
)

## Applying One Hot Encoding on categorical data

In [5]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# split only; categories it has not seen are encoded as all zeros
# (handle_unknown="ignore") instead of raising.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the current
# spelling first and fall back to the old one on older installations.
try:
    oh_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only knows the old keyword
    oh_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
oh_encoder.fit(train_X[cat_features])

# transform() returns a plain array, so the original row index is re-attached;
# otherwise pd.concat below would align on a fresh 0..n-1 index and mix up rows.
# The encoded columns keep their default integer labels (0, 1, 2, ...).
oh_train_cols = pd.DataFrame(oh_encoder.transform(train_X[cat_features]), index=train_X.index)
oh_valid_cols = pd.DataFrame(oh_encoder.transform(valid_X[cat_features]), index=valid_X.index)

# Removing the raw categorical columns, which are now represented by the encoded ones
num_X_train = train_X.drop(cat_features, axis=1)
num_X_valid = valid_X.drop(cat_features, axis=1)

# Adding one hot encoded columns to the remaining numerical columns
X_train = pd.concat([num_X_train, oh_train_cols], axis=1)
X_valid = pd.concat([num_X_valid, oh_valid_cols], axis=1)

## XGB Regressor model

In [6]:
# Gradient-boosted regressor with a deliberately large tree budget; training
# stops early once the score on the validation split has not improved for
# 5 consecutive rounds.
model_1 = XGBRegressor(n_estimators=10000, learning_rate=0.05, n_jobs=4)
try:
    model_1.fit(X_train, train_y, early_stopping_rounds=5, eval_set=[(X_valid, valid_y)], verbose=False)
except TypeError:
    # Newer xgboost releases no longer accept `early_stopping_rounds` in fit();
    # there it is an estimator parameter, so set it on the model and refit.
    model_1.set_params(early_stopping_rounds=5)
    model_1.fit(X_train, train_y, eval_set=[(X_valid, valid_y)], verbose=False)

# Display the fitted estimator as the cell output.
model_1

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=10000, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [7]:
# Score the model on the hold-out split. The variables keep their `training_`
# prefix, but X_valid / valid_y are the validation rows, not the rows the model
# was fitted on.
# NOTE: the same split was used as the early-stopping eval_set, so these scores
# are likely a little optimistic compared with truly unseen data.
training_predictions_1 = model_1.predict(X_valid)
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
training_mae_1 = mean_absolute_error(valid_y, training_predictions_1)
training_rmse_1 = math.sqrt(mean_squared_error(valid_y, training_predictions_1))

print("XGB Model scores on validation data:")
print(f"Predictions: {training_predictions_1}")
print(f"MAE: {training_mae_1}")
print(f"RMSE: {training_rmse_1}")

XGB Model scores on training data:
Predictions: [8.217206 7.840215 8.138892 ... 8.170606 8.277083 8.323044]
MAE: 0.5732343367413175
RMSE: 0.725559368169386


## Test data

In [8]:
# Quick look at the test set: its (rows, columns) shape, then the first five rows.
print(test_data.shape)
test_data.head(n=5)

(200000, 25)


Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,0,B,B,B,C,B,B,A,E,E,...,0.476739,0.37635,0.337884,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702
1,5,A,B,A,C,B,C,A,E,C,...,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.69394
2,15,B,A,A,A,B,B,A,E,D,...,0.697272,0.6836,0.404089,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099
3,16,B,B,A,C,B,D,A,E,A,...,0.719306,0.77789,0.730954,0.644315,1.024017,0.39109,0.98834,0.411828,0.393585,0.461372
4,17,B,B,A,C,B,C,A,E,C,...,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412


In [9]:
# Feature frame for the final predictions: every test column except `id`.
final_X = test_data.drop(columns=["id"])
final_X.head(n=5)

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,B,B,B,C,B,B,A,E,E,I,...,0.476739,0.37635,0.337884,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702
1,A,B,A,C,B,C,A,E,C,H,...,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.69394
2,B,A,A,A,B,B,A,E,D,K,...,0.697272,0.6836,0.404089,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099
3,B,B,A,C,B,D,A,E,A,N,...,0.719306,0.77789,0.730954,0.644315,1.024017,0.39109,0.98834,0.411828,0.393585,0.461372
4,B,B,A,C,B,C,A,E,C,F,...,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412


### Applying One Hot Encoding to categorical columns for test data

In [10]:
# Encode the test set's categorical columns with the encoder that was already
# fitted on the training split (transform only, no refit), keeping the test
# frame's row index so the concat below lines up row by row.
oh_final_cols = pd.DataFrame(
    oh_encoder.transform(final_X[cat_features]),
    index=final_X.index,
)

# Everything that is not a categorical column
num_X_final = final_X.drop(columns=cat_features)

# Remaining columns followed by the one-hot encoded columns, side by side
X_final = pd.concat([num_X_final, oh_final_cols], axis=1)

### Predicting the target on final data

In [11]:
# Predict the target for every test row with the fitted XGB model.
final_predictions = model_1.predict(X_final)
# Bare expression so the notebook displays the prediction array.
final_predictions

array([8.01815 , 8.342946, 8.375421, ..., 8.418993, 8.070883, 8.05259 ],
      dtype=float32)

## Submitting the final predictions

In [12]:
# Writing the predicted target to `submission.csv`: one row per test id.
# The id column uses the same lowercase `id` header as the competition's test
# file (the previous `Id` spelling did not match it).
# NOTE(review): presumably the scorer expects exactly `id,target` - confirm
# against sample_submission.csv.
output = pd.DataFrame({'id': test_data['id'], 'target': final_predictions})
output.to_csv('submission.csv', index=False)