In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file beneath the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/30-days-of-ml/sample_submission.csv
/kaggle/input/30-days-of-ml/train.csv
/kaggle/input/30-days-of-ml/test.csv


## Loading all the necessary libraries

In [2]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

## Loading data

In [3]:
# Read the competition training set, then take a separate copy to work on so
# the frame as loaded from disk is left untouched.
TRAIN_CSV = "../input/30-days-of-ml/train.csv"
train_data = pd.read_csv(TRAIN_CSV)
train_data_copy = train_data.copy()

## Splitting the train data

In [4]:
# Separate the regression target from the remaining columns.
# NOTE(review): X keeps every non-target column; if the file carries an `id`
# column (the test file does), it reaches the model as a feature — confirm
# that is intended.
y = train_data_copy.target
X = train_data_copy.drop('target', axis=1)

# Hold out a validation set using train_test_split's default split sizes.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same validation score); 42 matches the seed passed
# to the model.
train_X, valid_X, train_y, valid_y = train_test_split(X, y, random_state=42)

## Applying one-hot (OH) encoding to the categorical columns

In [5]:
# Every column whose name starts with 'cat' is treated as categorical.
cat_cols = [col for col in train_X.columns if col.startswith('cat')]

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back to
# the old one on releases that do not recognise it; either way the encoder
# returns a dense array and ignores categories it did not see during fit.
try:
    my_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    my_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only, then reuse the fitted encoder on validation.
# The encoder output carries no row labels, so each split's own index is
# attached to let the concat below align rows with the numeric columns.
train_X_cat_encode = pd.DataFrame(my_encoder.fit_transform(train_X[cat_cols]),
                                  index=train_X.index)
valid_X_cat_encode = pd.DataFrame(my_encoder.transform(valid_X[cat_cols]),
                                  index=valid_X.index)

# Everything that is not categorical is passed through unchanged.
train_X_num = train_X.drop(cat_cols, axis=1)
valid_X_num = valid_X.drop(cat_cols, axis=1)

# ignore_index=True with axis=1 relabels the resulting columns 0..n-1, so the
# model sees positional column labels rather than the original names.
OH_train_data = pd.concat([train_X_num, train_X_cat_encode], axis=1, ignore_index=True, verify_integrity=True)
OH_valid_data = pd.concat([valid_X_num, valid_X_cat_encode], axis=1, ignore_index=True, verify_integrity=True)

## Defining our model

In [6]:
# Hyper-parameters for the gradient-boosted regressor, gathered in one mapping
# so they can be read at a glance before the estimator is built.
xgb_settings = {
    'random_state': 42,
    'n_jobs': 4,
    'n_estimators': 5000,
    'learning_rate': 0.01,
    'tree_method': 'gpu_hist',  # GPU tree builder; needs a GPU-enabled session
    'eval_metric': 'rmse',
}
myModel = XGBRegressor(**xgb_settings)

In [7]:
# RMSE is tracked on both the training and the validation split while fitting;
# early stopping is requested with a patience of 10 rounds, and per-round
# logging is switched off.
evaluation_sets = [(OH_train_data, train_y), (OH_valid_data, valid_y)]
myModel.fit(
    OH_train_data,
    train_y,
    eval_set=evaluation_sets,
    eval_metric="rmse",
    early_stopping_rounds=10,
    verbose=False,
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
             gamma=0, gpu_id=0, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=5000, n_jobs=4,
             num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='gpu_hist',
             validate_parameters=1, verbosity=None)

## Making predictions on Validation Data

In [8]:
# Predict the target for the held-out validation rows with the fitted model.
pred_valid = myModel.predict(OH_valid_data)

## Checking the RMSE

In [9]:
# Report the validation RMSE. The square root is taken explicitly with NumPy
# rather than via squared=False, because that keyword is deprecated in
# scikit-learn 1.4 and removed in 1.6; sqrt(MSE) yields the same number on
# every release.
print(np.sqrt(mean_squared_error(valid_y, pred_valid)))

0.7249795982908432


## Making predictions on Test Data

In [10]:
# Read the competition test set that the submission is generated for.
TEST_CSV = "../input/30-days-of-ml/test.csv"
test_data = pd.read_csv(TEST_CSV)

In [11]:
# Alias only: no copy is made, so test_X and test_data name the same DataFrame.
test_X = test_data

In [12]:
# Prepare the test features the same way as the training features: numeric
# columns pass through, categorical columns go through the already-fitted
# encoder (transform only, no refit), and the two parts are joined side by side.
test_X_num = test_X.drop(cat_cols, axis=1)
test_X_cat_encode = pd.DataFrame(
    my_encoder.transform(test_X[cat_cols]),
    index=test_X.index,  # restore row labels so the concat aligns rows
)
OH_test_data = pd.concat(
    [test_X_num, test_X_cat_encode],
    axis=1,
    ignore_index=True,
    verify_integrity=True,
)

In [13]:
# Predict the target for the encoded test rows with the fitted model.
pred_test = myModel.predict(OH_test_data)

## Preparing output file for submission

In [14]:
# Pair each test-row id with its prediction and write the submission file.
# The headers use the dataset's own lowercase names (`id` as in the test file,
# `target` as in the training file) instead of 'Id'/'Target', since a
# submission's header is expected to match sample_submission.csv.
# NOTE(review): confirm the exact header against that file.
output = pd.DataFrame({'id': test_X.id,
                       'target': pred_test})
# index=False keeps the DataFrame's row index out of the CSV.
output.to_csv('submission.csv', index=False)