In [3]:
## Importing libraries 
import eli5

import time
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("ggplot")
%matplotlib inline


import optuna
from optuna.integration import LightGBMPruningCallback
optuna.logging.set_verbosity(optuna.logging.WARNING)

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, RepeatedKFold

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.preprocessing import OrdinalEncoder

from sklearn.metrics import mean_squared_error

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector


from xgboost import DMatrix, XGBRegressor
from catboost import Pool, CatBoostRegressor
from lightgbm import LGBMRegressor, DaskLGBMRegressor

# Reading the Data

In [4]:
# Load the four competition tables from CSV files in the working directory:
# training data, test data, the sample submission and the variable descriptions.
train, test, sub, vard = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Train.csv",
        "Test.csv",
        "SampleSubmission (1).csv",
        "VariableDescription.csv",
    )
)

In [5]:
# Preview the first rows of the training table (the last column is `target`).
train.head()

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,ID_SYSJ2FM0D,2022.0,2022-02-03,59.0,,,,,,Sometimes,...,,,,,,,,,,51.5
1,ID_J5BTFOZR3,2019.0,,60.163933,,,,1st year in the programme,103.0,Sometimes,...,,,,,,,,,,55.869999
2,ID_R00SN7AUD,2022.0,2022-03-11,69.0,,,,,108.400002,Often,...,,,,,,,,,,47.52
3,ID_BSSK60PAZ,2021.0,2021-10-13,53.0,2020-01-15,20.0,No,1st year in the programme,98.099998,Almost always,...,,,,,,,,,,58.599998
4,ID_IZTY6TC4D,2021.0,2021-10-13,57.0,2021-10-13,0.0,,2nd year in programme,114.0,Almost always,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,76.599998


In [6]:
# Summarise the training table: row count, column count, dtypes and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8585 entries, 0 to 8584
Columns: 679 entries, child_id to target
dtypes: float64(160), object(519)
memory usage: 44.5+ MB


In [7]:
# Preview the first rows of the test table.
test.head()

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_4,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7
0,ID_0I0999N6S,2021.0,2021-09-20,57.0,,,Yes,2nd year in programme,108.0,Almost always,...,,,,,,,,,,
1,ID_GQ6ONJ4FP,2021.0,2021-10-21,54.0,2021-01-10,9.0,Yes,1st year in the programme,105.0,Almost always,...,,,,,,,,,,
2,ID_YZ76CVRW3,2021.0,2021-05-17,57.0,,,Yes,,101.5,Often,...,,,,,,,,,,
3,ID_BNINCRXH8,2022.0,2022-09-09,59.334702,,,,3rd year in programme,,Almost always,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,ID_1U7GDTLRI,2021.0,2021-10-12,54.0,2021-01-15,8.0,Yes,1st year in the programme,103.5,Often,...,,,,,,,,,,


In [8]:
# Summarise the test table: row count, column count, dtypes and memory use.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3680 entries, 0 to 3679
Columns: 678 entries, child_id to obs_heating_7
dtypes: float64(161), object(517)
memory usage: 19.0+ MB


# Checking for Missing values

In [9]:
# Per-column count of missing values in the training table, shown next to its shape.
train.isnull().sum(), train.shape

(child_id                   0
 data_year                  0
 child_date              1821
 child_age                  0
 child_enrolment_date    5964
                         ... 
 obs_heating_4           6324
 obs_heating_5           6324
 obs_heating_6           6324
 obs_heating_7           6324
 target                     0
 Length: 679, dtype: int64,
 (8585, 679))

In [10]:
# Per-column count of missing values in the test table, shown next to its shape.
test.isnull().sum(), test.shape

(child_id                   0
 data_year                  0
 child_date               754
 child_age                  0
 child_enrolment_date    2568
                         ... 
 obs_heating_3           2712
 obs_heating_4           2712
 obs_heating_5           2712
 obs_heating_6           2712
 obs_heating_7           2712
 Length: 678, dtype: int64,
 (3680, 678))

In [13]:
# Screen the columns of `train` (skipping the first one, child_id) and sort the
# survivors into three lists:
#   features      - every column with fewer than 6000 missing values
#   cat_features  - the object-dtype subset of `features`
#   not_features  - the remaining (non-object) subset of `features`
features, cat_features, not_features = [], [], []
for col in train.columns[1:]:
    # Too sparse to keep: 6000 or more rows are missing a value.
    if train[col].isnull().sum() >= 6000:
        continue
    features.append(col)
    if train[col].dtype != 'O':
        not_features.append(col)
        continue
    cat_features.append(col)
    # value_counts() skips NaN, so this is the number of distinct non-null values.
    n_classes = len(train[col].value_counts())
    print(f'There is {n_classes} Class in: {col}')

print('Features: ', len(features))
print('Categorical Features: ', len(cat_features))
print('Numerical Features: ', len(not_features))

There is 254 Class in: child_date
There is 535 Class in: child_enrolment_date
There is 4 Class in: child_grant
There is 4 Class in: child_years_in_programme
There is 4 Class in: child_observe_attentive
There is 4 Class in: child_observe_concentrated
There is 4 Class in: child_observe_diligent
There is 4 Class in: child_observe_interested
There is 2 Class in: child_gender
There is 1018 Class in: child_dob
There is 3 Class in: child_stunted
There is 4 Class in: child_age_group
There is 153 Class in: id_mn_best
There is 10 Class in: prov_best
There is 50 Class in: id_dc_best
There is 50 Class in: dc_best
There is 153 Class in: mn_best
There is 5 Class in: pra_free_play
There is 4 Class in: pra_free_play_outdoor
There is 31 Class in: pra_groupings
There is 2 Class in: pra_groupings_1
There is 2 Class in: pra_groupings_2
There is 2 Class in: pra_groupings_3
There is 2 Class in: pra_groupings_4
There is 2 Class in: pra_groupings_5
There is 3 Class in: pra_engaged
There is 4 Class in: pra_age

In [14]:
# Restrict both tables to the screened columns. `features` ends with the label
# column (`target` is the last column of `train` and has no missing values);
# `test` has no label, hence the [:-1] on the test side.
df_train = train.loc[:, features]
df_test = test.loc[:, features[:-1]]

# Filling missing values

In [16]:
# Fill missing values in the categorical columns only.
#
# CatBoost rejects NaN in categorical features, so those need a string
# placeholder ('' becomes its own category). Numeric columns are deliberately
# left as NaN: CatBoost already treats NaN as a missing value, whereas a
# frame-wide fillna('') would turn every numeric column that has gaps into a
# mixed float/str object column, which is slower to ingest and unusable for
# numeric operations or other models.
#
# Passing a dict to fillna fills only the listed columns and returns a new
# frame, so this cell can safely be re-run.
cat_fill_values = {col: '' for col in cat_features}
df_train = df_train.fillna(cat_fill_values)
df_test = df_test.fillna(cat_fill_values)

In [18]:
# Hold out 15% of the training rows for validation. The label is the last
# column of df_train; every column before it is a predictor. The fixed seed
# keeps the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.iloc[:, :-1],
    df_train.iloc[:, -1],
    test_size=0.15,
    shuffle=True,
    random_state=42,
)

In [20]:
# Sanity check: shapes of the training/validation predictors and labels.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7297, 276), (1288, 276), (7297,), (1288,))

In [None]:
# Train a CatBoost regressor on the training split, using the held-out split
# for early stopping, then predict on both the held-out split and the test set.

# Wrap each frame in a Pool so CatBoost knows which columns are categorical.
# df_test is the frame whose missing values were filled earlier in the
# notebook; it must not be rebuilt from the raw `test` table here, because
# CatBoost rejects NaN in categorical features.
train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_features)
val_dataset = Pool(data=X_test, label=y_test, cat_features=cat_features)
test_dataset = Pool(data=df_test, cat_features=cat_features)

# 30000 is only an upper bound on the number of trees: training stops once the
# validation metric has not improved for 500 rounds.
model = CatBoostRegressor(iterations=30000,
                          learning_rate=0.15,
                          random_seed=1234,
                          verbose=300)

model.fit(train_dataset,
          eval_set=val_dataset,
          use_best_model=True,       # keep the iteration with the best validation score
          early_stopping_rounds=500,
          )

preds_valid = model.predict(val_dataset)
preds_test = model.predict(test_dataset)

# Validation RMSE. Taking the square root explicitly avoids the `squared=False`
# argument, which is deprecated and removed in recent scikit-learn releases.
print(np.sqrt(mean_squared_error(y_test, preds_valid)))