In [79]:
## Importing libraries 
import eli5

import time
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("ggplot")
%matplotlib inline


import optuna
from optuna.integration import LightGBMPruningCallback
optuna.logging.set_verbosity(optuna.logging.WARNING)

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, RepeatedKFold

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.preprocessing import OrdinalEncoder

from sklearn.metrics import mean_squared_error

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector


from xgboost import DMatrix, XGBRegressor
from catboost import Pool, CatBoostRegressor
from lightgbm import LGBMRegressor, DaskLGBMRegressor

## Reading the data

In [80]:
train = pd.read_csv("Train.csv")

test = pd.read_csv("Test.csv")
sub = pd.read_csv("SampleSubmission (1).csv")
vard = pd.read_csv("VariableDescription.csv")

In [81]:
train.head()

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,ID_SYSJ2FM0D,2022.0,2022-02-03,59.0,,,,,,Sometimes,...,,,,,,,,,,51.5
1,ID_J5BTFOZR3,2019.0,,60.163933,,,,1st year in the programme,103.0,Sometimes,...,,,,,,,,,,55.869999
2,ID_R00SN7AUD,2022.0,2022-03-11,69.0,,,,,108.400002,Often,...,,,,,,,,,,47.52
3,ID_BSSK60PAZ,2021.0,2021-10-13,53.0,2020-01-15,20.0,No,1st year in the programme,98.099998,Almost always,...,,,,,,,,,,58.599998
4,ID_IZTY6TC4D,2021.0,2021-10-13,57.0,2021-10-13,0.0,,2nd year in programme,114.0,Almost always,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,76.599998


In [82]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8585 entries, 0 to 8584
Columns: 679 entries, child_id to target
dtypes: float64(160), object(519)
memory usage: 44.5+ MB


In [83]:
df_train = train.copy()
df_test = test.copy()

In [84]:
# train.columns[train.columns.str.contains("observe")].to_list()
observe_columns =['child_observe_attentive', 'child_observe_concentrated',
               'child_observe_diligent', 'child_observe_interested',]

mapper_observe = {'Sometimes':1,
         'Almost never':0,
         'Often':2,
         'Almost always':3}

In [85]:
social_columns = ['teacher_social_initiative', 'teacher_social_peers',
                'teacher_social_nonaggressive',  'teacher_social_cooperate', 
                'teacher_social_assistance', 'teacher_social_ideas', 'teacher_social_initiative']

mapper_social = {'None of the time': 0,
                 'A little of the time': 1,
                 'Most of the time': 2,
                 'All of the time': 3}

In [86]:
# Function to map values to the columns
def mapper(df, map_columns, map_values):
    for col in map_columns:
        df[col] = df[col].map(map_values)
    return df

In [87]:
# Mapping values
df_train = mapper(df_train, observe_columns, mapper_observe)
df_test = mapper(df_test, observe_columns, mapper_observe)

df_train = mapper(df_train, social_columns, mapper_social)
df_test = mapper(df_test, social_columns, mapper_social)

In [88]:
df_train.columns[df_train.columns.str.contains("child")]

Index(['child_id', 'child_date', 'child_age', 'child_enrolment_date',
       'child_months_enrolment', 'child_grant', 'child_years_in_programme',
       'child_height', 'child_observe_attentive', 'child_observe_concentrated',
       'child_observe_diligent', 'child_observe_interested',
       'child_observe_total', 'child_gender', 'child_dob', 'child_zha',
       'child_stunted', 'child_attends', 'child_attendance', 'child_languages',
       'child_age_group', 'pri_children_4_6_years', 'obs_toilets_children',
       'count_children_present', 'count_children_attendance',
       'count_children_precovid', 'count_toilets_children', 'language_child'],
      dtype='object')

In [89]:
# map_years = {'1st year in the programme': 1, 
#              '2nd year in programme': 2,
#              '3rd year in programme': 3, 
#              'Do Not Know':0}

# df_train["child_years_in_programme"] = df_train["child_years_in_programme"].map(map_years)
# df_test["child_years_in_programme"] = df_test["child_years_in_programme"].map(map_years)

In [90]:
drop_cols = ['count_staff_gender_other',
         'pri_days',
         'obs_lighting_8',
         'count_practitioners_all',
         'count_register_gender_other',
         'obs_heating_3',
         'obs_lighting_5',
         'pri_difficult_see',
         'pri_difficult_walk',
         'obs_lighting_6',        
         'teacher_social_initiative',]
df_train[drop_cols].isnull().sum()

count_staff_gender_other       3424
pri_days                       3394
obs_lighting_8                 6330
count_practitioners_all        6957
count_register_gender_other    3470
obs_heating_3                  6324
obs_lighting_5                 6330
pri_difficult_see              6957
pri_difficult_walk             6957
obs_lighting_6                 6330
teacher_social_initiative      8585
dtype: int64

In [13]:
len(df_train.query("child_gender == 'Female'"))

4442

In [14]:
# df_train["child_gender"] = df_child["child_gender"].map({"Male": })

In [15]:
# pd.set_option("display.max_rows", 50)
# df_train["child_dob"] = 
# df_train["child_dob"] = pd.to_datetime(df_train["child_dob"],yearfirst=True)
# df_test["child_dob"] = pd.to_datetime(df_test["child_dob"],yearfirst=True)

# df_train["child_dob_month"] = df_train["child_dob"].dt.month
# df_test["child_dob_month"] = df_test["child_dob"].dt.month

# Machine Learning Algorithms

## Training with numerical columns only

In [16]:
n_splits = 10

kf = KFold(n_splits, shuffle=True, random_state=42)
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=2, random_state=42)

In [17]:
# # Cross validation
def cross_validation(X, y, model, cv_method):
    rmse_scores = list()
    
    # cv = cv_method(n_splits = N_SPLITS, n_repeats=5, random_state=1121218)
    
    for idx, (train_idx, test_idx) in enumerate(cv_method.split(X, y)):
        
        print("=" * 12 + f"Training fold {idx}" + 12 * "=")
        start = time.time()
        
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Making eval_set
        eval_set = [(X_test.values, y_test)]

        # Fitting the model to the data
        model.fit(X_train.values, y_train)

        # Predictions
        preds = model.predict(X_test)
        
        # Loss
        rmse = mean_squared_error(y_test, preds, squared=False)
        rmse_scores.append(rmse)
        
        runtime = time.time()-start
        
        print(f"Fold {idx} finished with score: {rmse:.5f} in {runtime:.2f} seconds.\n")
    
    print(f"Average score {np.mean(rmse_scores)}")
        
    
    return np.mean(rmse_scores), rmse_scores

# rmse, cv_scores = cross_validation(X, y, hgbr, k_fold)

In [18]:
# Getting feature importances
def get_feature_imp(model, imp_factor):
    cols_imp = [(col,imp) for col, imp in zip(model.feature_name_, model.feature_importances_) if imp < imp_factor]
    cols = [col for col, imp in zip(model.feature_name_, model.feature_importances_) if imp < imp_factor]
    
    return cols, cols_imp

In [19]:
def feature_imp_dataframe(model):
    feature_imp = model.feature_importances_
    columns = model.feature_name_

    df_imp = pd.DataFrame(
            {"features": columns,
            "feature_importance": feature_imp}
    )

    df_imp.sort_values(by=['feature_importance'], ascending=False, inplace=True)
    df_imp.reset_index(drop=True, inplace=True)
    
    return df_imp

In [20]:
def plot_feature_importance(df_feature_imp, n_imp_cols):
    labels = df_feature_imp.iloc[:30, 0].values

    plt.figure(figsize=(20, 6))
    ax = df_feature_imp.iloc[:30, 1].\
        plot(kind="bar", title="Feature Importance")
    ax.set_xticklabels(labels)
    plt.show()

### Selecting numerical columns from df_train and df_test

### Lightgbm

In [91]:
cols_list_2 = ['pri_difficult_see',
 'count_staff_contract_substitute',
 'pri_difficult_walk',
 'obs_lighting_4',
 'obs_heating_3',
 'count_register_gender_other',
 'count_staff_paid_managers',
 'obs_lighting_5',
 'count_practitioners_all',
 'obs_cooking_4',
 'pri_days',
 'count_staff_gender_other',
 'obs_cooking_5',
 'obs_lighting_6',
 'obs_lighting_3']

cols_list_3 = ['pri_difficult_see',
 'count_staff_contract_substitute',
 'pri_difficult_walk',
 'obs_lighting_4',
 'obs_heating_3',
 'count_register_gender_other',
 'count_staff_paid_managers',
 'obs_lighting_5',
 'count_practitioners_all',
 'obs_cooking_4',
 'pri_days',
 'count_staff_gender_other',
 'obs_cooking_5',
 'obs_lighting_6',
 'obs_lighting_3',
 'language_match',
  'teacher_social_initiative',
  'obs_lighting_8',
  'obs_heating_7']

In [92]:
train_int_cols = df_train.select_dtypes(include = ["float64", "number", "Int64", "int64"]).columns
test_int_cols = df_test.select_dtypes(include = ["float64", "number", "Int64", "int64"]).columns

# Numerical columns
num_cols = list(set(test_int_cols).intersection(set(train_int_cols)))


X, y = df_train[num_cols].drop(columns=drop_cols), df_train[['target']]
# X, y = df_train[num_cols].drop(columns=cols_list_2), df_train[['target']]
# X, y = df_train[num_cols].drop(columns=cols_list_3), df_train[['target']]

In [93]:
X.shape, y.shape

((8585, 158), (8585, 1))

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state =55)

# Training the model with reduced columns
lgbm = LGBMRegressor(learning_rate=0.01,
                     objective="regression",
                     n_estimators=3000,
                     num_leaves=35,
                     max_bin=240,
                     colsample_bytree=0.3,
                     max_depth=9,
                     min_child_samples=20,
                     n_jobs=6,
                     random_state=42)

lgbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric="rmse",
        early_stopping_rounds=100,
        verbose=False)

# Get score
lgbm.best_score_

defaultdict(collections.OrderedDict,
            {'valid_0': OrderedDict([('rmse', 9.536860776300456),
                          ('l2', 90.95171346653814)])})

In [95]:
get_feature_imp(lgbm, 5)

(['language_match',
  'count_staff_contract_substitute',
  'obs_lighting_3',
  'count_register_year_2013',
  'obs_heating_7',
  'obs_cooking_4'],
 [('language_match', 0),
  ('count_staff_contract_substitute', 0),
  ('obs_lighting_3', 4),
  ('count_register_year_2013', 3),
  ('obs_heating_7', 1),
  ('obs_cooking_4', 0)])

In [96]:
# Training the model on whole dataset
lgbm = LGBMRegressor(learning_rate=0.01,
                     objective="regression",
                     n_estimators=3000,
                     num_leaves=35,
                     max_bin=240,
                     colsample_bytree=0.3,
                     max_depth=9,
                     min_child_samples=20,
                     n_jobs=6,
                     random_state=42)
lgbm.fit(X, y)

In [97]:
# Prediction on test set
test_data =  df_test[X.columns.to_list()]

preds = lgbm.predict(test_data)
preds

array([60.86808134, 46.10992002, 48.7298092 , ..., 40.48581844,
       43.30328087, 42.49748806])

In [98]:
# Creating a submissionfile
sub_file = pd.read_csv("GBRSubmission1_score_10.63.csv")
sub_file.target = preds
sub_file.to_csv("LGBMSubmission_drop_cols_4.csv", index=False)

#### Selecting common numerical features from train and test set

In [None]:
train_int_cols = train.select_dtypes(include = ["float64", "number", "Int64", "int64"]).columns
test_int_cols = test.select_dtypes(include = ["float64", "number", "Int64", "int64"]).columns

# Numerical columns
num_cols = list(set(test_int_cols).intersection(set(train_int_cols)))

In [None]:
# Splitting features and target
X, y = train[num_cols], train[['target']]

In [None]:
print(X.shape, y.shape)

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =55)

In [None]:
# Training the model with reduced columns
lgbm = LGBMRegressor(learning_rate=0.01,
                     objective="regression",
                     n_estimators=3050,
                     num_leaves=35,
                     max_bin=240,
                     colsample_bytree=0.3,
                     max_depth=9,
                     min_child_samples=20,
                     n_jobs=6,
                     random_state=42)


lgbm.fit(
          X_train, y_train.values, 
          eval_set=[(X_test, y_test.values)],
          eval_metric = "rmse",
          early_stopping_rounds=20,
          verbose=False
          )


# Make predictions
y_pred = lgbm.predict(X_test)

# Check score
mean_squared_error(y_test, y_pred, squared=False)

In [None]:
# pd.set_option("max_colwidth", None)
# pd.concat([X[labels], y], axis=1).corr()

In [None]:
cols, cols_imp = get_feature_imp(lgbm, 5)
print(f"Columns:\n {cols_imp}")

In [None]:
df_imp = feature_imp_dataframe(lgbm)
plot_feature_importance(df_imp, 30)

### Creating dataset using important columns from lgbm.feature_imp

In [None]:
X = train[num_cols].drop(columns=drop_cols)
X.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =55)

In [None]:
# lgbm_preds_feat
# (learning_rate=0.01,
# objective="regression",
# n_estimators=3000,
# num_leaves=35,
# max_bin=240,
# colsample_bytree=0.7,
# max_depth=9,
# min_child_samples=20,
# n_jobs=8,
# random_state=42) score = 9.77, 10.17


# lgbm_preds_feat1
# (learning_rate=0.01,
# objective="regression",
# n_estimators=3000,
# num_leaves=35,
# max_bin=240,
# colsample_bytree=0.3,
# max_depth=9,
# min_child_samples=20,
# n_jobs=8,
# random_state=42) score = 9.749, 10.0178



In [None]:
# Training the model with reduced columns
lgbm_2 = LGBMRegressor(
                    objective="regression",
                     learning_rate=0.01,
                     n_estimators=3000,
                     num_leaves=35,
                     max_bin=240,
                     colsample_bytree=0.3,
                     max_depth=9,
                     min_child_samples=20,
                     n_jobs=8,
                     random_state=42
                )

lgbm_2.fit(
          X_train.values, y_train.values, 
          eval_set=[(X_test.values, y_test.values)],
          eval_metric = "rmse",
          early_stopping_rounds=50,
          verbose=False
          )

# Make predictions
y_pred = lgbm_2.predict(X_test)

# Check score
mean_squared_error(y_test, y_pred, squared=False)

## Prediction on test dataset

In [None]:
## Training on whole dataset
lgbm.fit(X, y)

In [None]:
# Prediction on test dataset
test_data = df_test[X.columns.to_list()]
# lgbm_preds_new_feat = lgbm.predict(test_data)
# lgbm_preds_new_feat1 = lgbm.predict(test_data)

# lgbm_preds_new_feat2 = lgbm.predict(test_data)
lgbm_preds_drop_cols = lgbm.predict(test_data)

lgbm_preds_drop_cols

In [None]:
lgbm_preds_new_feat1

## Making Submission filr

In [None]:
import eli5

In [None]:
test_data = test[num_cols].drop(columns=low_imp_col, axis=1)

In [None]:
sub_file = pd.read_csv("GBRSubmission1_score_10.63.csv")
sub_file.target = lgbm_preds_drop_cols
sub_file.to_csv("LGBMSubmission_lgb_preds_drop_cols.csv", index=False)