In [212]:
## Importing libraries 
import eli5

import time
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("ggplot")
%matplotlib inline


import optuna
from optuna.integration import LightGBMPruningCallback, CatBoostPruningCallback
optuna.logging.set_verbosity(optuna.logging.WARNING)

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, RepeatedKFold

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector


from xgboost import DMatrix, XGBRegressor
from catboost import Pool, CatBoostRegressor
from lightgbm import LGBMRegressor, DaskLGBMRegressor

## Reading the data

In [346]:
train = pd.read_csv("Train.csv")

test = pd.read_csv("Test.csv")
sub = pd.read_csv("SampleSubmission (1).csv")
vard = pd.read_csv("VariableDescription.csv")

In [214]:
train.head()

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,ID_SYSJ2FM0D,2022.0,2022-02-03,59.0,,,,,,Sometimes,...,,,,,,,,,,51.5
1,ID_J5BTFOZR3,2019.0,,60.163933,,,,1st year in the programme,103.0,Sometimes,...,,,,,,,,,,55.869999
2,ID_R00SN7AUD,2022.0,2022-03-11,69.0,,,,,108.400002,Often,...,,,,,,,,,,47.52
3,ID_BSSK60PAZ,2021.0,2021-10-13,53.0,2020-01-15,20.0,No,1st year in the programme,98.099998,Almost always,...,,,,,,,,,,58.599998
4,ID_IZTY6TC4D,2021.0,2021-10-13,57.0,2021-10-13,0.0,,2nd year in programme,114.0,Almost always,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,76.599998


In [215]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8585 entries, 0 to 8584
Columns: 679 entries, child_id to target
dtypes: float64(160), object(519)
memory usage: 44.5+ MB


In [216]:
df_train = train.copy()
df_test = test.copy()

In [217]:
# train.columns[train.columns.str.contains("observe")].to_list()
observe_columns =['child_observe_attentive', 'child_observe_concentrated',
               'child_observe_diligent', 'child_observe_interested',]

mapper_observe = {'Sometimes':1,
         'Almost never':0,
         'Often':2,
         'Almost always':3}

In [218]:
social_columns = ['teacher_social_initiative', 'teacher_social_peers',
                'teacher_social_nonaggressive',  'teacher_social_cooperate', 
                'teacher_social_assistance', 'teacher_social_ideas', 'teacher_social_initiative']

mapper_social = {'None of the time': 0,
                 'A little of the time': 1,
                 'Most of the time': 2,
                 'All of the time': 3}

In [219]:
# Function to map values to the columns
def mapper(df, map_columns, map_values):
    for col in map_columns:
        df[col] = df[col].map(map_values)
    return df

In [220]:
# Mapping values
df_train = mapper(df_train, observe_columns, mapper_observe)
df_test = mapper(df_test, observe_columns, mapper_observe)

df_train = mapper(df_train, social_columns, mapper_social)
df_test = mapper(df_test, social_columns, mapper_social)

In [221]:
df_train.columns[df_train.columns.str.contains("child")]

Index(['child_id', 'child_date', 'child_age', 'child_enrolment_date',
       'child_months_enrolment', 'child_grant', 'child_years_in_programme',
       'child_height', 'child_observe_attentive', 'child_observe_concentrated',
       'child_observe_diligent', 'child_observe_interested',
       'child_observe_total', 'child_gender', 'child_dob', 'child_zha',
       'child_stunted', 'child_attends', 'child_attendance', 'child_languages',
       'child_age_group', 'pri_children_4_6_years', 'obs_toilets_children',
       'count_children_present', 'count_children_attendance',
       'count_children_precovid', 'count_toilets_children', 'language_child'],
      dtype='object')

In [222]:
drop_cols = ['count_staff_gender_other',
         'pri_days',
         'obs_lighting_8',
         'count_practitioners_all',
         'count_register_gender_other',
         'obs_heating_3',
         'obs_lighting_5',
         'pri_difficult_see',
         'pri_difficult_walk',
         'obs_lighting_6',        
         'teacher_social_initiative',]
df_train[drop_cols].isnull().sum()

count_staff_gender_other       3424
pri_days                       3394
obs_lighting_8                 6330
count_practitioners_all        6957
count_register_gender_other    3470
obs_heating_3                  6324
obs_lighting_5                 6330
pri_difficult_see              6957
pri_difficult_walk             6957
obs_lighting_6                 6330
teacher_social_initiative      8585
dtype: int64

In [274]:
def preprocess_data(train, test):
   # train.columns[train.columns.str.contains("observe")].to_list()
    observe_columns =['child_observe_attentive', 'child_observe_concentrated',
                   'child_observe_diligent', 'child_observe_interested',]

    mapper_observe = {'Sometimes':1,
             'Almost never':0,
             'Often':2,
             'Almost always':3} 
    
    social_columns = ['teacher_social_initiative', 'teacher_social_peers',
                'teacher_social_nonaggressive',  'teacher_social_cooperate', 
                'teacher_social_assistance', 'teacher_social_ideas', 'teacher_social_initiative']

    mapper_social = {'None of the time': 0,
                 'A little of the time': 1,
                 'Most of the time': 2,
             'All of the time': 3} 
    
    # Mapping values
    train = mapper(train, observe_columns, mapper_observe)
    test = mapper(test, observe_columns, mapper_observe)

    train = mapper(train, social_columns, mapper_social)
    test = mapper(test, social_columns, mapper_social) 
    
    # Columns to drop
    drop_cols = ['count_staff_gender_other',
         'pri_days',
         'obs_lighting_8',
         'count_practitioners_all',
         'count_register_gender_other',
         'obs_heating_3',
         'obs_lighting_5',
         'pri_difficult_see',
         'pri_difficult_walk',
         'obs_lighting_6',        
         'teacher_social_initiative',]
    
    train.drop(columns=drop_cols, axis=1, inplace=True)
    test.drop(columns=drop_cols, axis=1, inplace=True)
    
    return train, test

In [223]:
# df_train["child_gender_male"] = df_train["child_gender"].map({"Male":1,
#                                                        "Female": 0})
# df_test["child_gender_male"] = df_test["child_gender"].map({"Male":1,
#                                                        "Female":0})

In [224]:
# df_train["count_present_boys_girls_ratio"] = df_train["count_present_boys"] / df_train["count_present_girls"]
# df_test["count_present_boys_girls_ratio"] = df_test["count_present_boys"] / df_test["count_present_girls"]

In [225]:
# Extracting  day and month from some datetime columns
# datetime_columns = [
#                      'child_enrolment_date',
#                      'pqa_date',
#                      'pra_date',
#                      'obs_date',
#                      'child_dob'
#                     ]


# df_train[datetime_columns] = df_train[datetime_columns].apply(lambda x:pd.to_datetime(x, yearfirst=True))
# df_test[datetime_columns] = df_train[datetime_columns].apply(lambda x: pd.to_datetime(x, yearfirst=True))

# # Extracting month
# df_train["child_dob_month"] = df_train["child_dob"].dt.month
# df_test["child_dob_month"] = df_test["child_dob"].dt.month

# # Extracting day
# df_train["child_dob_day"] = df_train["child_dob"].dt.day
# df_test["child_dob_day"] = df_test["child_dob"].dt.day

# # Converting "child_enroll_month_column" to datetime 
# df_train["child_enrolment_month"] = df_train["child_enrolment_date"].dt.month
# df_test["child_enrolment_month"] = df_test["child_enrolment_date"].dt.month

# df_train["child_enrolment_day"] = df_train["child_enrolment_date"].dt.month
# df_test["child_enrolment_day"] = df_test["child_enrolment_date"].dt.month


In [226]:
# df_train["child_age_height_ratio"] = df_train["child_age"] / df_train["child_height"]
# df_test["child_age_height_ratio"] = df_test["child_age"] / df_test["child_height"]

# df_train["child_zha_ag_height_ratio"] = df_train["child_zha"] / df_train["child_age_height_ratio"]
# df_test["child_zha_ag_height_ratio"] = df_test["child_zha"] / df_test["child_age_height_ratio"]

In [227]:
# id_columns = ['id_enumerator', 'id_facility', 'id_ward', 'id_team']
# id_n_columns = ['id_facility_n', 'id_ward_n', 'id_mn_n', 'id_dc_n', 'id_prov_n']


# df_train['count_id'] = df_train[id_columns].sum(axis=1)
# df_test['count_id'] = df_test[id_columns].sum(axis=1)


# df_train['count_id_n'] = df_train[id_n_columns].sum(axis=1)
# df_test['count_id_n'] = df_test[id_n_columns].sum(axis=1)

# Machine Learning Algorithms

## Training with numerical columns only

In [228]:
n_splits = 10

kf = KFold(n_splits, shuffle=True, random_state=42)
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=2, random_state=42)

In [362]:
# # Cross validation
def cross_validation(X, y, model, cv_method):
    rmse_scores = list()
    
    # cv = cv_method(n_splits = N_SPLITS, n_repeats=5, random_state=1121218)
    
    for idx, (train_idx, test_idx) in enumerate(cv_method.split(X, y)):
        
        print("=" * 12 + f"Training fold {idx}" + 12 * "=")
        start = time.time()
        
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Making eval_set
        eval_set = [(X_test.values, y_test)]

        # Fitting the model to the data
        model.fit(X_train.values, y_train,
                 eval_set=eval_set,
                 eval_metri="rmse",
                 early_stopping_rounds=100,
                 verbose=False)

        # Predictions
        preds = model.predict(X_test)
        
        # Loss
        rmse = mean_squared_error(y_test, preds, squared=False)
        rmse_scores.append(rmse)
        
        runtime = time.time()-start
        
        print(f"Fold {idx} finished with score: {rmse:.5f} in {runtime:.2f} seconds.\n")
    
    print(f"Average score {np.mean(rmse_scores)}")
        
    
    return np.mean(rmse_scores), rmse_scores

# rmse, cv_scores = cross_validation(X, y, hgbr, k_fold)

In [230]:
# Getting feature importances
def get_feature_imp(model, imp_factor):
    cols_imp = [(col,imp) for col, imp in zip(model.feature_name_, model.feature_importances_) if imp < imp_factor]
    cols = [col for col, imp in zip(model.feature_name_, model.feature_importances_) if imp < imp_factor]
    
    return cols, cols_imp

In [231]:
def feature_imp_dataframe(model):
    feature_imp = model.feature_importances_
    columns = model.feature_name_

    df_imp = pd.DataFrame(
            {"features": columns,
            "feature_importance": feature_imp}
    )

    df_imp.sort_values(by=['feature_importance'], ascending=False, inplace=True)
    df_imp.reset_index(drop=True, inplace=True)
    
    return df_imp

In [232]:
def plot_feature_importance(df_feature_imp, n_imp_cols):
    labels = df_feature_imp.iloc[:30, 0].values

    plt.figure(figsize=(20, 6))
    ax = df_feature_imp.iloc[:30, 1].\
        plot(kind="bar", title="Feature Importance")
    ax.set_xticklabels(labels)
    plt.show()

### Selecting numerical columns from df_train and df_test

### Lightgbm

In [353]:
def train_model(estimator, params, X, y, X_val=None, y_val=None):

    model = estimator(**params)
    if X_val is not None:
        
        model. fit(X, y,
                eval_set=[(X_val, y_val)],
                eval_metric="rmse",
                early_stopping_rounds=100,
                verbose=False)

        print(model.best_score_)
        return model
    else:
        model.fit(X_train, y_train)
        return model

In [330]:
def predict_output(model, data):
    preds = model.predict(data)
    return preds

In [275]:
# Selecting numerical columns
train_int_cols = df_train.select_dtypes(include = ["float64", "number", "Int64", "int64"]).columns
test_int_cols = df_test.select_dtypes(include = ["float64", "number", "Int64", "int64"]).columns

# Numerical columns
num_cols_df = list(set(test_int_cols).intersection(set(train_int_cols)))

# Separating features and target
X, y = df_train[num_cols_df].drop(columns=cols_list_drop_3), df_train[['target']]
# X, y = df_train[num_cols_df].drop(columns=cols_list_2), df_train[['target']]

In [276]:
X.shape, y.shape

((8585, 150), (8585, 1))

In [332]:
# Pipeline for filling nan values and sacling the data
def use_pipeline():
    
    pipeline = Pipeline([('imputer', IterativeImputer()), # Imputing missing values
                     ('scaler', RobustScaler()),]) # Scaling the data

    # Fitting pipeline of train and test data
    train_x = pd.DataFrame(data=pipeline.fit_transform(X), columns=X.columns.to_list())
    test_data = pd.DataFrame(data=pipeline.transform(test_data), columns=X.columns.to_list())
    
    return train, test

In [277]:
# Test data
test_data = df_test.loc[:, X.columns]
test_data.shape

(3680, 150)

In [334]:
# train_x, test_ = use_pipeline()

In [278]:
X_train, X_test, y_train, y_test = train_test_split(train_x, y, test_size = 0.1, random_state =55)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state =55)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7726, 158) (7726, 1)
(859, 158) (859, 1)


In [279]:
params_reg = {'learning_rate':0.01,
         'objective':'regression',
         'n_estimators':3000,
         'num_leaves':35,
         'max_bin':240,
         'colsample_bytree':0.3,
         'max_depth':9,
         'min_child_samples':20,
         'n_jobs':6,
        'reg_alpha':0.01,
         'random_state':42}

params_tweedie = {'learning_rate':0.01,
         'objective':'tweedie',
         'n_estimators':4000,
         'num_leaves':35,
         'max_bin':240,
         'colsample_bytree':0.3,
         'max_depth':9,
         'min_child_samples':20,
         'n_jobs':6,
         'random_state':42}

In [281]:
model = train_model(LGBMRegressor, params_reg, X_train, y_train, X_test, y_test)

defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('rmse', 9.627724380324523), ('l2', 92.69307674349523)])})


In [273]:
preds = predict_output(lgbm, X_test)
print(preds[:10])

[34.49386685 30.46165061 53.35228212 49.9250945  28.48678123 46.93718381
 39.54700813 59.64352429 40.37541512 51.01113479]


In [201]:
get_feature_imp(lgbm, 5)

(['count_staff_contract_substitute',
  'count_register_race_other',
  'obs_cooking_4'],
 [('count_staff_contract_substitute', 3),
  ('count_register_race_other', 3),
  ('obs_cooking_4', 0)])

In [327]:
# pd.set_option("display.max_row", None)
# df_imp

In [328]:
# df_imp = feature_imp_dataframe(lgbm)
# plot_feature_importance(df_imp, 30)

In [204]:
# Training the model on whole dataset
lgbm = LGBMRegressor(**params_tweedie)
lgbm.fit(train_x, y)

In [208]:
# # Prediction on test set
# test_data =  df_test[X.columns.to_list()]

preds_reg_tweedie = lgbm.predict(test_data)
preds_reg_tweedie

array([73.08726897, 75.4914531 , 56.61901609, ..., 57.10064819,
       69.23507404, 52.66858393])

In [210]:
np.sum(np.abs(preds_reg_tweedie) - np.abs(preds_reg_tweedie_1))

-68784.45989040818

In [207]:
# # Creating a submissionfile
# sub_file = pd.read_csv("GBRSubmission1_score_10.63.csv")
# sub_file.target = preds_reg_tweedie
# sub_file.to_csv("LGBM_tweedie_2.csv", index=False)

### Selecting common numerical features from train and test set

In [347]:
# Preprocess the data
# train, test = preprocess_data(train, test)

In [348]:
train_int_cols = train.select_dtypes(include = ["float64", "number", "Int64", "int64"]).columns
test_int_cols = test.select_dtypes(include = ["float64", "number", "Int64", "int64"]).columns

# Numerical columns
num_cols = list(set(test_int_cols).intersection(set(train_int_cols)))

In [349]:
# Splitting features and target
X, y = train[num_cols], train[['target']]

In [350]:
print(X.shape, y.shape)

(8585, 158) (8585, 1)


In [351]:
# train_x = pd.DataFrame(data=pipeline.fit_transform(X), columns=X.columns.to_list())

In [354]:
# Splitting the data
# X_train, X_test, y_train, y_test = train_test_split(train_x, y, test_size = 0.1, random_state =55)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state =55)

# Training the model
model_tweedie = train_model(LGBMRegressor, params_tweedie, X_train, y_train, X_test, y_test)
model_reg = train_model(LGBMRegressor, params_reg, X_train, y_train, X_test, y_test)

defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('rmse', 9.540110969669945), ('tweedie', 27.576378403610224)])})
defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('rmse', 9.472959295637873), ('l2', 89.736957816812)])})


In [356]:
cols, cols_imp = get_feature_imp(lgbm, 5)
print(f"Columns:\n {cols_imp}")

Columns:
 [('count_staff_contract_substitute', 4), ('obs_cooking_4', 0), ('obs_lighting_3', 2)]


In [326]:
# df_imp = feature_imp_dataframe(lgbm)
# plot_feature_importance(df_imp, 40)

## Catboost

In [None]:
# cbr = CatBoostRegressor(eval_metric="RMSE",learning_rate=0.01, n_estimators=1000, max)
# cbr.fit(X_train, y_train,
#        eval_set=[(X_test, y_test)],
#        early_stopping_rounds=50)


# def objective(trial: optuna.Trial) -> float:
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state =55)

#     param = {
#         "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.2),
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
#         "depth": trial.suggest_int("depth", 1, 12),
#         "min_child_samples": trial.suggest_int("min_child_samples", 20, 100, step = 5),
#         "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
#         "bootstrap_type": trial.suggest_categorical(
#             "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
#         ),
#         "eval_metric": "RMSE",
#     }

#     if param["bootstrap_type"] == "Bayesian":
#         param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
#     elif param["bootstrap_type"] == "Bernoulli":
#         param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

#     cbr = CatBoostRegressor(**param)

#     pruning_callback = CatBoostPruningCallback(trial, "RMSE")
#     cbr.fit(
#         X_train,
#         y_train,
#         eval_set=[(X_test, y_test)],
#         verbose=0,
#         early_stopping_rounds=100,
#         callbacks=[pruning_callback],
#     )

#     # evoke pruning manually.
#     pruning_callback.check_pruned()

#     preds = cbr.predict(X_test)
#     pred_labels = np.rint(preds)
#     accuracy = mean_squared_error(y_test, pred_labels, squared=False)

#     return accuracy


# if __name__ == "__main__":
#     study = optuna.create_study(
#         pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="minimize"
#     )
#     study.optimize(objective, n_trials=100, timeout=600)

#     print("Number of finished trials: {}".format(len(study.trials)))

#     print("Best trial:")
#     trial = study.best_trial

#     print("  Value: {}".format(trial.value))

#     print("  Params: ")
#     for key, value in trial.params.items():
#         print("    {}: {}".format(key, value))

# cbr = CatBoostRegressor(eval_metric="RMSE",learning_rate=0.01)
# cbr.fit(X, y)

## Prediction on test dataset

In [None]:
## Training on whole dataset
lgbm.fit(X, y)

In [None]:
lgbm_preds_new_feat1

## Making Submission filr

In [None]:
import eli5

In [None]:
test_data = test[num_cols].drop(columns=low_imp_col, axis=1)

In [None]:
# sub_file = pd.read_csv("GBRSubmission1_score_10.63.csv")
# sub_file.target = lgbm_preds_drop_cols
# sub_file.to_csv("LGBMSubmission_lgb_preds_drop_cols.csv", index=False)