In [None]:
'''
Importing Libraries
'''
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [None]:
'''
Loading data
'''
# Read the three CSV inputs; every file is decoded as ISO-8859-1 (Latin-1).
train_df, demog_df, submission = [
    pd.read_csv(csv_name, encoding="ISO-8859-1")
    for csv_name in ('train.csv', 'demog.csv', 'submission.csv')
]

In [None]:
# Histogram of every column of interest on one shared grid, replacing six
# copy-pasted cells. Sorting the values first is unnecessary: a histogram
# does not depend on the order of its input.
hist_columns = ['RL', 'OLV', 'RR', 'DRT', 'DMS', 'OLA']
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for ax, column in zip(axes.flat, hist_columns):
    # Only the observed (non-null) values are binned.
    ax.hist(train_df[column].dropna().values)
    ax.set(title='Distribution of ' + column, xlabel=column, ylabel='count')
fig.tight_layout()
plt.show()

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the demog frame.
demog_df.head(n=5)

In [None]:
# Left-join the demog_df columns onto the training rows, matching on HCP_ID,
# so every training row is kept even when it has no match.
train_df = train_df.merge(demog_df, how='left', on='HCP_ID')
train_df.head()

In [None]:
# Count of missing entries per column.
train_df.isna().sum()

In [None]:
# Frequency of each observed gender value (missing entries are excluded).
train_df['gender'].value_counts(dropna=True)

In [None]:
# Replace missing gender entries with the constant 'MALE'.
train_df = train_df.assign(gender=train_df['gender'].fillna('MALE'))

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance is refit for each column in turn, so after the loop
# it only remembers the mapping of the last column ('gender').
lbl = LabelEncoder()
for column in ('Region', 'Value', 'gender'):
    train_df[column] = lbl.fit_transform(train_df[column])

train_df.dtypes

In [None]:
'''
Number of Null values in every column
'''
def get_number_of_null_entries(df, var):
    """Return how many entries of column ``var`` in ``df`` are missing.

    The count is the column length minus the number of non-null entries
    reported by ``count()``.
    """
    column = df[var]
    total_rows = len(column)
    return total_rows - column.count()

# One line per column: the column name followed by its missing-value count.
for column_name in train_df.columns:
    print(column_name, get_number_of_null_entries(train_df, column_name))

In [None]:
def _encode_predictors(features):
    """Fill gaps in every predictor column and integer-encode the text columns."""
    for col in features.columns:
        column = features[col]
        if column.dtype == 'object':
            # Text column: fill gaps with the most frequent value, then map
            # each distinct value to an integer code.
            column = column.fillna(column.mode()[0])
            features[col] = LabelEncoder().fit_transform(column)
        else:
            # Numeric column: fill gaps with the column mean.
            features[col] = column.fillna(column.mean())
    return features


def _search_best_model(estimator, n_estimators_grid, cv, verbose, X_train, y_train):
    """Grid-search ``estimator`` and return the winner refit on all training rows."""
    param_grid = {
        'learning_rate': [0.05],
        'max_depth': [3, 4, 5, 6],
        'n_estimators': n_estimators_grid,
    }
    search = GridSearchCV(estimator, param_grid, n_jobs=5, cv=cv, verbose=verbose)
    search.fit(X_train, y_train)
    # GridSearchCV refits the best parameter combination on the whole training
    # set by default, so no manual rebuild of the model is needed.
    return search.best_estimator_


def fill_na_function(data, output_col, random_state=None):
    """Predict the missing values of ``output_col`` from all other columns.

    Every other column of ``data`` is used as a predictor (after gap filling
    and integer-encoding of text columns). A gradient-boosted model is tuned
    with a 5-fold grid search on the rows where ``output_col`` is present and
    then used to predict the rows where it is missing. An ``object`` dtype
    target is treated as classification, anything else as regression.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target and the predictor columns. It is not modified.
    output_col : str
        Name of the column whose missing values are to be filled.
    random_state : int or None, optional
        Seed for the shuffling of the cross-validation folds. ``None`` (the
        default) keeps the folds unseeded.

    Returns
    -------
    pandas.Series
        A copy of ``data[output_col]`` with the missing entries replaced by
        model predictions; returned unchanged when nothing is missing.

    Raises
    ------
    ValueError
        If every value of ``output_col`` is missing, so no model can be fit.
    """
    target = data[output_col].copy()
    missing_mask = target.isnull()

    if not missing_mask.any():
        # Nothing to impute; predicting on zero rows would only fail.
        return target
    if missing_mask.all():
        raise ValueError(
            "Cannot impute %r: the column has no observed values to train on" % (output_col,)
        )

    # Work on a private copy so the caller's frame is left untouched.
    features = _encode_predictors(data[data.columns.drop(output_col)].copy())
    X_train, X_missing = features[~missing_mask], features[missing_mask]
    y_train = target[~missing_mask]

    if target.dtype == 'object':
        # Classifier labels are mapped to 0..n-1 integer codes for training and
        # mapped back afterwards; recent XGBoost releases expect integer class
        # labels rather than raw strings.
        target_encoder = LabelEncoder()
        y_codes = target_encoder.fit_transform(y_train)
        num_class = len(target_encoder.classes_)

        if num_class > 2:
            estimator = xgb.XGBClassifier(objective='multi:softmax', num_class=num_class)
            n_estimators_grid = [30, 40, 50]
        else:
            estimator = xgb.XGBClassifier()
            n_estimators_grid = [30, 60, 90, 125]

        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
        model = _search_best_model(estimator, n_estimators_grid, folds, 10, X_train, y_codes)
        predicted_codes = np.asarray(model.predict(X_missing)).astype(int)
        predictions = target_encoder.inverse_transform(predicted_codes)
    else:
        folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
        model = _search_best_model(
            xgb.XGBRegressor(), [30, 60, 90, 125], folds, 1, X_train, y_train
        )
        predictions = model.predict(X_missing)

    target.loc[missing_mask] = predictions
    return target

In [None]:
# Visit the columns from the fewest missing values to the most. Each filled
# column is written back before the next one is processed, so it is complete
# when it later serves as a predictor. The counts stay numeric and the sort is
# stable, so columns with equal counts keep their original order.
missing_counts = train_df.isnull().sum().sort_values(kind='mergesort')
for col, n_missing in missing_counts.items():
    print(col, n_missing)
    if n_missing != 0:
        # A copy is passed so the imputation helper cannot alter train_df.
        train_df[col] = fill_na_function(train_df.copy(), col)

In [None]:
# Write only the listed columns to the output CSV, without the index.
train_df.to_csv(
    'shariq_suhail_03021997.csv',
    columns=['HCP_ID', 'RL', 'P2P', 'OLV', 'RR', 'DRT', 'DMS', 'OLA', 'DEM'],
    index=False,
)