In [1]:
#Import libraries
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pickle
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm
from sklearn.externals import joblib

In [2]:
import tensorflow as tf
tf.keras.backend.clear_session()
from tensorflow.keras.layers import Dense,Input,Dropout
from tensorflow.keras.models import Model

<h2>1. Label Encoding and Normalising

<h3>1.1. Import dataset</h3>

In [3]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data.shape

(4209, 378)

<h3>1.2. Removing outliers</h3>

In [5]:
#Remove outliers
data = data[data['y']<=150]

In [6]:
data.shape

(4194, 378)

<h3>1.3. Split the dataset</h3>

In [7]:
X = data.drop(columns=['y'])
Y = data['y']

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=25) 

In [9]:
print(X_train.shape)
print(X_val.shape)

(3355, 377)
(839, 377)


<h3>1.4. Label encoding of categorical features</h3>

In [10]:
def label_encoding(dataset):
    """
    Returns dictionary of Label encoding of categorical features
    """
    a = dataset.value_counts()
    
    #Get categories which occur more than 5 times in dataset
    b = list(filter(lambda x:x>5,a.values))    
    c = list(a.index[0:len(b)])

    return {j:i+1 for i,j in enumerate(c)}  #Dictionary of labels, unknown categories are assigned label 0

In [11]:
x0_labels = label_encoding(X_train['X0'])
x1_labels = label_encoding(X_train['X1'])
x2_labels = label_encoding(X_train['X2'])
x3_labels = label_encoding(X_train['X3'])
x5_labels = label_encoding(X_train['X5'])
x6_labels = label_encoding(X_train['X6'])
x8_labels = label_encoding(X_train['X8'])

In [12]:
categ_mapping = [x0_labels,x1_labels,x2_labels,x3_labels,x5_labels,x6_labels,x8_labels]

In [13]:
with open('categ_mapping.pkl', 'wb') as f:
    pickle.dump(categ_mapping, f)

In [14]:
X_train['X0'] = [x0_labels[i] if x0_labels.get(i) is not None else 0 for i in X_train['X0']]
X_train['X1'] = [x1_labels[i] if x1_labels.get(i) is not None else 0 for i in X_train['X1']]
X_train['X2'] = [x2_labels[i] if x2_labels.get(i) is not None else 0 for i in X_train['X2']]
X_train['X3'] = [x3_labels[i] if x3_labels.get(i) is not None else 0 for i in X_train['X3']]
X_train['X5'] = [x5_labels[i] if x5_labels.get(i) is not None else 0 for i in X_train['X5']]
X_train['X6'] = [x6_labels[i] if x6_labels.get(i) is not None else 0 for i in X_train['X6']]
X_train['X8'] = [x8_labels[i] if x8_labels.get(i) is not None else 0 for i in X_train['X8']]

<h3>1.5. Normalise categorical features</h3>

In [15]:
#Features from feature engineering which are non-binary
scale_features = ['ID','X0','X1','X2','X3','X5','X6','X8']

In [16]:
scaler = MinMaxScaler()
scaler.fit(X_train[scale_features])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [17]:
scale_features_min = {i:j for i,j in zip(scale_features,scaler.data_min_)}
scale_features_max = {i:j for i,j in zip(scale_features,scaler.data_max_)}

In [18]:
norm_features = [scale_features,scale_features_min,scale_features_max]

In [19]:
with open('norm_features.pkl', 'wb') as f:
    pickle.dump(norm_features, f)

<h2>2. Modeling

<h3>2.1. Final Function 1 

In [20]:
def final_fun_1(dataset):
    
    """
    This function returns the predictions for input dataset
    
    Following files are needed in directory to run this function:
        1. categ_mapping.pkl - Contains the label encoding mapping of categorical features
        2. feature_engg.npy - Names of feature engineering variables
        3. num_features.npy - Names of binary features in the dataset
        4. norm_features.pkl - Contains utilies for normalising the non-binary features
        5. top_features.npy - Contains names of top features obtained during feature selection
        6. ensemble_drop_base_models.npy - Contains model names to drop from ensemble base models
    """
    start = time.time()
    
    #Functions
    def feature_engineering(data,feature_engg):
        """
        Adding new features to dataset
        """
        for i in tqdm(feature_engg):
            if len(i.split('_'))==2:
                a,b = i.split('_')
                data[i] = data[a] + data[b]
            else:
                a,b,c = i.split('_')
                data[i] = data[a] + data[b] + data[c]
        return data
    
    def normalizer(obs,min_value,max_value):
        """
        This function normalises the input value.
        """
        return (obs - min_value)/(max_value-min_value)
    
    print('\n1. Data preparation')
    print('\n1.1. Label encoding of categorical features')
    #Load categorical label encoding mapping
    with open('categ_mapping.pkl', 'rb') as f:
        categ_mapping = pickle.load(f)
    
    #Encode Categorical variables
    x0_labels,x1_labels,x2_labels,x3_labels,x5_labels,x6_labels,x8_labels = categ_mapping
    dataset['X0'] = [x0_labels[i] if x0_labels.get(i) is not None else 0 for i in dataset['X0']]
    dataset['X1'] = [x1_labels[i] if x1_labels.get(i) is not None else 0 for i in dataset['X1']]
    dataset['X2'] = [x2_labels[i] if x2_labels.get(i) is not None else 0 for i in dataset['X2']]
    dataset['X3'] = [x3_labels[i] if x3_labels.get(i) is not None else 0 for i in dataset['X3']]
    dataset['X5'] = [x5_labels[i] if x5_labels.get(i) is not None else 0 for i in dataset['X5']]
    dataset['X6'] = [x6_labels[i] if x6_labels.get(i) is not None else 0 for i in dataset['X6']]
    dataset['X8'] = [x8_labels[i] if x8_labels.get(i) is not None else 0 for i in dataset['X8']]
    
    #Load feature names for feature engineering
    print('\n1.2. Feature engineering')
    feature_engg = np.load('feature_engg.npy',allow_pickle=True)
    
    #Feature engineering
    dataset = feature_engineering(dataset,feature_engg)
    
    #Load names of binary features
    num_features = np.load('num_features.npy',allow_pickle=True)
    
    #Combine features
    final_features = ['ID','X0','X1','X2','X3','X5','X6','X8'] + list(num_features) + list(feature_engg)
    dataset = dataset[final_features]
    
    print('\n1.3. Normalising of non-binary features')
    
    #Load utilies for normalising features
    with open('norm_features.pkl', 'rb') as g:
        norm_features = pickle.load(g)
    
    #Normalise features
    scale_features, scale_features_min, scale_features_max = norm_features
    for i in tqdm(scale_features):
        dataset.loc[:,i] = dataset.loc[:,i].apply(normalizer,min_value=scale_features_min[i],max_value=scale_features_max[i])
    
    #Load names of top features
    top_features = np.load('top_features.npy')
    
    print('\n2. Model Prediction using Ensemble Bagging')
    
    features = top_features[:15]
    X_pred = dataset[features]
    X_pred.columns = ['f'+str(i) for i in range(len(X_pred.columns))]

    drop_models = np.load('ensemble_drop_base_models.npy')
    
    n_folds = 10
    n_splits = 50
    pred = np.zeros((X_pred.shape[0],n_splits-len(drop_models)))

    ml_models = [i+1 for i in range(n_splits) if i+1 not in drop_models]

    for i,ml in enumerate(tqdm(ml_models)):
        a = np.zeros((pred.shape[0],n_folds))
        for j in range(n_folds):
            m = joblib.load('ml_models/ml_model_'+str(ml)+'_fold_'+str(j+1)+'.pkl')
            a[:,j] = m.predict(X_pred)
        pred[:,i] = np.mean(a,axis=1)

    test_pred = pred
    
    #Take average of all predictions
    final_pred = np.mean(test_pred,axis=1)
    
    print('Time taken for predictions: {} seconds'.format(round(time.time()-start)))
    
    return final_pred

In [21]:
test = pd.read_csv('test.csv')

In [22]:
test_pred = final_fun_1(test)


1. Data preparation

1.1. Label encoding of categorical features

1.2. Feature engineering


HBox(children=(FloatProgress(value=0.0, max=314.0), HTML(value='')))



1.3. Normalising of non-binary features


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))



2. Model Prediction using Ensemble Bagging


HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))


Time taken for predictions: 37 seconds


In [23]:
pred = pd.DataFrame()
pred['ID'] = test.ID
pred['y'] = test_pred
pred.to_csv('final_test_pred2.csv', index=False)

<h3>3.2. Final Function 2

In [25]:
def final_fun_2(X,Y):
    
    """
    This function returns the R-squared value. 
    
    Following files are needed in directory to run this function:
        1. categ_mapping.pkl - Contains the label encoding mapping of categorical features
        2. feature_engg.npy - Names of feature engineering variables
        3. num_features.npy - Names of binary features in the dataset
        4. norm_features.pkl - Contains utilies for normalising the non-binary features
        5. top_features.npy - Contains names of top features obtained during feature selection
    """
    
    final_pred = final_fun_1(X)
    r_squared_value = r2_score(Y,final_pred) 
    
    return r_squared_value

In [26]:
train = pd.read_csv('train.csv')

In [27]:
data = train.sample(2000)
X = data.drop(columns=['y'])
Y = data['y']

In [28]:
score = final_fun_2(X,Y)


1. Data preparation

1.1. Label encoding of categorical features

1.2. Feature engineering


HBox(children=(FloatProgress(value=0.0, max=314.0), HTML(value='')))



1.3. Normalising of non-binary features


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))



2. Model Prediction using Ensemble Bagging


HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))


Time taken for predictions: 29 seconds


In [29]:
print('R-squared value:',score)

R-squared value: 0.5999049866739438
