In [1]:
#Encoding
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


# Single place to configure where the competition CSV files live; every read
# below goes through it instead of repeating the directory in each call.
# NOTE(review): this is still a user-specific absolute path - adjust it when
# running the notebook on another machine.
DATA_DIR = Path('/Users/ShaimaShoukat/Desktop/BDProject')

# All three files are indexed by building_id.
train_values = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')
train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')
test_values = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='building_id')

# Stack train and test features row-wise into a single frame.
# sort=False keeps the column order as-is when the two frames' columns are not
# aligned, which is what the pandas FutureWarning asks callers to opt into.
dataset = pd.concat(objs=[train_values, test_values], axis=0, sort=False)


# Drop geo_level_2_id and geo_level_3_id from the training features.
# The equivalent drop for the test features is commented out below.
train_values = train_values.drop(['geo_level_2_id', 'geo_level_3_id'], axis=1)
#test_values = test_values.drop(['geo_level_2_id', 'geo_level_3_id'], axis=1)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




# Pre-Processing

In [2]:
# One-hot encode every categorical column so that both frames are numeric.
# drop_first=True omits the first level of each encoded column.
categorical_columns = [
    'foundation_type',
    'land_surface_condition',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]
train_values1 = pd.get_dummies(train_values, columns=categorical_columns, drop_first=True)
test_values_subset = pd.get_dummies(test_values, columns=categorical_columns, drop_first=True)


In [3]:
#K-fold target (mean) encoder for a categorical column, built on sklearn's KFold splitter
from sklearn import base
from sklearn.model_selection import KFold

class KFoldTargetEncoderTrain(base.BaseEstimator,
                               base.TransformerMixin):
    """Out-of-fold target (mean) encoder for a single column.

    For every K-fold split, the rows of the validation fold receive the mean
    of the target computed per category on the remaining folds, so a row's own
    target value never contributes to its encoding. Categories that do not
    occur in the other folds fall back to the overall target mean.

    Parameters
    ----------
    colnames : str
        Name of the column to encode.
    targetName : str
        Name of the target column; it must be present in the frame passed to
        ``transform``.
    n_fold : int, default 5
        Number of K-fold splits.
    verbosity : bool, default True
        When true, print the correlation between the new feature and the
        target.
    discardOriginal_col : bool, default False
        When true, the ``targetName`` column is dropped from the returned
        frame.
    random_state : int, default 2019
        Seed for the shuffled K-fold split.
    """
    def __init__(self,colnames,targetName,
                  n_fold=5, verbosity=True,
                  discardOriginal_col=False,
                  random_state=2019):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col
        self.random_state = random_state

    def fit(self, X, y=None):
        """No-op: nothing is learned ahead of ``transform``. Returns self."""
        return self

    def transform(self,X):
        """Add the ``<colnames>_Enc`` column to ``X`` and return the frame.

        The new column is written into the frame that was passed in, so the
        caller's object gains the column as well.
        """
        assert isinstance(self.targetName, str)
        assert isinstance(self.colnames, str)
        assert self.colnames in X.columns
        assert self.targetName in X.columns

        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits=self.n_fold,
                   shuffle=True, random_state=self.random_state)
        col_mean_name = self.colnames + '_' + 'Enc'
        X[col_mean_name] = np.nan

        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            # Per-category target means computed on the other folds only.
            fold_means = X_tr.groupby(self.colnames)[self.targetName].mean()
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(fold_means)

        # Fill categories that were missing from the other folds once, after
        # all folds are written. The result is assigned back rather than using
        # fillna(inplace=True) on the X[col] selection: that form operates on
        # an intermediate object and is not guaranteed to update X itself.
        X[col_mean_name] = X[col_mean_name].fillna(mean_of_target)

        if self.verbosity:
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(
                col_mean_name, self.targetName,
                np.corrcoef(X[self.targetName].values, encoded_feature)[0, 1]))

        if self.discardOriginal_col:
            # NOTE(review): despite the flag's name this drops the target
            # column, not the encoded source column - confirm the intent.
            X = X.drop(self.targetName, axis=1)
        return X



In [4]:
# Target-encode geo_level_1_id against damage_grade on the one-hot encoded frame.
# The label is attached with .assign(), which returns a new frame, so
# train_values1 itself does not pick up the label or the encoded column.
train = train_values1.assign(damage_grade=train_labels['damage_grade'])
targetc = KFoldTargetEncoderTrain('geo_level_1_id','damage_grade',n_fold=5)
new_train = targetc.fit_transform(train)
# The label was only needed to compute the encoding; remove it from the features.
new_train = new_train.drop(['damage_grade'], axis=1)
new_train.head(100) 

Correlation between the new feature, geo_level_1_id_Enc and, damage_grade is 0.469760676528422.


Unnamed: 0_level_0,geo_level_1_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w,geo_level_1_id_Enc
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,2,30,6,5,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,2.162009
28830,8,2,10,8,7,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,2.482426
94947,21,2,10,5,5,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,2.563855
590882,22,2,10,6,5,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,2.001401
201944,11,3,30,8,9,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2.339278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551423,7,2,15,8,4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,2.296679
511997,8,1,35,7,3,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,2.486645
266161,10,1,10,8,3,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,2.344031
643474,21,3,0,5,6,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,2.564824


In [5]:
#Feature importance
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


# Score every column of new_train against the labels with the chi-squared test.
bestfeatures = SelectKBest(k=30, score_func=chi2)
fit = bestfeatures.fit(new_train, train_labels)
np.set_printoptions(precision=10)

# Put column names and their scores side by side in one table.
dfcolumns = pd.DataFrame(new_train.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']

# Show the 30 highest-scoring features.
print(featureScores.nlargest(30, 'Score'))

                                     Specs         Score
2                                      age  89437.601578
37                     ground_floor_type_v  32465.421066
35                             roof_type_x  28048.595012
28                       foundation_type_i  27929.304672
0                           geo_level_1_id  24186.950757
10  has_superstructure_cement_mortar_brick  18949.414382
41                      other_floor_type_s  18549.408221
14        has_superstructure_rc_engineered  14411.297027
3                          area_percentage  11938.314627
13    has_superstructure_rc_non_engineered   8725.518483
31                       foundation_type_w   8315.794578
6      has_superstructure_mud_mortar_stone   6969.601844
29                       foundation_type_r   6391.952318
30                       foundation_type_u   5494.248443
40                      other_floor_type_q   5108.461280
19                 has_secondary_use_hotel   2937.304376
20                has_secondary

In [6]:
# Feature subset used for modelling.
features = [
    'geo_level_1_id', 'count_floors_pre_eq', 'age', 'area_percentage',
    'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type',
    'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
    'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber', 'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered', 'legal_ownership_status',
    'has_secondary_use_hotel', 'has_secondary_use_rental',
    'has_superstructure_rc_engineered', 'has_secondary_use', 'plan_configuration',
]
# Categorical members of `features` that get one-hot encoded.
selected_categoricals = [
    'foundation_type', 'roof_type', 'ground_floor_type',
    'other_floor_type', 'plan_configuration', 'legal_ownership_status',
]

new_train = train_values[features]
test_values = test_values[features]
new_train = pd.get_dummies(new_train, columns=selected_categoricals, drop_first=True)
test_values_subset = pd.get_dummies(test_values, columns=selected_categoricals, drop_first=True)

# Train and test are encoded independently, so a category present in only one
# of them would give the two frames different columns. Align the test frame to
# the training columns (same set, same order); dummies missing from test are
# filled with 0 and dummies unknown to train are dropped.
test_values_subset = test_values_subset.reindex(columns=new_train.columns, fill_value=0)

In [7]:
# Preview the first rows of the one-hot encoded test features.
test_values_subset.head()

Unnamed: 0_level_0,geo_level_1_id,count_floors_pre_eq,age,area_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,3,20,7,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
99355,6,2,25,13,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
890251,22,2,5,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
745817,26,1,0,19,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
421793,17,3,15,8,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
# Target-encode geo_level_1_id on the reduced feature set.
# The label is attached to a temporary frame via .assign(), and new_train is
# rebuilt from the transformer's return value, so the encoded column reaches
# new_train explicitly rather than through a side effect of transform().
targetc = KFoldTargetEncoderTrain('geo_level_1_id','damage_grade',n_fold=5)
new_train1 = targetc.fit_transform(
    new_train.assign(damage_grade=train_labels['damage_grade']))
# Keep the encoded feature, drop the label that was only needed to compute it.
new_train = new_train1.drop(['damage_grade'], axis=1)
new_train.head(100) 

Correlation between the new feature, geo_level_1_id_Enc and, damage_grade is 0.469760676528422.


Unnamed: 0_level_0,geo_level_1_id,count_floors_pre_eq,age,area_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w,geo_level_1_id_Enc
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,2,30,6,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2.162009
28830,8,2,10,8,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2.482426
94947,21,2,10,5,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2.563855
590882,22,2,10,6,0,1,0,0,1,1,...,0,0,0,0,0,0,0,1,0,2.001401
201944,11,3,30,8,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2.339278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551423,7,2,15,8,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2.296679
511997,8,1,35,7,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2.486645
266161,10,1,10,8,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2.344031
643474,21,3,0,5,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2.564824


# Oversampling

In [9]:

from imblearn.over_sampling import SMOTE
# Oversample the minority class only.
# The strategy is passed by keyword and the resampler is seeded so repeated
# runs generate the same synthetic rows; fit_resample is the current name of
# the method formerly exposed as fit_sample.
smote = SMOTE(sampling_strategy='minority', random_state=42)
train_values_sm, train_labels_sm = smote.fit_resample(new_train, train_labels.values.ravel())
print(train_values_sm.shape,train_labels_sm.shape)

Using TensorFlow backend.


(383736, 41) (383736,)


# LogisticRegression Model

In [1]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.linear_model import LogisticRegression
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV


# Single-step pipeline; the step name 'classifier' is the prefix used by the
# parameter grid keys below.
# NOTE(review): StandardScaler is imported above but is not part of this
# pipeline, so the features reach the classifier unscaled.
pipe = Pipeline(steps=[('classifier', LogisticRegression())])

# Grid: 20 log-spaced values of C between 1e-4 and 1e4, crossed with the two
# multi-class schemes; penalty, solver and iteration cap are held fixed.
param_grid = [
    {
        'classifier__penalty': ['l2'],
        'classifier__C': np.logspace(-4, 4, num=20),
        'classifier__solver': ['lbfgs'],
        'classifier__multi_class': ['multinomial', 'ovr'],
        'classifier__max_iter': [10000],
    }
]

# 5-fold cross-validated search, run in a single process.
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, verbose=True, n_jobs=1)

# Run the search on the oversampled training data; best_clf holds whatever
# fit() returns.
best_clf = clf.fit(train_values_sm, train_labels_sm)

NameError: name 'np' is not defined

In [None]:

# Fit the grid search on the oversampled training data.
# NOTE(review): the preceding cell already calls clf.fit on the same inputs, so
# running both cells repeats the whole search - confirm this refit is wanted.
clf = clf.fit(train_values_sm,train_labels_sm)

In [None]:
# Print the score of the fitted grid search on the data it was trained on,
# together with the multi-class scheme the search selected. The scheme is read
# from best_params_, since no variable named `multi_class` exists in the notebook.
print("training score : %.3f (%s)" % (clf.score(train_values_sm, train_labels_sm),
                                      clf.best_params_['classifier__multi_class']))

In [None]:
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    """Apply a target encoding computed on a training frame to another frame.

    Each category of ``colNames`` is mapped to the mean of ``encodedName``
    over the training rows of that category.

    Parameters
    ----------
    train : DataFrame
        Training frame that already contains both ``colNames`` and the
        encoded column ``encodedName``.
    colNames : str
        Name of the categorical column to encode.
    encodedName : str
        Name of the encoded column, read from ``train`` and written to the
        transformed frame.
    """

    def __init__(self,train,colNames,encodedName):

        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        """No-op: the mapping is derived from ``self.train`` in ``transform``."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the ``encodedName`` column added.

        Categories of ``X`` that do not occur in the training frame receive
        the overall mean of the training encoding, so the new column never
        contains raw category ids. The frame passed in is not modified.
        """
        # Mean encoded value per category, indexed by category.
        encoding = self.train.groupby(self.colNames)[self.encodedName].mean()
        # Fallback for categories that are absent from the training frame.
        fallback = self.train[self.encodedName].mean()

        X = X.copy()
        X[self.encodedName] = X[self.colNames].map(encoding).fillna(fallback)
        return X

In [None]:
# Encode geo_level_1_id in the test features from the values stored in the
# geo_level_1_id_Enc column of the training frame.
test_targetc = KFoldTargetEncoderTest(
    train=new_train,
    colNames='geo_level_1_id',
    encodedName='geo_level_1_id_Enc',
)
new_test = test_targetc.fit_transform(test_values_subset)

In [None]:
# Predict labels for the encoded test features with the fitted grid search.
predictions = clf.predict(new_test)

In [None]:
# Load the submission template; its index and columns are reused below to lay
# out the prediction files.
# NOTE(review): absolute, user-specific path - will not resolve on another machine.
submission_format = pd.read_csv('/Users/ShaimaShoukat/Desktop/BDProject/submission_format.csv', index_col='building_id')

In [None]:
# Lay the grid-search predictions out exactly like the submission template.
my_submission = pd.DataFrame(
    predictions,
    index=submission_format.index,
    columns=submission_format.columns,
)

In [None]:
# Write the submission frame to submission10.csv in the current working directory.
my_submission.to_csv('submission10.csv')

In [None]:
  
# Build a 7-nearest-neighbour classifier, then fit it on the oversampled
# training data.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
knn = knn.fit(train_values_sm, train_labels_sm)
  


In [None]:
# Score of the KNN model on the oversampled data it was trained on
# (a training-set score, not a held-out test score).
accuracy = knn.score(train_values_sm, train_labels_sm) 
print(accuracy)

In [None]:
# Predict labels for the encoded test features with the KNN model
# (no confusion matrix is built here).
knn_predictions = knn.predict(new_test)  

In [None]:
# Lay the KNN predictions out exactly like the submission template.
my_submission = pd.DataFrame(
    knn_predictions,
    index=submission_format.index,
    columns=submission_format.columns,
)

In [None]:
# Write the KNN submission frame to submission11.csv in the current working directory.
my_submission.to_csv('submission11.csv')

In [None]:

# Grid search over the logistic-regression regularisation strength (C) and
# penalty type.
# NOTE(review): `linear_model`, `perform_model`, `train_data_final`, `y_train`,
# `cv_data_final`, `y_cv` and `labels` are not defined or imported anywhere in
# the visible notebook - this cell presumably came from another project;
# confirm where they should come from before running it.
parameters = {'C':[0.01, 0.1, 1,5, 10], 'penalty':['l2','l1']}
# The estimator being tuned is created with class_weight='balanced'.
log_reg = linear_model.LogisticRegression(class_weight='balanced')
# 3-fold cross-validated search over `parameters`, with n_jobs=-1.
log_reg_grid = GridSearchCV(log_reg, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
# perform_model is an external helper; presumably it fits and evaluates the
# search and returns (results, model) - verify against its definition.
log_reg_grid_results,lr_model =  perform_model(log_reg_grid, train_data_final, y_train, cv_data_final, y_cv, class_labels=labels)

In [None]:
# References
# https://medium.com/@pouryaayria/k-fold-target-encoding-dfe9a594874b
# https://github.com/pourya-ir/Medium/blob/master/K-fold-target-enc/K-fold-Target-Encoding.ipynb
# http://contrib.scikit-learn.org/categorical-encoding/targetencoder.html
# https://docs.featuretools.com/api_reference.html#feature-encoding
# https://github.com/FeatureLabs/categorical_encoding/blob/master/guides/notebooks/categorical-encoding-DEMO.ipynb
# https://mlwhiz.com/blog/2019/05/19/feature_extraction/