In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier


from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score 
from sklearn.metrics import f1_score
from imblearn.metrics import geometric_mean_score
from imblearn.metrics import sensitivity_score
from imblearn.metrics import specificity_score
from sklearn.metrics import roc_auc_score

import xgboost as xgb

In [None]:
# ---------------------------------------------------------------------------
# Load the data, impute missing values, discretise every feature and drop
# features that are almost perfectly rank-correlated with an earlier feature.
# ---------------------------------------------------------------------------
from sklearn.impute import KNNImputer
from sklearn.preprocessing import KBinsDiscretizer

N_FEATURES = 139        # columns [0, N_FEATURES) are features, column N_FEATURES is the target
CORR_THRESHOLD = 0.95   # absolute Spearman correlation above which a feature is dropped

df = pd.read_csv("../input/promisee/promisin_couples.csv")
# NOTE(review): an earlier, disabled step replaced -1 with NaN before imputing.
# Confirm whether -1 encodes a missing value in this dataset.

X = df.iloc[:, 0:N_FEATURES].values
y = df.iloc[:, N_FEATURES].values

# Impute missing values from the nearest neighbours (KNNImputer defaults).
imputer = KNNImputer()
X = imputer.fit_transform(X)

# Make all the values discrete: 5 equal-width bins per feature, encoded 0..4.
est = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
X = est.fit_transform(X)

# Filter method: absolute Spearman cross-correlation between every feature pair.
corr_matrix = pd.DataFrame(X).corr(method="spearman").abs()

# Draw the heatmap and save it next to the notebook.
sns.set(font_scale=1.0)
f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr_matrix, cmap="YlGnBu", square=True, ax=ax)
f.tight_layout()
f.savefig("correlation_matrix.png", dpi=1080)

# Keep only the strict upper triangle so every pair is inspected once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is dropped when it correlates above the threshold with any earlier column.
to_drop = [column for column in upper.columns if (upper[column] > CORR_THRESHOLD).any()]

# Drop the redundant features; X becomes a DataFrame with integer column labels.
X = pd.DataFrame(X).drop(columns=to_drop)

###############################################################################
#                  8. Custom pipeline object to use with RFECV                #
###############################################################################
# Select Features using RFECV
class PipelineRFE(Pipeline):
    """Pipeline that republishes its final step's ``feature_importances_``.

    After ``fit`` the attribute ``feature_importances_`` of the last step is
    copied onto the pipeline object itself, so that the pipeline can be handed
    to RFE / RFECV as the estimator.

    Source: https://ramhiser.com/post/2018-03-25-feature-selection-with-scikit-learn-pipeline/
    """

    def fit(self, X, y=None, **fit_params):
        """Fit every step, then copy the last step's importances; returns ``self``."""
        super().fit(X, y, **fit_params)
        # Each entry of `steps` is a (name, estimator) pair; the last one is the model.
        _step_name, final_estimator = self.steps[-1]
        self.feature_importances_ = final_estimator.feature_importances_
        return self

# ---------------------------------------------------------------------------
# Wrapper feature selection (RFECV around a scaled random forest), followed by
# the hold-out split used for the final model.
# ---------------------------------------------------------------------------
scaler = StandardScaler()
estimator = RandomForestClassifier(n_estimators=200,
                                   class_weight='balanced',
                                   # 'auto' was removed in scikit-learn 1.3; for
                                   # classifiers it was an alias of 'sqrt'.
                                   max_features='sqrt',
                                   max_depth=6,
                                   min_samples_split=0.005,
                                   min_samples_leaf=0.005,
                                   criterion='entropy',
                                   n_jobs=-1,
                                   # Seeded so the selected feature set is reproducible.
                                   random_state=42)
steps = [("scaler", scaler), ("classifier", estimator)]
pipe = PipelineRFE(steps=steps)

# Initialize RFECV object: remove one feature per iteration, keep at least 10,
# and score every candidate subset with 10-fold cross-validated ROC AUC.
feature_selector = RFECV(pipe, cv=10, step=1, min_features_to_select=10,
                         scoring="roc_auc", verbose=1)

# Split BEFORE selecting features. Fitting RFECV on all rows lets the test
# labels influence which features are kept and inflates the reported scores.
# test_size and random_state are unchanged, so the same rows end up in each part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=1000)

# Fit RFECV on the training rows only, then apply the learned mask everywhere.
X_train = feature_selector.fit_transform(X_train, y_train)
X_test = feature_selector.transform(X_test)

# Keep X reduced to the selected features, as the rest of the notebook expects.
X = feature_selector.transform(X)


In [None]:

# ---------------------------------------------------------------------------
# Tune an XGBoost classifier with an exhaustive grid search, then report
# hold-out metrics for the best pipeline.
# ---------------------------------------------------------------------------
scaler = StandardScaler()
classifier = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

# Define steps in pipeline
steps = [("scaler", scaler), ("classifier", classifier)]

# Initialize Pipeline object
pipeline = Pipeline(steps=steps)

# Define parameter grid (4 * 3 * 5 * 4 * 4 * 5 = 4800 candidates, each fitted on 10 folds).
param_grid = {
    "classifier__max_depth": [3, 5, 7, 9],
    "classifier__min_child_weight": [1, 3, 5],
    "classifier__gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "classifier__subsample": [0.6, 0.7, 0.8, 0.9],
    "classifier__colsample_bytree": [0.6, 0.7, 0.8, 0.9],
    "classifier__reg_alpha": [0, 0.001, 0.005, 0.01, 0.05],
}

# Initialize GridSearch object
gscv = GridSearchCV(pipeline, param_grid, cv=10, n_jobs=-1, verbose=1, scoring="roc_auc")

# Fit gscv
gscv.fit(X_train, y_train)

# Best pipeline, refitted on the whole training set by GridSearchCV.
clf = gscv.best_estimator_

# Hard labels for the threshold-based metrics.
y_pred = clf.predict(X_test)
# ROC AUC needs a continuous score; hard labels collapse the curve to a single
# operating point. Column 1 of predict_proba is the probability of clf.classes_[1].
y_score = clf.predict_proba(X_test)[:, 1]

# Compute each metric once so the derived error rates stay consistent with the
# printed sensitivity and specificity.
sensitivity = sensitivity_score(y_test, y_pred, average='weighted')
specificity = specificity_score(y_test, y_pred, average='weighted')

print('Accuracy :       ', accuracy_score(y_test, y_pred))
print('ROC :            ', roc_auc_score(y_test, y_score))
print('F-Measure :      ', f1_score(y_test, y_pred, average='weighted'))
print('Geometric Mean : ', geometric_mean_score(y_test, y_pred, average='weighted'))
print('Sensitivity :    ', sensitivity)
print('Specificity :    ', specificity)
# Type I error is the false-positive rate (1 - specificity) and Type II error
# is the false-negative rate (1 - sensitivity).
print('Type I Error :   ', (1 - specificity))
print('Type II Error :  ', (1 - sensitivity))

In [None]:
# Display the hyper-parameter combination selected by the grid search (`gscv`).
gscv.best_params_
