In [6]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score 
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE


In [7]:
# Load the labelled training set and the test set from the working directory.
df, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

- Observation from the dataset: all rows with target value 1 are at the start of the data,
so we will shuffle the data.

In [8]:
# Shuffle the rows so that the m13 == 1 examples are no longer grouped together.
# A fixed seed keeps the shuffle -- and therefore every downstream split and
# score -- reproducible across kernel restarts.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

In [None]:
# Row positions of each target class after shuffling.
target = df['m13']
idx_1 = np.where(target == 1)
idx_0 = np.where(target == 0)

#Checking if shuffling was done successfully, first line of code gives indexes of rows containing value =1 in column m13

In [9]:
# Remember the train/test sizes so the combined frame can be split back later.
ntrain, ntest = len(df), len(df_test)
# Training labels, captured before the target column is removed below.
y_train = df['m13'].values
# Stack train and test so preprocessing is applied identically to both,
# then remove the target column from the combined frame.
all_data = (
    pd.concat([df, df_test], sort=False)
    .reset_index(drop=True)
    .drop(columns=['m13'])
)

In [None]:
# Class balance of the target column.
df.m13.value_counts()

In [None]:
# Missing values per column in the test frame.
df_test.isna().sum()


In [None]:
# Missing values per column in the training frame.
df.isna().sum()


In [None]:
# Summary of the training frame's columns (dtypes and non-null counts).
df.info()


In [None]:
# Distinct spellings of first_payment_date across train and test.
all_data.loc[:, 'first_payment_date'].unique()


We will treat it as a categorical variable. Also notice that the date format differs between the test data and the training data, so we need to make it consistent first; otherwise it will create unnecessary dummy variables.

In [None]:
# Rewrite the "Mon-YY" spellings as "MM/YYYY" so that each month has exactly
# one spelling before one-hot encoding.
month_spellings = (
    ("Apr-12", "04/2012"),
    ("Mar-12", "03/2012"),
    ("May-12", "05/2012"),
    ("Feb-12", "02/2012"),
)
for old_spelling, new_spelling in month_spellings:
    all_data.replace(old_spelling, new_spelling, inplace = True)

In [None]:
# Frequency of each origination_date spelling across train and test.
all_data.loc[:, 'origination_date'].value_counts()

As with first_payment_date, the same date appears in different formats here, so we will convert them again.

In [None]:
# Rewrite the ISO "YYYY-MM-DD" spellings as "DD/MM/YY" so that each origination
# date has exactly one spelling.
# The March pair previously carried a trailing space in both strings
# ("2012-03-01 " -> "01/03/12 "), unlike the other two pairs: the key could not
# match a clean "2012-03-01", and the value would have created a separate
# "01/03/12 " category. Both strings are now written without the space.
iso_to_short = (
    ("2012-02-01", "01/02/12"),
    ("2012-01-01", "01/01/12"),
    ("2012-03-01", "01/03/12"),
)
for iso_date, short_date in iso_to_short:
    all_data.replace(iso_date, short_date, inplace = True)

In [None]:
# Distinct loan_purpose categories.
all_data.loc[:, 'loan_purpose'].unique()

In [None]:
# Distinct source categories.
all_data.loc[:, 'source'].unique()

In [None]:
# Frequency of each financial_institution category.
all_data.loc[:, 'financial_institution'].value_counts()

In [None]:
# Pairwise correlation of the numeric features. Non-numeric columns are excluded
# explicitly, because recent pandas versions raise on them instead of silently
# skipping them.
corr = all_data.select_dtypes(include="number").corr()
# Keep only the strongly correlated entries; everything else becomes NaN.
pos_filtered_corr = corr[corr >= 0.5]   # from this we easily know highly positively correlated values
neg_filtered_corr = corr[corr <= -0.5]  # no attribute is that highly negatively correlated

# The notebook never imports seaborn, so the heatmaps are drawn with matplotlib
# (already imported as plt). Each matrix gets its own axes; previously the second
# heatmap was drawn on top of the first.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))
panels = (
    (corr, "Correlation of all numeric features"),
    (pos_filtered_corr, "Only correlations >= 0.5"),
)
for ax, (matrix, title) in zip(axes, panels):
    image = ax.imshow(matrix.values, cmap="viridis", vmin=-1, vmax=1)
    tick_positions = np.arange(len(matrix.columns))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(matrix.columns.values, rotation=90)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(matrix.columns.values)
    ax.set_title(title)
    fig.colorbar(image, ax=ax, fraction=0.046)
fig.tight_layout()
plt.show()

From the correlation matrix, we found that the following features are highly correlated, but removing them affects our final model negatively, so we will not remove them:
1. Borrower credit score and number of borrowers - they are 99% correlated, so I will remove one of them for sure
2. m8 and m9
3. m9 and m10
4. m9, m10 and m11
5. m10, m11 and m12
Also, from a general observation, origination_date is a redundant attribute if we are using first_payment_date, so I will remove it as well.

In [None]:
# Alternative, correlation-based drop list (kept for reference, not used):
#all_data = all_data.drop(["co-borrower_credit_score", "m9", "m10", "m11", "m12"], axis=1) # based on correlation
redundant_columns = ["origination_date", "co-borrower_credit_score"]
all_data = all_data.drop(columns=redundant_columns)

In [None]:
# Cast every categorical feature to str so that get_dummies one-hot encodes it.
# 'origination_date' is dropped from all_data in an earlier cell, so casting it
# unconditionally raised KeyError. Only the columns still present are cast; the
# name stays in the list in case that drop is ever removed.
categorical_columns = [
    'loan_purpose',
    'first_payment_date',
    'financial_institution',
    'origination_date',
    'source',
]
for column in categorical_columns:
    if column in all_data.columns:
        all_data[column] = all_data[column].astype(str)
# One-hot encode all string columns; numeric columns pass through unchanged.
Finalall_data = pd.get_dummies(all_data)

Converting all categorical variables into dummy variables.

In [None]:
#Finalall_data = Finalall_data.drop(["source_X", "financial_institution_OTHER", "loan_purpose_A23", "first_payment_date_02/2012", "origination_date_01/01/12"], axis=1) 

We remove one dummy variable for each categorical variable, for those algorithms which are affected by the dummy variable trap (the drop in the cell above is currently commented out).

In [None]:
# The first ntrain rows of the encoded frame are the training rows; the rest are test rows.
train = Finalall_data.iloc[:ntrain]
test = Finalall_data.iloc[ntrain:]
# Feature matrix for training, without the loan identifier.
X = train.drop(columns=["loan_id"])

Removed loan id from train data, since it is not useful for training purposes

In [None]:
from sklearn.model_selection import train_test_split
# `y` was never defined anywhere in the notebook, so this cell raised NameError.
# The labels are taken from df, whose rows are in the same order as the first
# ntrain rows of all_data (df comes first in the concat). Reading them from df
# rather than from y_train keeps this cell re-runnable, since y_train is
# overwritten by the split below.
y = df['m13'].values
# Hold out 25% of the training rows for a local evaluation split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

Haven't used feature scaler for final model

In [None]:
from sklearn.utils.class_weight import compute_class_weight
# 'balanced' class weights for the classes present in y_train.
# The arguments are passed by keyword: current scikit-learn releases accept
# `classes` and `y` as keyword-only and reject the positional form.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)

Above piece of code gives us the class weights for all possible classes in dataset using 'balanced' technique, same task is performed by Logistic Regression class_weight parameter when its value is chosen to be 'balanced', which is helpful when we are dealing with class imbalance problems

In [None]:
#svm_model = SVC(kernel='rbf', random_state=0, gamma='auto')
#svm_model.fit(X_train, y_train)

1. this is a basic implementation of svm, in one case I took all dummy variables and in another case i took care of dummy variable trap, and the results are 98% same in both cases.
2. Apart from that SVM is taking long times for fitting and giving very low f1 value for class '1' (0.07), average macro is fair though(0.53), refer classification report for more details

In [None]:
#Classifier = LogisticRegression(random_state = 0, solver='sag', n_jobs=-1, class_weight='balanced', fit_intercept=False)

Observations -
1. Without fit_intercept=False and balanced class weights, LR performs well with avg. f1 value = 0.65, but with them taken into consideration the f1 value deteriorates to 0.38, which is weird
2. For more details on above point, refer snapshots of classification report
3. Also overfitting is observed in this SVM

In [None]:
#Classifier = DecisionTreeClassifier(random_state=0, class_weight='balanced')
#Classifier.fit(X_train, y_train)
#scv = StratifiedKFold(n_splits=5)
#crossvalscore = cross_val_score(estimator=Classifier, X=X_train, y=y_train, cv=scv,  scoring = 'f1_macro')
#crossvalscore.mean()
#crossvalscore.std()

1. In its most basic form, the decision tree performs well: it has an avg. f1 score of around 0.588 and an acceptable f1 value for class '1' = 0.18. It is fast as well.
2. When we remove one of the dummy variables for each categorical variable, the avg. f1 score reduces to 0.53.
3. See the classification report for more details
4. DT overfits on the training set with macro f1 value = 1
5. Adding the class weight parameter doesn't change the above observations much

In [None]:
#NB = GaussianNB()
#NB.fit(X_train, y_train)

1. Till now it's been best when training but, during kfold validation mean of f1 score was 0.4 and std = 0.209
2. No issues of overfitting
3. avg f1 value = 0.62 with acceptable value for f1 of class '1', refer classification report for more
4. After removing dummy variable same effects are observed as for DT

In [None]:
#RFC = RandomForestClassifier(random_state=0, class_weight='balanced', n_jobs=-1)
#RFC.fit(X_train, y_train)

- Performs pretty much same as DT, maybe a little better, overfitting training data
- Refer classification results for more details
- I have observed above in all models that kfold gave results similar to shown in classification report

Now that I have tried pretty much everything that I have read about for imbalanced classes, it's time to try one last thing, sampling the data:
1. Oversampling using SMOTE
2. Random under-sampling with replacement
- After several experiments I deduced that oversampling using SMOTE gives overall better results than under-sampling when used alongside RFC, and by overall better results I mean a better f1 score for class '1', a better overall f1 score on X_test, better accuracy, and overfitting of the training data as well.
- Also, the cross val score mean value turns out to be 0.99 (again signifying overfitting) when X=X_res and y=y_res, but it is 0.72 for X_train and y_train
Final conclusion: the GBC classifier along with sampling is not overfitting that much, so tuning it should give better results for future test predictions as well, so I will proceed further with GBC.
Note: All the above results in this markdown hold only when no feature was dropped from the original dataset

In [None]:
#oversampler = SMOTE(sampling_strategy=0.05, random_state=0, k_neighbors=10)
#X_res, y_res = oversampler.fit_resample(X_train, y_train)

So here we have done the sampling, we may want to shuffle the obtained dataset X_res and y_res, though minute changes in results are obtained

In [None]:
#X_res = pd.DataFrame(X_res)
#y_res = pd.DataFrame(y_res)
#df_all_rows = pd.concat([X_res, y_res], axis=1)
#df_all_rows = df_all_rows.sample(frac=1).reset_index(drop=True)
#X_res = df_all_rows.iloc[:,:-1].values
#y_res = df_all_rows[0]
#y_res = y_res.T.reset_index(drop=True).T
#y_res = df_all_rows.iloc[:,-1].values

In [None]:
#GBC = GradientBoostingClassifier(random_state=0)
#GBC.fit(X_res, y_res)

In [None]:
#scv = StratifiedKFold(n_splits=5)
#crossvalscore = cross_val_score(estimator=GBC, X=X_res, y=y_res, cv=scv,  scoring = 'f1_macro')
#crossvalscore.mean()
#crossvalscore.std()

This is the cross-validation step and we have discussed results of it in one of the above markdowns when X and y are changed in this. Note that we have used stratifiedK fold as one of the step for tackling imbalanced classes.

In [None]:
# Hyperparameter search space for the decision tree: the same numeric ranges
# are searched once per split criterion.
leaf_sizes = [2, 5, 10, 15, 20]
split_sizes = [2, 3, 4, 5, 6, 7, 10, 100]
tree_depths = [1, 4, 8, 16, 32]
params_grid = [
    {
        'criterion': [criterion],
        'min_samples_leaf': list(leaf_sizes),
        'min_samples_split': list(split_sizes),
        'max_depth': list(tree_depths),
    }
    for criterion in ('gini', 'entropy')
]
#params_grid = [{'min_samples_leaf': [0.1, 0.5, 5], 'min_samples_split': [2, 3, 4, 5, 6, 7, 10]
#                    , 'max_depth': [1, 4, 8, 16 ,32]
#                    , 'criterion': ['gini']}
#                    , {'criterion': ['entropy'], 'min_samples_leaf': [0.1, 0.5, 5]
#                    , 'min_samples_split' : [0.1, 2, 3 , 4, 5, 6, 8, 10]
#                    , 'max_depth': [1, 4, 8, 16 ,32]}]

Using grid search for hyperparameter tuning in decision tree

In [None]:
# Grid search over the decision-tree hyperparameters, scored by macro F1.
#
# Changes from the previous version of this cell:
# - `scv` is created here; its only other definition is in a commented-out cell.
# - `presort` is no longer passed: DecisionTreeClassifier does not accept it in
#   current scikit-learn releases.
# - The search is fitted on X_train / y_train. X_res / y_res come from the SMOTE
#   cell, which is commented out, so they do not exist on a fresh kernel.
#   NOTE(review): switch back to the resampled data if that cell is re-enabled.
# - verbose is lowered from 50 so the output is not flooded with per-fit logs.
scv = StratifiedKFold(n_splits=5)
gridsearch = GridSearchCV(
    DecisionTreeClassifier(random_state=0, class_weight='balanced'),
    params_grid,
    cv=scv,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
)
gridsearch.fit(X_train, y_train)
# Show both results; only the last expression of a cell is displayed.
gridsearch.best_params_, gridsearch.best_score_


In [None]:
# Predicting on splitted test set and training set to see if overfitting is there or not
# Score the tuned tree on both the training split and the held-out split.
best_tree = gridsearch.best_estimator_
y_pred_train = best_tree.predict(X_train)
y_pred = best_tree.predict(X_test)

In [None]:
# Confusion matrix on the held-out split.
cm = metrics.confusion_matrix(y_test, y_pred)

# Macro-averaged F1 on the held-out split and on the training split.
MacroF1_test, MacroF1_train = (
    f1_score(truth, predicted, average='macro')
    for truth, predicted in ((y_test, y_pred), (y_train, y_pred_train))
)

# Per-class precision / recall / F1 on the held-out split.
Report = metrics.classification_report(y_test, y_pred, labels=[0,1])

In [None]:
#Prediciting on test set
# Predict on the real test set with the tuned tree.
# - The fitted search object is named `gridsearch`; `grid_search` was a typo and
#   raised NameError.
# - The model was trained on X, which has no loan_id column, so loan_id is
#   removed from the test features here. errors='ignore' keeps the cell
#   re-runnable if loan_id has already been dropped from `test`.
test_features = test.drop(columns=['loan_id'], errors='ignore')
y_test_preds = gridsearch.best_estimator_.predict(test_features)

In [None]:
#Preparing to write a CSV file
# Build the submission file: one row per test loan with its predicted label.
#
# Changes from the previous version of this cell:
# - Predictions are placed next to loan_id column-wise. Before, the feature
#   frame and the predictions were concatenated row-wise and loan_id was dropped.
# - Rows are sorted by 'loan_id'; there is no 'ID' column, so the old sort
#   raised KeyError.
# - `test` is no longer mutated, so the cell can be re-run.
# - The file is written to the working directory instead of an absolute path
#   that only exists on one machine.
# NOTE(review): the column names 'loan_id' / 'm13' are assumed -- confirm
# against the required submission format.
index = pd.DataFrame(test['loan_id'])
# Give the predictions the same index as the ids so they line up row by row.
submission_format = pd.DataFrame(y_test_preds, columns=['m13'], index=index.index)
FinalSubmission = pd.concat([index, submission_format], axis=1)
FinalSubmission = FinalSubmission.sort_values(by='loan_id')
FinalSubmission.to_csv("FinalSubmission.csv", header=True, index=False)

0.21 test score with a naive implementation of DT, 0.30 with grid search on the decision tree. I still need to find the reason for such a difference between the test and cross-validation scores; sampling is also still left to try.
See what changes you can make to the origination date and first payment date variables.

Okay, I have tried three things -
1. first, the normal original DT parametrized model with 30....
2. DT2_parametrized got the same result, but with the origination date and co-borrower parameter removed
3. DT_parametrized2 (removed the origination date and co-borrower parameter, and has a different param_grid) and the results are not at all different