In [None]:
# Good example for recall function for imbalanced data.
#https://www.kaggle.com/gargmanish/how-to-handle-imbalance-data-study-in-detail
# For data download: https://www.kaggle.com/mlg-ulb/creditcardfraud/downloads/creditcard.csv/3
# Use the links below to learn techniques for imbalanced-data classification:
#https://www.kdnuggets.com/2017/06/7-techniques-handle-imbalanced-data.html
# https://www.kaggle.com/themlguy/undersample-and-oversample-approach-explored


#Tuning the model: https://towardsdatascience.com/fine-tuning-a-classifier-in-scikit-learn-66e048c21e65
#For ROC Curve: https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/ 

#In this case, other alternative evaluation metrics can be applied such as:

#----Precision (positive predictive value): how many selected instances are relevant. Not the same as specificity, which is the true-negative rate.
#----Recall/Sensitivity: how many relevant instances are selected.
#----F1 score: harmonic mean of precision and recall.
#----MCC: correlation coefficient between the observed and predicted binary classifications.
#----AUC: relation between true-positive rate and false positive rate.

import pandas as pd

# Read the sample file and discard four columns in a single chained expression.
# NOTE(review): judging by their names these look like identifier/reference
# columns -- confirm against the data dictionary.
df = (
    pd.read_csv("AP_Sample_data1.csv")
    .drop(columns=['unique_id_popln', 'pcn_no', 'cust_xref_id', 'cust_xref_id_1'])
)

df.head()

# Converting object columns into numeric:
from sklearn.preprocessing import LabelEncoder
for check in df.columns:
    if df[check].dtype == 'object':
        # Assign the filled column back to the frame. Calling
        # fillna(..., inplace=True) on `df[check]` is a chained operation on a
        # temporary selection and does not update `df` under pandas
        # Copy-on-Write, which would leave NaN in the column for the encoder.
        df[check] = df[check].fillna("?")
        # Each distinct value (including the "?" placeholder) becomes an
        # integer code; fit_transform learns the classes from the column itself.
        label_encoder = LabelEncoder()
        df[check] = label_encoder.fit_transform(df[check])

# use this to check null or nan values in dataset
# df.isna().sum()
# Remaining (non-object) missing values are replaced with 0.
df = df.fillna(0)
df.head()

#Normalize dataset:
from sklearn import preprocessing

# MinMaxScaler (default settings) is fitted on, and applied to, every column of
# `df`, the 'auto_sltn_opp_flag' column included.
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df)

# fit_transform returns a bare array, so the original column names are
# re-attached while building the frame (the row index is a fresh 0..n-1 range).
df_normalized = pd.DataFrame(np_scaled, columns=df.columns)
df_normalized.head()

In [None]:
#-------------- Starting undersampling process:--------------------------->>>
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.metrics import confusion_matrix,recall_score,precision_recall_curve,auc,roc_curve,roc_auc_score,classification_report

#------------------ Process for Underpopulation No Flag Data --------------------------->>>>>>>>>> 
def generatePerformanceReport(clf, x_train, y_train,x_test, y_test, bool_):
    """Optionally fit ``clf``, then print its confusion-matrix counts and a classification report.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    x_train : training features; used for fitting and for the printed length.
    y_train : training labels; must expose ``.values`` (it is flattened with
        ``.values.ravel()`` before fitting).
    x_test : features that are predicted and scored.
    y_test : true labels for ``x_test``.
    bool_ : when equal to ``True`` the classifier is fitted first; otherwise
        ``clf`` is used as it is.

    Returns
    -------
    None. Everything is written to stdout.

    Notes
    -----
    The confusion matrix is unpacked into exactly four counts, so the call
    only works when the matrix is 2x2 (binary labels).
    """
    # Explicit comparison with True is kept on purpose: only True (or a value
    # equal to it) triggers fitting.
    if bool_ == True:
        flat_labels = y_train.values.ravel()
        clf.fit(x_train, flat_labels)

    predictions = clf.predict(x_test)
    # ravel() of the 2x2 matrix yields the counts in the order TN, FP, FN, TP.
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

    divider = '------------------------------------'
    print(divider)
    print('Lenth of training data:', len(x_train))
    print('Lenth of test data:', len(x_test))
    print(divider)

    count_lines = (
        ('True positives:', tp),
        ('True negatives:', tn),
        ('False positives:', fp),
        ('False negatives:', fn),
    )
    for caption, count in count_lines:
        print(caption, count)

    print('--------Classification report-----------')
    print(classification_report(y_test, predictions))

# Row labels of the two classes, selected by the value of 'auto_sltn_opp_flag'.
normal_indices = df_normalized[df_normalized['auto_sltn_opp_flag']==0].index
ap_auto_indices = df_normalized[df_normalized['auto_sltn_opp_flag']==1].index

#------------- Calling all model and checking underpopulated data in different ratio ---------->>>>>

for i in range(1,4):
    # Keep every flag==1 row and draw i times as many flag==0 rows, without
    # replacement (np.random.choice raises if there are not enough of them).
    normal_sample_data = np.array(np.random.choice(normal_indices, i*len(ap_auto_indices),replace=False))
    undersampled_indices = np.concatenate([ap_auto_indices, normal_sample_data])

    # These are index labels, so select with .loc rather than .iloc.
    undersampled_data = df_normalized.loc[undersampled_indices]

    print('length of undersample data', len(undersampled_data))

    print('% of AP Auto Opportunity in undersampled data ',len(undersampled_data.loc[undersampled_data['auto_sltn_opp_flag']==1])/len(undersampled_data))

    # Get feature and label data:
    feature_data = undersampled_data.loc[:,undersampled_data.columns!='auto_sltn_opp_flag']
    label_data = undersampled_data.loc[:,undersampled_data.columns=='auto_sltn_opp_flag']

    x_train, x_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.30)

    # Rows that were NOT drawn into the undersample form a second test set.
    # Features and labels are sliced from `testdf` itself; slicing them from
    # `df_normalized` would score the model on the whole dataset, including
    # the rows it was trained on. This does not depend on the model, so it is
    # computed once per ratio instead of once per classifier.
    testdf = df_normalized.drop(index=undersampled_data.index)
    testdf_label = testdf.loc[:, testdf.columns=='auto_sltn_opp_flag']
    testdf_features = testdf.loc[:, testdf.columns!='auto_sltn_opp_flag']

    # Defining loop for Models:------------------
    for clf in [LogisticRegression(),SVC(),RandomForestClassifier(n_estimators=100)]:
        print(clf)
        # First report: fit on the undersampled training split, score on its test split.
        generatePerformanceReport(clf, x_train, y_train, x_test, y_test, True)

        # Second report: reuse the fitted model on the rows left out of the undersample.
        if testdf.empty:
            print('No rows left outside the undersample; skipping the second report')
        else:
            generatePerformanceReport(clf,x_train,y_train, testdf_features, testdf_label, False)


In [None]:
#------------- Another process to check the result for unbalanced data:----- 
#oversampled_data------------------------------------>>>>>>>>
# Hold out 30% of the ORIGINAL rows before any oversampling. Oversampling first
# and splitting afterwards puts copies of the same flag==1 row in both the
# training and the test split, which inflates the reported scores.
# `stratify` keeps the flag ratio the same in both parts.
train_df, testdf = train_test_split(
    df_normalized,
    test_size=0.30,
    stratify=df_normalized['auto_sltn_opp_flag'],
)

# Oversample the training part only: all flag==0 rows plus 11 copies of every
# flag==1 row. DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True builds the same kind of frame.
normal_train_data = train_df.loc[train_df['auto_sltn_opp_flag']==0]
ap_auto_data = train_df.loc[train_df['auto_sltn_opp_flag']==1]
oversampled_data = pd.concat([normal_train_data] + [ap_auto_data]*11, ignore_index=True)

print('length of oversampled_data data ', len(oversampled_data))
print('% of AP Auto opportunity in oversampled_data data ',len(oversampled_data.loc[oversampled_data['auto_sltn_opp_flag']==1])/len(oversampled_data))


#get feature and label data
feature_data=oversampled_data.loc[:,oversampled_data.columns!='auto_sltn_opp_flag']
label_data=oversampled_data.loc[:,oversampled_data.columns=='auto_sltn_opp_flag']
X_train, y_train = feature_data, label_data

# The hold-out keeps the original class balance and shares no rows with the
# training data.
X_test = testdf.loc[:,testdf.columns!='auto_sltn_opp_flag']
y_test = testdf.loc[:,testdf.columns=='auto_sltn_opp_flag']

for clf in [LogisticRegression(),RandomForestClassifier(n_estimators=100)]:
    print(clf)
    # Fit on the oversampled training rows, score on the untouched hold-out rows.
    generatePerformanceReport(clf,X_train,y_train,X_test,y_test,True)
    
# RandomForestClassifier works well with the oversampling method on the imbalanced AP Automation data.