In [1]:
import pandas as pd
import numpy as np 
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn import preprocessing

#import feature selection modules
from sklearn.feature_selection import mutual_info_classif,RFE,RFECV

#import classification modules
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#import classification evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import f1_score
from sklearn.metrics import auc

In [3]:
def load_data():
    dmfraud = pd.read_csv('creditriskmodeling.csv')
    return dmfraud
dmfraud = load_data()

In [8]:
def cleaningup(dmfraud):
    
    dmfraud.dropna(how='any', axis = 0, inplace = True)
    print (dmfraud.shape)
    
    le = preprocessing.LabelEncoder()
    le.fit(dmfraud['Default'])
    list(le.classes_)
    dmfraud['Default'] = le.transform(dmfraud['Default'])
    
    print("dmfraud is all cleaned up..")
    return dmfraud

In [9]:
def basicanalysis(dmfraud):
    print ('Shape is : \n', dmfraud.shape)
    print ('Data Type is : \n', dmfraud.dtypes)
    print ('Column names are : \n', dmfraud.columns)
    print ('Statistical Analysis : \n', dmfraud.describe()) #ye function hai jbi () brackets lgey hain

In [11]:
def stringcolanalysis(dmfraud):
    stringcols = dmfraud.select_dtypes(exclude = np.number)
    fig = plt.figure(figsize = (8,10))
    for i,col in enumerate(stringcols): 
        fig.add_subplot(4,2,i+1) 
        fig.savefig('Categorical.png')
        dmfraud[col].value_counts().plot(kind = 'barh', color='lightgreen' ,fontsize=10) 
        plt.tight_layout()
        plt.title(col)

In [13]:
def numcolanalysis(dmfraud):
    numcols = dmfraud.select_dtypes(include=np.number)
    for col in numcols:
        fig = plt.figure(figsize = (5,5))
        sb.boxplot(dmfraud[col], color='grey', linewidth=1)
        plt.tight_layout()
        plt.title(col)
        plt.savefig("Numerical.png")

In [14]:
def traintestsplit(dmfraud,split,random):
    #make a copy of the label column and store in y
    y = dmfraud['Default'].copy()
    
    #now delete the original
    X = dmfraud.drop('Default',axis=1)
    
    #manual split
    trainX, testX, trainY, testY= train_test_split(X, y, test_size=split, random_state=random)
    
    return X, trainX, testX, trainY, testY 

In [15]:
def cross_valid(X,y,split,repeat,random):
    kf = RepeatedKFold(n_splits=split, n_repeats=repeat, random_state=random) 
    for train_index, test_index in kf.split(X):
        trainX, testX = X.iloc[train_index], X.iloc[test_index] 
        trainY, testY = y.iloc[train_index], y.iloc[test_index]
    return trainX,trainY,testX,testY

In [16]:
def validationmetrics(model,testX,testY):
    predictions = model.predict(testX)
    print("Prediction Vector: \n", predictions)
    
    #Accuracy
    print("Accuracy: \n", accuracy_score(testY, predictions)*100)
    
    #Precision
    print("Precision of Fraud Happening: \n", precision_score(testY, predictions,pos_label=1,labels=[0,1])*100)
    
    #Recall
    print("Recall of Fraud Happening: \n", recall_score(testY, predictions,pos_label=1,labels=[0,1])*100)
    
    #get FPR (specificity) and TPR (sensitivity)
    fpr , tpr, _ = roc_curve(testY, predictions)
    
    #AUC
    print("AUC of Fraud Happening: \n",auc(fpr, tpr))
    
    #F-Score
    print("F-Score OF Fraud Happening:\n", f1_score(testY, predictions))
    
    #confusion Matrix
    
    print("Confusion Matrix: \n", confusion_matrix(testY, predictions,labels=[0,1]))

In [19]:
def RFfeatureimportance(dmfraud, trainX, testX, trainY, testY, trees, random):
    clf = RandomForestClassifier(n_estimators=trees, random_state=random)
    clf.fit(trainX,trainY)
    validationmetrics(clf,testX,testY)
    print(pd.Series(clf.feature_importances_, index=dmfraud.columns.values).sort_values(ascending=False)*100)

In [20]:
def LogReg(dmfraud, trainX, testX, trainY, testY):
    clf  = LogisticRegression()
    clf.fit(trainX , trainY)
    validationmetrics(clf,testX,testY)

In [21]:
def KNN(dmfraud, trainX, testX, trainY, testY):
    clf = KNeighborsClassifier()
    clf.fit(trainX , trainY)
    validationmetrics(clf,testX,testY)

In [27]:
def MachineLearningwithRFFS():
    #include all selected features in impftrs and last should be the label
    impftrs = []
    dmfraud = load_data()
    #dmfraud = stringcolanalysis(dmfraud)
    #dmfraud = numcolanalysis(dmfraud)
    dmfraud = cleaningup(dmfraud)
    dmfraud = dmfraud[impftrs]
    dmfraud, trainX, testX, trainY, testY = traintestsplit(dmfraud,0.2,91) 
    dmfraud = RFfeatureimportance(dmfraud, trainX, testX, trainY, testY, 5, 91)
    
    print("\n\n Results for Logistic Regression.....")
    LogReg(dmfraud, trainX, testX, trainY, testY)
    
    print("\n\n Results for KNN.....")
    KNN(dmfraud, trainX, testX, trainY, testY)

In [28]:
MachineLearningwithRFFS()

(390, 20)
dmfraud is all cleaned up..
Prediction Vector: 
 [0 1 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 1 0 1 1
 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0]
Accuracy: 
 93.58974358974359
Precision of Fraud Happening: 
 90.0
Recall of Fraud Happening: 
 85.71428571428571
AUC of Fraud Happening: 
 0.9110275689223057
F-Score OF Fraud Happening:
 0.8780487804878048
Confusion Matrix: 
 [[55  2]
 [ 3 18]]
Expense to Revenue Ratio                  38.688557
Working Capital Requirement               13.297776
Net Profit Margin                         12.877282
Liability to Equity                       11.281302
Debt Cash Flow Coverage Ratio              4.986828
Net Debt to Equity Ratio                   3.061095
Fixed Asset to Debt Ratio                  2.568537
Short Term Debt to Sales Ratio             2.501698
Return on Equity                           2.206724
Gross Profit Margin                        1.884359
Return on Inves



In [None]:
def MachineLearningwithRFFS():
    #include all selected features in impftrs and last should be the label
    impftrs = []
    dmfraud = load_data()
    #dmfraud = stringcolanalysis(dmfraud)
    #dmfraud = numcolanalysis(dmfraud)
    dmfraud = cleaningup(dmfraud)
    
    dmfraud, trainX, testX, trainY, testY = traintestsplit(dmfraud,0.2,91) 
    dmfraud = RFfeatureimportance(dmfraud, trainX, testX, trainY, testY, 5, 91)
    
    print("\n\n Results for Logistic Regression.....")
    LogReg(dmfraud, trainX, testX, trainY, testY)
    
    print("\n\n Results for KNN.....")
    KNN(dmfraud, trainX, testX, trainY, testY)