In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import pandas as pd
import random
np.random.seed(10)

In [2]:
#load dataset
df = pd.read_excel('/content/FinScheduleDetails.xlsx')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052 entries, 0 to 1051
Data columns (total 82 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   FinReference          1052 non-null   object        
 1   SchDate               1052 non-null   datetime64[ns]
 2   SchSeq                1052 non-null   int64         
 3   PftOnSchDate          1052 non-null   bool          
 4   CpzOnSchDate          1052 non-null   bool          
 5   RepayOnSchDate        1052 non-null   bool          
 6   RvwOnSchDate          1052 non-null   bool          
 7   DisbOnSchDate         1052 non-null   bool          
 8   DefSchdDate           1052 non-null   datetime64[ns]
 9   DownpaymentOnSchDate  1052 non-null   bool          
 10  BalanceForPftCal      1052 non-null   int64         
 11  BaseRate              422 non-null    object        
 12  SplRate               0 non-null      float64       
 13  MrgRate           

In [4]:
df.dropna(how='all', axis=1, inplace=True)

In [6]:
result = df.select_dtypes(include='number')
print(result)

      SchSeq  BalanceForPftCal  MrgRate  ActRate  CalculatedRate  NoOfDays  \
0          1        1360464000      0.8      0.0           13.45        31   
1          1        1344777300      0.8      0.0           13.45        30   
2          1        1336801800      0.8      0.0           13.45        31   
3          1        1328736900      0.8      0.0           13.45        30   
4          1        1320581600      0.8      0.0           13.45        31   
...      ...               ...      ...      ...             ...       ...   
1047       1         782347400      1.1      0.0           13.75        30   
1048       1         780494100      1.1      0.0           13.75        31   
1049       1         778619600      1.1      0.0           13.75        31   
1050       1         776723600      1.1      0.0           13.75        28   
1051       1         774805900      1.1      0.0           13.75        31   

      DayFactor  ProfitCalc  ProfitSchd  PrincipalSchd  ...  In

In [7]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052 entries, 0 to 1051
Data columns (total 58 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   SchSeq              1052 non-null   int64  
 1   BalanceForPftCal    1052 non-null   int64  
 2   MrgRate             1052 non-null   float64
 3   ActRate             1052 non-null   float64
 4   CalculatedRate      1052 non-null   float64
 5   NoOfDays            1052 non-null   int64  
 6   DayFactor           1052 non-null   float64
 7   ProfitCalc          1052 non-null   int64  
 8   ProfitSchd          1052 non-null   int64  
 9   PrincipalSchd       1052 non-null   int64  
 10  RepayAmount         1052 non-null   int64  
 11  ProfitBalance       1052 non-null   int64  
 12  DisbAmount          1052 non-null   int64  
 13  DownPaymentAmount   1052 non-null   int64  
 14  CpzAmount           1052 non-null   int64  
 15  FeeChargeAmt        1052 non-null   int64  
 16  Refund

In [8]:
result=result.fillna(result.median())

In [9]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052 entries, 0 to 1051
Data columns (total 58 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   SchSeq              1052 non-null   int64  
 1   BalanceForPftCal    1052 non-null   int64  
 2   MrgRate             1052 non-null   float64
 3   ActRate             1052 non-null   float64
 4   CalculatedRate      1052 non-null   float64
 5   NoOfDays            1052 non-null   int64  
 6   DayFactor           1052 non-null   float64
 7   ProfitCalc          1052 non-null   int64  
 8   ProfitSchd          1052 non-null   int64  
 9   PrincipalSchd       1052 non-null   int64  
 10  RepayAmount         1052 non-null   int64  
 11  ProfitBalance       1052 non-null   int64  
 12  DisbAmount          1052 non-null   int64  
 13  DownPaymentAmount   1052 non-null   int64  
 14  CpzAmount           1052 non-null   int64  
 15  FeeChargeAmt        1052 non-null   int64  
 16  Refund

In [10]:

# A succinct way to convert a single column of boolean values to a column of integers 1 or 0:

df["Target"] = df["RepayOnSchDate"].astype(int)

In [5]:
#Percentage of on tyme payment
df["Target"].mean()

0.9876425855513308

In [12]:
result["Target"]=df["Target"]

In [11]:
# Target Variable assumed ==  RepayOnSchDate

In [14]:
result.shape

(1052, 59)

In [32]:
#Feature Selection by mutual information

from sklearn.feature_selection import SelectKBest, mutual_info_classif
selector = SelectKBest(mutual_info_classif, k=10)
Y=df["Target"].tolist()
X=selector.fit_transform(result.iloc[:,0:58], Y)
X.shape


(1052, 10)

In [33]:

# Selected Variable
cols = selector.get_support(indices=True)
selected_columns =result.iloc[:,cols].columns.tolist()
selected_columns

['NoOfDays',
 'DayFactor',
 'ProfitSchd',
 'PrincipalSchd',
 'RepayAmount',
 'ProfitBalance',
 'DisbAmount',
 'CpzAmount',
 'InstNumber',
 'ProfitFraction']

In [44]:
corr_mat = pd.DataFrame(X, columns=selected_columns).corr(method='pearson')
corr_mat

Unnamed: 0,NoOfDays,DayFactor,ProfitSchd,PrincipalSchd,RepayAmount,ProfitBalance,DisbAmount,CpzAmount,InstNumber,ProfitFraction
NoOfDays,1.0,0.946587,0.136199,0.05726,0.17295,-0.035457,-0.624904,-0.030998,0.165873,0.006112
DayFactor,0.946587,1.0,0.14726,0.059596,0.185152,-0.041553,-0.657401,-0.036862,0.172353,-0.001151
ProfitSchd,0.136199,0.14726,1.0,-0.341485,0.662934,-0.212151,-0.090656,-0.211505,-0.500361,-0.047788
PrincipalSchd,0.05726,0.059596,-0.341485,1.0,0.477291,-0.169014,-0.073057,-0.170446,0.731136,0.026784
RepayAmount,0.17295,0.185152,0.662934,0.477291,1.0,-0.33298,-0.142953,-0.333517,0.114583,-0.023344
ProfitBalance,-0.035457,-0.041553,-0.212151,-0.169014,-0.33298,1.0,-0.006694,0.999922,-0.169274,0.000422
DisbAmount,-0.624904,-0.657401,-0.090656,-0.073057,-0.142953,-0.006694,1.0,-0.006673,-0.091524,0.000709
CpzAmount,-0.030998,-0.036862,-0.211505,-0.170446,-0.333517,0.999922,-0.006673,1.0,-0.168618,0.000827
InstNumber,0.165873,0.172353,-0.500361,0.731136,0.114583,-0.169274,-0.091524,-0.168618,1.0,0.008713
ProfitFraction,0.006112,-0.001151,-0.047788,0.026784,-0.023344,0.000422,0.000709,0.000827,0.008713,1.0


In [50]:

# Run this program on your local python
# interpreter, provided you have installed
# the required libraries.
  
# Importing the required packages
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [52]:
 # Separating the target variable
X =pd.DataFrame(X, columns=selected_columns)
Y =df["Target"]
# Splitting the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)
      


In [53]:
# Function to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):
  
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion = "gini",
            random_state = 100,max_depth=3, min_samples_leaf=5)
  
    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini
# Function to perform training with entropy.
def tarin_using_entropy(X_train, X_test, y_train):
  
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
            criterion = "entropy", random_state = 100,
            max_depth = 3, min_samples_leaf = 5)
  
    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy

# Function to perform training with entropy.
def tarin_using_entropy(X_train, X_test, y_train):
  
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
            criterion = "entropy", random_state = 100,
            max_depth = 3, min_samples_leaf = 5)
  
    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy
  
  
# Function to make predictions
def prediction(X_test, clf_object):
  
    # Predicton on test with giniIndex
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred

In [56]:
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
      
    print("Confusion Matrix: ",
        confusion_matrix(y_test, y_pred))
      
    print ("Accuracy : ",
    accuracy_score(y_test,y_pred)*100)
      
    print("Report : ",
    classification_report(y_test, y_pred))
  
# Driver code
def main( X_train, X_test, y_train, y_test):

    clf_gini = train_using_gini(X_train, X_test, y_train)
    clf_entropy = tarin_using_entropy(X_train, X_test, y_train)
      
    # Operational Phase
    print("Results Using Gini Index:")
      
    # Prediction using gini
    y_pred_gini = prediction(X_test, clf_gini)
    cal_accuracy(y_test, y_pred_gini)
      
    print("Results Using Entropy:")
    # Prediction using entropy
    y_pred_entropy = prediction(X_test, clf_entropy)
    cal_accuracy(y_test, y_pred_entropy)
      

In [57]:
main( X_train, X_test, y_train, y_test)

Results Using Gini Index:
Predicted values:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Confusion Matrix:  [[  1   2]
 [  2 311]]
Accuracy :  98.73417721518987
Report :                precision    recall  f1-score   support

           0       0.33      0.33      0.33         3
           1       0.99      0.99      0.99       313

    accuracy                           0.99       316
   macro avg  

In [None]:
#Over sampling approach Required to solve this problem SMOTE, ADASYN are the example

In [None]:
import imblearn

In [62]:
# decision tree  on imbalanced dataset with SMOTE oversampling and random undersampling
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# define dataset
X, Y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
	n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
# define pipeline
model = DecisionTreeClassifier()
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.7)
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, Y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.851


In [None]:
# We can draw AUC ROC Curve