Write a class or function to train, test, and store results for

*Decision Tree
*Random Forest
*Support Vector Machine
*TensorFlow Network

It needs to:

*accept four inputs X_train, y_train, X_test, and y_test,
*train the model on X_train and y_train,
*score the results using X_test and y_test
*Print out the confusion matrix for each of the models, and
*Return the results of the scoring as Python dictionary.

In [1]:
import pandas as pd
import numpy as np

In [2]:
AcquisitionColumnNames = (
    "LOAN_ID", "ORIG_CHN", "Seller.Name", 
    "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE",
    "FRST_DTE", "OLTV", "OCLTV", "NUM_BO", 
    "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", 
    "PROP_TYP", "NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", 
    "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", 
    "RELOCATION_FLG"
)

PerformanceColumnNames = (
    "LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", 
    "LAST_RT", "LAST_UPB", "Loan.Age", "Months.To.Legal.Mat", 
    "Adj.Month.To.Mat", "Maturity.Date", "MSA", 
    "Delq.Status", "MOD_FLAG", "Zero.Bal.Code", 
    "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", 
    "FCC_COST", "PP_COST", "AR_COST", "IE_COST", 
    "TAX_COST", "NS_PROCS","CE_PROCS", "RMW_PROCS", 
    "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", 
    "REPCH_FLAG", "PRIN_FORG_UPB_OTH", "TRANSFER_FLG"
)

In [3]:
acquisition_data_path = "C:/Users/krish/Desktop/Preety/Acquisition_2010Q1.txt"
performance_data_path = "C:/Users/krish/Desktop/Preety/Performance_2010Q1.txt"
acquisition_df = pd.read_csv(
    acquisition_data_path,
    names=AcquisitionColumnNames,
    header=None,
    sep="|"
)
performance_df = pd.read_csv(
    performance_data_path,
    names=PerformanceColumnNames,
    header=None,
    sep="|",
#     nrows = 10000
)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
DS = set(performance_df['Delq.Status'])
print(DS)
mapper = {}
for ds in DS:
    try:
        mapper[ds] = int(ds)
    except:
        mapper[ds] = -1

performance_df['Delq.Status'] = performance_df['Delq.Status'].map(mapper)


{'8', nan, '47', '46', '64', '49', '55', '40', '17', '54', '16', '35', '36', '37', '28', '60', '29', '13', '39', 'X', '63', '66', '6', '11', '20', '42', '51', '65', '4', '23', '44', '52', '57', '2', '10', '18', '22', '25', '15', '24', '38', '48', '56', '31', '7', '19', '14', '43', '1', '3', '34', '53', '5', '32', '62', '30', '12', '50', '9', '33', '26', '61', '0', '59', '58', '27', '21', '45', '41'}


In [7]:
V, C = np.unique(performance_df['Delq.Status'], return_counts=True)
print(V)

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66]


In [8]:
loans = performance_df.groupby("LOAN_ID", sort=True)['Delq.Status'].max()

ID_To_Delinq = {}

for row in loans.iteritems():
    loan_id, delinq = row
    ID_To_Delinq[loan_id] = delinq

In [9]:
def mapper(row):
    return ID_To_Delinq.get(row["LOAN_ID"], -1)

acquisition_df['MAX_DELINQ'] = acquisition_df.apply(mapper, axis=1)

In [10]:
V, C = np.unique(acquisition_df['MAX_DELINQ'], return_counts=True)

In [13]:
FCC_DTE = performance_df['FCC_DTE'].notna()

forclosed = performance_df[FCC_DTE]

FORECLOSURES = {}

for row in forclosed.iterrows():
    row = row[1]
    FORECLOSURES[row['LOAN_ID']] = row['FCC_DTE']

FORCLOSED = set(forclosed['LOAN_ID'])

def mapper(row):
    # return FORECLOSURES.get(row['LOAN_ID'], "NO_FCC")
    return int(row['LOAN_ID'] in FORCLOSED)

acquisition_df['FCC'] = acquisition_df.apply(mapper, axis=1)

In [14]:
print(set(acquisition_df['FCC']))

{0, 1}


In [15]:
V, C = np.unique(
    performance_df['Monthly.Rpt.Prd'], 
    return_counts=True
)

In [16]:
actual_date = performance_df['Monthly.Rpt.Prd'] == "01/01/2015"
next_date   = performance_df['Monthly.Rpt.Prd'] == "01/01/2016"

date_df = performance_df[actual_date]
next_df = performance_df[next_date]

Delinquency = {}
Next_Delinquency = {}

for row in date_df.iterrows():
    row = row[1]
    Delinquency[row['LOAN_ID']] = ID_To_Delinq.get(row["LOAN_ID"], -1)
    
for row in next_df.iterrows():
    row = row[1]
    Next_Delinquency[row['LOAN_ID']] = ID_To_Delinq.get(row['LOAN_ID'], -1)

In [17]:
def mapper(row):
    return Delinquency.get(row["LOAN_ID"], -1)

def next_mapper(row):
    return Next_Delinquency.get(row['LOAN_ID'], -1)

acquisition_df['DELINQ_DATE'] = acquisition_df.apply(mapper, axis=1)

acquisition_df['DELINQ_NEXT'] = acquisition_df.apply(next_mapper, axis=1 )

In [18]:
delinq = acquisition_df['DELINQ_DATE'] > 0
delinq_df = acquisition_df[delinq]

print({len(delinq_df.index)})

def check_date_range(row):
    return row['DELINQ_NEXT'] >= row['DELINQ_DATE']

delinq_df['DELINQ_DELTA'] = delinq_df.apply(
    check_date_range,
    axis=1
)

{1148}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [19]:
V, C = np.unique(delinq_df['DELINQ_DELTA'], return_counts=True)

for v, c in zip(V, C):
    print(v, " | ", c)

False  |  134
True  |  1014


In [20]:
df = delinq_df

DEL_NOTNAN = df["DELINQ_DELTA"].notna()
df = df[DEL_NOTNAN]
OLTV = df['OLTV'].notna()
df = df[OLTV]
CS = df['CSCORE_B'].notna()
df = df[CS]
DTI = df['DTI'].notna()
df = df[DTI]

credit_score  = np.array(df['CSCORE_B'])
credit_score /= np.max(credit_score)

loan_to_value = np.array(df['OLTV'])

loan_to_value = loan_to_value/np.max(loan_to_value)

debt_to_income= np.array(df['DTI'])
debt_to_income /= np.max(debt_to_income)

delinq_value  = np.array(df['DELINQ_DATE'])

max_delinq    = np.array(df['DELINQ_DELTA'])

foreclosed    = np.array(df['FCC'])
X = np.array(
    [
        credit_score, 
        loan_to_value, 
        debt_to_income, 
        delinq_value
    ]
).transpose()

y = np.array([foreclosed]).transpose()
print(X.shape)
print(y.shape)

Total = np.hstack([X, y])
print(Total.shape)
np.random.shuffle(Total)

X = Total[:, :4]
y = Total[:, 4:]

print(X.shape)
print(y.shape)

prop = 0.9
train_num = int(prop * len(Total))
print("Train Number:", train_num)

X_train, X_test = X[:train_num], X[train_num:]
y_train, y_test = y[:train_num], y[train_num:]

print("X_Train:", X_train.shape)
print("X_Test:", {X_test.shape})
print("=="*10)
print("y_Train:", {y_train.shape})
print("y_Test:",  {y_test.shape})

V, C = np.unique(y, return_counts=True)
class_weight = {}
for v, c in zip(V, C):
    prop = c / len(y)
    class_weight[v] = 1 - prop
    print(v, " | ", c)

class_names = np.unique(y)

print(class_names)

(1132, 4)
(1132, 1)
(1132, 5)
(1132, 4)
(1132, 1)
Train Number: 1018
X_Train: (1018, 4)
X_Test: {(114, 4)}
y_Train: {(1018, 1)}
y_Test: {(114, 1)}
0.0  |  1085
1.0  |  47
[0. 1.]


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [35]:
class MLtest:
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        
    def dzt(self, class_weight=class_weight):
        dtc = tree.DecisionTreeClassifier(class_weight = class_weight)
        self.clf = dtc
        return self.clf
    
    def rf(self, n_estimators=100, max_depth=5):
        clf = RandomForestClassifier(n_estimators = n_estimators, max_depth = max_depth)
        self.clf = clf
        return self.clf
    
    def svm(self):
        clf = SVC(gamma='auto')
        self.clf = clf
        return self.clf
    
    def mlp(self, solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 10)):
        clf = MLPClassifier(solver = solver, alpha = alpha, hidden_layer_sizes = hidden_layer_sizes)
        self.clf = clf
        return self.clf
    
    def train(self):
        self.clf.fit(self.X_train, self.y_train)
        return self.clf
    
    def test(self):
        y_pred = self.clf.predict(self.X_test)
        cm = confusion_matrix(self.y_test, y_pred)
        print ('confusion matrix: \n', cm)
        return {'test acc' : clf.score(self.X_test, y_test)}

In [36]:
Testclass = MLtest(X_train, y_train, X_test, y_test)
Testclass.dzt(class_weight=class_weight)
Testclass.train()
Testclass.test()

NameError: name 'tree' is not defined

In [29]:
Testclass.rf(n_estimators=100, max_depth=5)
Testclass.train()
Testclass.test()



NameError: name 'confusion_matrix' is not defined

In [30]:
Testclass.svm()
Testclass.train()
Testclass.test()

  y = column_or_1d(y, warn=True)


NameError: name 'confusion_matrix' is not defined

In [31]:
Testclass.mlp(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 10))
Testclass.train()
Testclass.test()

  y = column_or_1d(y, warn=True)


NameError: name 'confusion_matrix' is not defined