In [1]:
import numpy as np
import pandas as pd

import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split


def reduce_mem_usage(df):
    
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
 
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [6]:
data = reduce_mem_usage(pd.read_csv('/final2_prosper_dataset.csv'))
data['average_income'] = (data['MinIncome']+ data['MaxIncome'])/2
data.drop('Unnamed: 0',axis=1,inplace=True)
data.drop(["ListingCreationDate","FirstRecordedCreditLine","IncomeRange","LoanOriginationDate","DateCreditPulled"],axis=1,inplace=True)
risk = ['ProsperRating (numeric)','ProsperRating (Alpha)','ProsperScore','high_risk']
CreditScore = ['CreditScoreRangeLower','CreditScoreRangeUpper']
data['ProsperRating (Alpha)'].value_counts(normalize=True)

Memory usage of dataframe is 81.09 MB --> 33.53 MB (Decreased by 58.6%)


C     0.417278
B     0.135921
A     0.127271
D     0.125325
E     0.085932
HR    0.061177
AA    0.047096
Name: ProsperRating (Alpha), dtype: float64

In [7]:
df = data.copy()
df.drop(CreditScore,axis=1,inplace=True)
df = df.drop(risk,axis=1).drop('LoanKey',axis=1)
df = df.drop(['MinIncome','MaxIncome'],axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113066 entries, 0 to 113065
Data columns (total 80 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   CreditGrade                         113066 non-null  object 
 1   Term                                113066 non-null  int8   
 2   BorrowerAPR                         113066 non-null  float32
 3   BorrowerRate                        113066 non-null  float32
 4   LenderYield                         113066 non-null  float32
 5   EstimatedEffectiveYield             113066 non-null  float32
 6   EstimatedLoss                       113066 non-null  float32
 7   EstimatedReturn                     113066 non-null  float32
 8   ListingCategory (numeric)           113066 non-null  int8   
 9   BorrowerState                       113066 non-null  object 
 10  Occupation                          113066 non-null  object 
 11  EmploymentStatus          

In [8]:
X = pd.get_dummies(df,columns=[col for col in df.columns if 'O' == df[col].dtype])
X.head()

Unnamed: 0,Term,BorrowerAPR,BorrowerRate,LenderYield,EstimatedEffectiveYield,EstimatedLoss,EstimatedReturn,ListingCategory (numeric),EmploymentStatusDuration,IsBorrowerHomeowner,...,Occupation_Truck Driver,Occupation_Waiter/Waitress,EmploymentStatus_Employed,EmploymentStatus_Full-time,EmploymentStatus_Not available,EmploymentStatus_Not employed,EmploymentStatus_Other,EmploymentStatus_Part-time,EmploymentStatus_Retired,EmploymentStatus_Self-employed
0,36,0.16516,0.158,0.138,0.16162,0.0724,0.09211,0,2.0,1,...,0,0,0,0,0,0,0,0,0,1
1,36,0.12016,0.092,0.082,0.0796,0.0249,0.0547,2,44.0,0,...,0,0,1,0,0,0,0,0,0,0
2,36,0.28269,0.275,0.24,0.16162,0.0724,0.09211,0,67.0,0,...,0,0,0,0,1,0,0,0,0,0
3,36,0.12528,0.0974,0.0874,0.0849,0.0249,0.06,16,113.0,1,...,0,0,1,0,0,0,0,0,0,0
4,36,0.24614,0.2085,0.1985,0.18316,0.0925,0.09066,2,44.0,1,...,0,0,1,0,0,0,0,0,0,0


In [9]:
data.loc[data["ProsperScore"]==11,"ProsperScore"]=10
Data = data.copy()
# Transform into a categorical variable
Data["ProsperScore"] = pd.Categorical(Data["ProsperScore"])

# Assign a number to each category (label encoding)
Data["ProsperScore"] = Data["ProsperScore"].cat.codes 

y = Data.drop(["ProsperScore"], axis=1)
# Use to_categorical on your labels
y = to_categorical(Data["ProsperScore"])

# Now print the one-hot encoded labels
print('One-hot encoded competitors: \n',y)
print(y.shape)
print(X.shape)

One-hot encoded competitors: 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(113066, 10)
(113066, 211)


In [10]:
predictors, test_X, target, test_y = train_test_split(X, y, train_size = 0.75, random_state = 1111)
print(predictors.shape)
print(test_X.shape)
print(target.shape)
print(test_y.shape)

(84799, 211)
(28267, 211)
(84799, 10)
(28267, 10)


In [11]:
# Instantiate a sequential model
model2 = Sequential()

# Add dense layers
model2.add(Dense(256, input_shape=(211,), activation='relu'))
model2.add(Dense(512, activation='relu'))
model2.add(Dense(256, activation='relu'))
model2.add(Dense(128, activation='relu'))
model2.add(Dense(64, activation='relu'))

# Add a dense layer with as many neurons as prosper rating
model2.add(Dense(10, activation='softmax'))

# Compile your model using categorical_crossentropy loss
model2.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])


In [12]:
# Fit your model to the training data for 200 epochs
model2.fit(predictors, target, validation_split=0.25, epochs=100)

# Evaluate your model accuracy on the test data
accuracy = model2.evaluate(test_X, test_y)[1]

# Print accuracy
print('Accuracy:', accuracy)

# Predict on coords_small_test
preds = model2.predict(test_X)

# Print preds vs true values
print("{:45} | {}".format('Raw Model Predictions','True labels'))
for i,pred in enumerate(preds):
  print("{} | {}".format(pred,test_y[i]))

# Extract the position of highest probability from each pred vector
preds_chosen = [np.argmax(pred) for pred in preds]

# Print preds vs true values
print("{:10} | {}".format('Rounded Model Predictions','True labels'))
for i,pred in enumerate(preds_chosen):
  print("{:25} | {}".format(pred,test_y[i]))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                        3 | [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
                        1 | [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
                        2 | [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
                        3 | [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
                        5 | [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
                        4 | [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
                        6 | [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
                        7 | [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
                        9 | [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
                        6 | [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
                        5 | [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
                        0 | [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
                        1 | [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
                        7 | [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
                        6 | [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
                        7 | [0. 0. 

In [14]:
accuracy

0.5290975570678711

In [17]:
import pickle
with open("second_model2.pichle","wb") as file:
    pickle.dump(model2,file)

INFO:tensorflow:Assets written to: ram://34c18d8e-6f69-4c68-a14a-d47bfa4c6d18/assets


In [1]:
list(range(3,100))

[3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99]