## TPOT tries a pipeline, evaluates its performance, and randomly changes parts of the ## pipeline in search of better performing algorithms. 

### Import essential libraries

In [1]:
import tpot
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
data = pd.read_csv('data_after_mice.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,annual_inc,dti,...,verification_status_Source Verified,verification_status_Verified,loan_status_Current,loan_status_Default,loan_status_Fully Paid,loan_status_In Grace Period,loan_status_Late (16-30 days),loan_status_Late (31-120 days),issue_month,issue_year
0,0,7000.0,7000.0,7000.0,0,6.62,214.93,0,48000.0,16.0,...,1,0,0,0,1,0,0,0,10,2011
1,1,7200.0,7200.0,7200.0,0,12.42,240.59,9,35000.0,6.14,...,0,0,0,0,1,0,0,0,10,2011
2,2,22000.0,22000.0,22000.0,0,14.65,758.88,1,192000.0,1.82,...,0,1,0,0,1,0,0,0,10,2011
3,3,18750.0,18750.0,18750.0,0,7.9,586.7,3,75000.0,13.94,...,1,0,0,0,1,0,0,0,11,2011
4,4,12500.0,12500.0,12500.0,0,7.9,391.13,3,32500.0,24.15,...,0,0,0,0,1,0,0,0,10,2011


### Defining Response and Predictors


The outcome variable is also called the response or dependent variable, and the risk factors and confounders are 
called the predictors, or explanatory or independent variables. 
In regression analysis, the dependent variable is denoted "Y" and the independent variables are denoted by "X".

In [3]:
predictors = ['grade_C','grade_D','grade_E',
        'grade_F','grade_G','total_rec_int',
        'total_pymnt_inv','funded_amnt_inv','sub_grade_B5',
        'sub_grade_C5','sub_grade_C4','sub_grade_C3','sub_grade_B4','sub_grade_D5']

predictors

['grade_C',
 'grade_D',
 'grade_E',
 'grade_F',
 'grade_G',
 'total_rec_int',
 'total_pymnt_inv',
 'funded_amnt_inv',
 'sub_grade_B5',
 'sub_grade_C5',
 'sub_grade_C4',
 'sub_grade_C3',
 'sub_grade_B4',
 'sub_grade_D5']

In [4]:
data.drop(['Unnamed: 0'],axis=1,inplace=True)

In [5]:
response = ['int_rate']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data[predictors], data[response],
                                                    train_size=0.6, test_size=0.4)


X_train = X_train.values
X_test = X_test.values
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

In [7]:
tpot_config = {'sklearn.tree.DecisionTreeClassifier': {},
              'sklearn.neural_network.MLPClassifier': {'hidden_layer_sizes': [50, 25], 
                                                       'activation': ['relu', 'tanh', 'logistic', 'identity'],
                                                      'batch_size': [10, 20, 40, 60, 80, 100], 
                                                       'solver': ['adam', 'sgd', 'lbfgs'], 'max_iter': [10, 50, 100],
                                                      'batch_size': [10, 20, 40, 60, 80, 100]},
              'sklearn.linear_model.LinearRegression': {}}

In [8]:
from ipywidgets import IntProgress

### Instantiating, fitting, & scoring the TPOT classifier is like any other sklearn classifier
### tpot.export:- TPOT writes the optimized pipeline to an external python file

In [9]:
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, config_dict = tpot_config)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_lendingClub_pipeline_lite.py')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=300.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: -1.9517797526445115
Generation 2 - Current best internal CV score: -1.9517797526445115
Generation 3 - Current best internal CV score: -1.9517797526445115
Generation 4 - Current best internal CV score: -1.9517399573143002
Generation 5 - Current best internal CV score: -1.9517399573143002

Best pipeline: LinearRegression(CombineDFs(input_matrix, LinearRegression(CombineDFs(input_matrix, input_matrix))))
-1.8265096431543566


### TPOT writes the optimized pipeline to an external python file specified in the tpot.export() function.
### The code is copied from that file and pasted below.

In [10]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

### Split arrays or matrices into random train and test subsets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data[predictors], data[response],
                                                    train_size=0.6, test_size=0.4)


X_train = X_train.values
X_test = X_test.values
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

### Construct a FeatureUnion from the given transformers.

In [12]:

exported_pipeline = make_pipeline(
    make_union(
        make_union(
            FunctionTransformer(copy),
            FunctionTransformer(copy)
        ),
        StackingEstimator(estimator=LinearRegression())
    ),
    LinearRegression()
)

In [13]:
exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test)

### Mape : The mean absolute percentage error (MAPE) is a statistical measure of how accurate a forecast system is. It measures this accuracy as a percentage

In [14]:
mape = 0

for i,j in zip(y_test, results):
    mape += np.abs((i-j)/i)
    
mape = (mape*100)/y_test.shape[0]

mape

11.72825015878589