# Exercise 1

Lets imagine there is a medical test for a disease that has a 92% chance of detecting a disease if the person has it (True Positive) and a 94% chance of correctly detecting that the disease is absent if the person doesnt have the disease (True Negative).

10% of the general population has the disease.

### What is the probability of testing positive (whether you have the disease or not)?

testing positive = true positive and false positive

P(have disease) = 0.1
P(no disease) = 0.9

P(have disease and positive test) = 0.1 * 0.92 = 0.092   #true positive
P(no disease and positive test) = 0.9 * 0.06 = 0.054     #false positive

P(testing positive) = 0.146


### What is the probability of having the disease given that you got a positive test?

A = test is positive
P(A) = 0.146

B = have the disease 
P(B) = 0.1

P(A|B) = P(B union A) / P(A) = 0.092 / 0.146 = 0.630 (3 decimal places)

# Exercise 2

We are going to use a reduced sample of the [Lending Club dataset](https://www.lendingclub.com/info/download-data.action). The Lending Club is a startup that provides personal loans up to $40,000. 

Every quarter they release all of their data related to how many loan requests they accept and how many they reject.

The target variable is `status`, that indicates whether the loan request was `accepted` or `rejected`.

In [1]:
import pandas as pd

loans = pd.read_csv("data/loans.csv")

target = "status"

loans.head(2)

Unnamed: 0,addr_state,debt_to_income_ratio,emp_length,loan_amnt,status,title,zip_code
0,CA,9.99,8 years,36000.0,accepted,Debt consolidation,916xx
1,TX,1.91%,< 1 year,5000.0,declined,Other,769xx


In [2]:
loans.dtypes

addr_state               object
debt_to_income_ratio     object
emp_length               object
loan_amnt               float64
status                   object
title                    object
zip_code                 object
dtype: object

In [3]:
#Remove off the postal code 
loans.drop("zip_code", axis=1, inplace = True)


In [4]:
loans.head(10)

Unnamed: 0,addr_state,debt_to_income_ratio,emp_length,loan_amnt,status,title
0,CA,9.99,8 years,36000.0,accepted,Debt consolidation
1,TX,1.91%,< 1 year,5000.0,declined,Other
2,OK,41.74%,< 1 year,15000.0,declined,Debt consolidation
3,DE,20.28%,< 1 year,10000.0,declined,Debt consolidation
4,AL,7.52%,< 1 year,5000.0,declined,Other
5,FL,18.46%,< 1 year,1000.0,declined,Credit card refinancing
6,TX,2.12%,< 1 year,1000.0,declined,Other
7,MN,15.53,3 years,35000.0,accepted,Debt consolidation
8,NC,100%,< 1 year,2000.0,declined,Car financing
9,VT,4.22%,< 1 year,4000.0,declined,Other


In [5]:
#Numerical Variables and its Pipeline
#Variables to look after: loan amount and the ratios
#format the ratio and add it back into the dataframe

debt_to_income = []

for ratio in loans.debt_to_income_ratio:
    if type(ratio) == str:
        if ratio[-1] == '%':
            debt_to_income.append(float("{0:.4f}".format(float(ratio[:-1])/100)))
        elif type(ratio) == float:
            debt_to_income.append(ratio)
debt_to_income

#casting the list into a dataframe
loans.debt_to_income_ratio = pd.DataFrame(debt_to_income)

In [6]:
loans.dtypes

addr_state               object
debt_to_income_ratio    float64
emp_length               object
loan_amnt               float64
status                   object
title                    object
dtype: object

In [7]:
#Set the numerical cols for the pipelines to work on
numerical_cols = ['debt_to_income_ratio', 'loan_amnt']

In [8]:
#Look after the nans in the numerical_columns
loans.loan_amnt.isnull().sum() #no nulls here
loans.debt_to_income_ratio.isnull().sum()  #Some nulls here

532

In [9]:
loans.debt_to_income_ratio = loans.debt_to_income_ratio.fillna(loans.debt_to_income_ratio.mean())

In [10]:
#Numerical datatypes' pipelines
from sklearn import preprocessing, feature_extraction

imputer = preprocessing.Imputer(strategy="mean")
scaler = preprocessing.StandardScaler()

In [11]:
from sklearn.pipeline import make_pipeline

In [12]:
numerical_pipeline = make_pipeline(imputer,scaler)

In [13]:
numerical_pipeline

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))])

In [14]:
numerical_pipeline.fit_transform(loans[numerical_cols])

array([[ -6.85298270e-02,   1.43390498e+00],
       [ -4.62941677e-02,  -5.73750097e-01],
       [ -5.82745154e-02,   7.38805735e-02],
       ..., 
       [ -3.47086692e-16,  -8.00420831e-01],
       [ -3.47086692e-16,  -8.32802365e-01],
       [ -3.47086692e-16,  -2.49934762e-01]])

In [15]:
from sklearn.base import BaseEstimator

class ColumnSelector(BaseEstimator):
    def __init__(self, cols=None, drop_axis=False):
        self.cols = cols
        self.drop_axis = drop_axis

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

    def transform(self, X, y=None):
        if hasattr(X, 'loc'):
            #only pandas dataframes have the method loc
            t = X.loc[:, self.cols].values
        else:
            # its a numpy array
            t = X[:, self.cols]

        if t.shape[-1] == 1 and self.drop_axis:
            t = t.reshape(-1)
        if len(t.shape) == 1 and not self.drop_axis:
            t = t[:, np.newaxis]
        return t

    def fit(self, X, y=None):
        return self

In [16]:
numerical_col_selector = ColumnSelector(cols=numerical_cols)

In [17]:
numerical_col_selector.fit_transform(loans)

array([[  1.91000000e-02,   3.60000000e+04],
       [  4.17400000e-01,   5.00000000e+03],
       [  2.02800000e-01,   1.50000000e+04],
       ..., 
       [  1.24665209e+00,   1.50000000e+03],
       [  1.24665209e+00,   1.00000000e+03],
       [  1.24665209e+00,   1.00000000e+04]])

Categorical Pipeline

In [18]:
categors_col = ["addr_state", "title"]

In [19]:
loans[categors_col].head(3)

Unnamed: 0,addr_state,title
0,CA,Debt consolidation
1,TX,Other
2,OK,Debt consolidation


In [20]:
from category_encoders import OneHotEncoder

In [21]:
categors_pipeline = make_pipeline(
     ColumnSelector(cols=categors_col),
     OneHotEncoder()
)

categors_pipeline.fit_transform(loans)[:5]

Unnamed: 0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,0_10,...,1_5,1_6,1_7,1_8,1_9,1_10,1_11,1_12,1_13,1_-1
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
categors_col_selector = ColumnSelector(cols=categors_col)

In [23]:
categors_col_selector.fit_transform(loans)

array([['CA', 'Debt consolidation'],
       ['TX', 'Other'],
       ['OK', 'Debt consolidation'],
       ..., 
       ['WI', 'Debt consolidation'],
       ['CO', 'Debt consolidation'],
       ['NC', 'Debt consolidation']], dtype=object)

Ordinal Pipeline 

In [24]:
#Variables here that are ordinal
ordinal_var = ["emp_length"]

In [25]:
#Look after the nulls here for the emp_length --> as majority of the clients' duration is below 1 year
loans.emp_length = loans.emp_length.fillna("< 1 year")

In [26]:
#find out the sayings available in the emp_length
loans.emp_length.value_counts()

< 1 year     9259
10+ years     234
1 year        127
2 years        80
3 years        74
5 years        50
4 years        47
6 years        42
7 years        38
8 years        26
9 years        23
Name: emp_length, dtype: int64

In [27]:
from category_encoders import OrdinalEncoder

# ColumnSelector's output is an array, so we use the column 0 for ordinal encoder
ordinal_encoder = OrdinalEncoder(mapping=[
    {"col": 0,
      "mapping": {
        ("< 1 year", 0),
        ("1 year", 1),
        ("2 years",2),
        ("3 years", 3),
        ("4 years", 4),
        ("5 years", 5),
        ("6 years", 6),
        ("7 years", 7),
        ("8 years", 8),
        ("9 years", 9),
        ("10+ years", 10),
      } 
     }
])
ordinal_pipeline = make_pipeline(
    ColumnSelector(cols=ordinal_var),
    ordinal_encoder
)

ordinal_pipeline.fit_transform(loans)[:4]

Unnamed: 0,0
0,8
1,0
2,0
3,0


In [34]:
ordinal_col_selector.fit_transform(loans)

array([['8 years'],
       ['< 1 year'],
       ['< 1 year'],
       ..., 
       ['< 1 year'],
       ['< 1 year'],
       ['< 1 year']], dtype=object)

Pipeline Union Portion here

In [35]:
from sklearn.pipeline import make_union

In [36]:
processing_pipeline = make_union(
    numerical_pipeline,
    categors_pipeline,
    ordinal_pipeline
)

In [37]:
processing_pipeline

FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('pipeline-2', Pipeline(memory=None,
     steps=[('columnsele...', 9), ('7 years', 7), ('10+ years', 10), ('5 years', 5)}}],
        return_df=True, verbose=0))]))],
       transformer_weights=None)

In [38]:
#Not working, ask Sean
processing_pipeline.fit_transform(loans)

ValueError: could not convert string to float: 'Debt consolidation'

In [None]:
#Estimators on the target variable
from sklearn.linear_model import LinearRegression

estimator = LinearRegression()
estimator_pipeline = make_pipeline(
    processing_pipeline,
    estimator
)

In [None]:
estimator_pipeline.fit(laons, loans[target])

In [None]:
estimator_pipeline.predict(loans)[:5]

### Create a predictive pipeline and perform hyperparameter optimization, using the r2 scoring. Which numerical transformer works best? which estimator works best, a KNN Classifier, or a DecisionTree Classifier?