#  IPE PYTHON II_Paula Escusol Entío


In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.feature_selection import RFE
from sklearn.impute import IterativeImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, \
    matthews_corrcoef, precision_score, confusion_matrix, make_scorer

In [2]:
conda install -c conda-forge missingno

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [3]:
import missingno as msno

In [None]:
#Inserting and readingg the data (excel file)
df = pd.read_excel("AUTO_LOANS_DATA.xlsx")
df

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
df.head()

## 0. Basic steps

### 1. Change the name of the columns to lower case

In [None]:
print(list(df.columns))

In [None]:
#We want to transform the name of all the columns to lower case
df.rename(columns=lambda x: x.lower(), inplace=True)

In [None]:
print(list(df.columns))

## 1. Missing not at random analysis -  Analyzing missing values. 

In [None]:
#Visualize the distribution of the missing values
msno.matrix(df)


In [None]:
#We use the heatmap to identify if there is a relationship in the presence of null values between the columns:
    #Values close to positive 1 indicate that the presence of null values in one column is correlated with the presence of null values in another column.
    #Values close to negative 1 indicate that the presence of null values in one column is anti-correlated with the presence of null values in another column.
    #Values close to 0, indicate there is little to no relationship between the presence of null values in one column compared to another.
msno.heatmap(df)


In [None]:
msno.dendrogram(df)
#In the dendrogram plot below, we can see we have three distinct groups:
    #The first is on the right side (profession, sex, and birth_date) which all have a high degree of null values;
    
    #The second is on the left, with the remainder of the columns which are more complete (customer_open_date,
    #bucket, outstanding, original_booked_amoount...) are all grouped together at zero indicating that they
    #are complete.
    
    #The variable car_type also has a high degree of high values. However, as we have seen in the heatmap above, 
    #the presence of null values in the car_type variable is anti-correlated with the presence of null values in 
    #any other column. 
    

## 2. Data Preparation

### 1. Removing duplicate records

In [None]:
#Let’s explore if there are any duplicate values in the dataset:
df['customer_id'].value_counts()

In [None]:
# We can see that some of the records (customers) are recorded multiple times. In case we decided to remove the
# duplicate records, our approach would be to keep only the last record for each customer, as follows:
    #df.sort_values(by=['loan_open_date'])
    #df.drop_duplicates('customer_id', keep = 'last', inplace = True)
    
#However, to be able to analyze the accuracy of our model when splitting the dataset (refer to section "Risk Based
#Segmentation by variable" of the document) we have decided not to remove the duplicate records.

### REMOVE - 2. Creating a new column "user type" to identify the type of user: Corporate (1) vs. Individuals (0)

In [None]:
#Duplicating the column "program_name" to a new column "user_type" to later identify the type of the user: corporates or individuals
df['user_type'] = df.apply(lambda row: row.program_name, axis = 1)
df

In [None]:
df["user_type"].replace({"Auto Loans Corporate Guarantee": "1", 
                         "Auto Loans 50% Down Payment - Employed": "0",
                        "Pick Up and Small Trucks": "0",
                        "Auto Loans 40% Down Payment - Employed": "0",
                        "Auto Loans 30% Down Payment - Self Employed": "0",
                        "Auto Loans 40% Down Payment - Self Employed": "0",
                        "Auto Loans 20% Down Payment - Employed": "0",
                        "Auto Loans 30% Down Payment - Employed": "0",
                        "Auto Loans 50% Down Payment - Self Employed": "0",
                        "Auto Loans Special Deals": "0",
                        "Auto Loans Payroll Clients": "0",
                        "Auto Loans Secured against CD": "0",
                        "Auto Loans Doctors - Employed": "0",
                        "Auto Loans Fully Secured": "0",
                        "Auto Loans Doctors - Self Employed": "0",
                        "Auto Loans 20% Down Payment - Self Employed": "0",
                        "Auto Loans 50% Down Payment No Car Prohibition - Self Employed": "0",
                        "Auto Loans Run Off - Self Employed":"0",
                        "Auto Loans Run Off - Employed": "0",
                        "Auto Loans 50% Down Payment No Car Prohibition - Employed": "0",
                        "Auto Loans 50% Down Payment Used Cars - Self Employed": "0",
                        "Auto Loans 50% Down Payment Used Cars - Employed": "0",
                        "Auto Loans 30% Down Payment Used Cars - Self Employed": "0",
                        "Auto Loans 30% Down Payment Used Cars - Employed": "0"}, inplace=True)

df

In [None]:
#number of individuals (0) vs. corporates (1)
df.user_type.value_counts()

In [None]:
# here we should 

### 3. Transforming the variable "program_name" into a categorical variable. 

In [None]:
#The function below transforms the variable "program_name" into a categorical variable.

df['program_name'] = df.program_name.astype("category").cat.codes
df.program_name, df.dtypes

In [None]:
df["program_name"] = df["program_name"].astype("category")
df.program_name, df.dtypes

In [None]:
feat_type = ['Categorical' if x.name == 'category' else 'Numerical' for x in df.dtypes]
feat_type

df

### 4. Transforming the variable "birth_date" into age

In [None]:
df['age'] = pd.to_datetime(df['birth_date'], format='%Y-%m-%d')
df['age'] = ((df.reporting_date - df.birth_date)/np.timedelta64(1, 'Y'))

df.round({"age":0})

df.drop('birth_date', inplace=True, axis=1)
df

### 5. Removing Nans

#### a) Selecting only the numerics to replace NaNs or blanks with 0 

In [None]:
#Check the amount of missing values that we have in each variable
df.isnull().sum()

In [None]:
list_numeric_columns = ['age']
for item in list_numeric_columns:
    df[item] = df[item].fillna(0).replace('NaN',0).replace('',0)

#### b) Selecting non-numerics to replace NaNs with blanks

In [None]:
# Within the non-numeric variables with missing values, we can distinguish two groups: 
    #1 - Non-numerics which have missing values related to the type of user (MNAR): Corporate users do not contain information
    #regarding the sex or profession.
    
    #2 - Non-numerics with missing at random data (MAR)- concretelly, the variable CAR_TYPE.
    
#We will deal with these situations differently: 

#1 - MNAR for Corporate users: 
       
list_non_numeric_columns = ['profession', 'sex']
for item in list_non_numeric_columns:
    df[item] = df[item].fillna('').replace('NaN','');
    
#2 - MAR for CAR_TYPE:
    #Once the variable CAR_TYPE is converted to a categorical variable, we will replace the missing values with 
    #the most common value of the variable (the mode). Refer to point "12 - Transforming the variable "car_type" 
    #to categorical" to see this point.
    

### 6. Transforming the variable "loan_open_date" into months that the row has been opened at the reporting date

In [None]:
# This function converts a given date (loan_open_date) to the number of months that the contract has been opened 
# at each reporting date.
                                   
df['months_loan_opened'] = ((df.reporting_date - df.loan_open_date)/np.timedelta64(1, 'M'))
df.drop('loan_open_date', inplace=True, axis=1)

df

### 7. Transforming the variable "expected_close_date" into months.

In [None]:
# This function converts the "expected_close_date" variable into the number of months that the contract will still be opened.

df['months_to_close_loan'] = ((df.expected_close_date - df.reporting_date)/np.timedelta64(1, 'M'))
df.drop('expected_close_date', inplace=True, axis=1)

df

### 8. Transforming the variable "customer_open_date" into months.

In [None]:
# This function converts the "customer_open_date" variable into the number of months that the customer has had a 
# contract has been opened at each reporting date 
                                   
df['months_client_opened'] = ((df.reporting_date - df.customer_open_date)/np.timedelta64(1, 'M'))
#df['months_client_opened'] = df['customer_open_date'].astype(int)
df.drop('customer_open_date', inplace=True, axis=1)

df

### 9. Transforming the variable "sex" into dummy variables

In [None]:
#Transform the scales of the variable "sex" to 0 and 1, respectively.

# Get one hot encoding of column sex
one_hot = pd.get_dummies(df['sex'])
# Drop column sex as it is now encoded
df = df.drop('sex',axis = 1)
# Join the encoded df
df = df.join(one_hot)
df 

### 10. Transforming the target variable "bucket" into dummy variables

In [None]:
# The variable "bucket" indicates the number of unpaid installements at each reporting date. We have transformed
# this variable to dummy variables to indicate whether the user has had unpaid installments (1) or not(0).
bucket_replace = {2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1}
df = df.replace({'bucket': bucket_replace})


In [None]:
df['bucket'].unique()

### 11. Merging the variable "profession" and tranforming it into a categorical variable 

In [None]:
#Let’s explore in how many different professions we have in the dataset:

df['profession'].value_counts()

In [None]:
#First, lets transform all the scales of the "profession" variable to lower case
df["profession"] = df["profession"].str.lower()

In [None]:
#Now, since we have a lot of unneccesary categories, we will group all the scales into new categories and change 
#the type of the variable to categorical.
#The new categories that we will create are based on the existing categories, and will be: 
    #Employee (includes, among others, religious persons, military officers, politicians and athletes)
    #Company or shop owner
    #Homemaker (before: "housewife")
    #Retired
    #Unemployed
    #Landlord
    #Student
    
profession_replace = {"employee": "Employee", 
                          "company owner": "Company or shop owner", 
                         "manager": "Employee",
                        "shop owner": "Company or shop owner",
                       "instructor / teacher": "Employee",
                          "housewife": "Homemaker",
                          "accountant - employee": "Employee",
                          "engineer": "Employee",
                          "doctor": "Employee",
                          "retired": "Retired",
                          "unemployed": "Unemployed",
                          "contractor": "Employee",
                          "pharmacist": "Employee",
                          "nurse": "Employee",
                          "technician": "Employee",
                          "secretary": "Employee",
                          "business man / trader": "Employee",
                          "professors": "Employee",
                          "landlord": "Landlord",
                          "banker": "Employee",
                          "driver": "Employee",
                          "journalist": "Employee",
                          "chemist": "Employee",
                          "tour leader": "Employee",
                          "jeweller": "Employee",
                          "religion person": "Employee",
                          "artist": "Employee",
                          "broadcast / media": "Employee",
                          "athletes": "Employee",
                          "lawyer ? self employer": "Employee",
                          "craftsman": "Employee",
                          "car / boat agency / deale": "Employee",
                          "student": "Student",
                          "consultant": "Employee",
                          "hostess": "Employee",
                          "diver": "Employee",
                          "lawyer ? employee": "Employee",
                          "police officer": "Employee",
                          "bazaar shop owner": "Company or shop owner",
                          "pilot": "Employee",
                          "cae - current staff": "Employee",
                          "military officer": "Employee",
                          "cae ibs staff": "Employee",
                          "real estate broker / agen": "Employee",
                          "agrarian": "Employee",
                          "economist": "Employee",
                          "press": "Employee",
                          "cae ex-staff less than 10": "Employee",
                          "publisher": "Employee",
                          "politician": "Employee",
                          "hotel manager": "Employee"}

df = df.replace({'profession': profession_replace})

df['profession'] = df['profession'].astype(str)
df['profession'].value_counts()  

In [None]:
## If instead of transforming the variable program_name, we want to create a new variable, we should do: 
#df['program_name_category'] = df.program_name.astype("category").cat.codes

df['profession'] = df.program_name.astype("category").cat.codes

In [None]:
df

In [None]:
df["profession"] = df["profession"].astype("category")
# https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html
df.profession

In [None]:
feat_type = ['Categorical' if x.name == 'category' else 'Numerical' for x in df.dtypes]
feat_type
df

### 12. Transforming the variable "car_type" into categorical variables

In [None]:
#As we did before, let’s explore in how many different car types we have in the dataset:
df['car_type'].unique()

In [None]:
#We shall also transform all the scales to lower case
df["car_type"] = df["car_type"].str.lower()

In [None]:
df['car_type'].value_counts()

In [None]:
#Now, we want to group all the scales of the variable "car_type" into new categories, and change the type of the
#column to a categorical variable.
#The new categories that have been created are based on the continent of origin of the cars, and will be: 
    #Asian cars;
    #American cars;
    #European cars;
    #Others

car_type_replace = {'kia': "asian cars",
                    'carry': "others",
                    'chevrolet': "american cars",
                    'mitsubishi': "asian cars",
                    'seat': "european cars",
                    'skoda': "european cars",
                    'renault': "european cars",
                    'mercedes': "european cars",
                    'jack': "asian cars",
                    'byd': "asian cars",
                    'gely': "asian cars",
                    'hyundai': "asian cars",
                    'nissan': "asian cars",
                    'changan': "asian cars",
                    'gelory': "asian cars",
                    'suzuki': "asian cars",
                    'bmw': "european cars",
                    'daihatsu': "asian cars",
                    'ssang yong': "asian cars",
                    'baic': "asian cars",
                    'toyota': "asian cars",
                    'lada': "others",
                    'mazda': "asian cars",
                    'brilliance': "asian cars",
                    'kenbo': "asian cars",
                    'speranza': "others",
                    'saipa': "others",
                    'opel': "european cars",
                    'peugeot': "european cars",
                    'chana': "asian cars",
                    'citroen': "european cars",
                    'isuzu': "asian cars",
                    'proton': "asian cars",
                    'honda': "asian cars",
                    'volkswagen': "european cars",
                    'chery': "others",
                    'fiat': "european cars",
                    'subaru': "asian cars",
                    'jeep': "american cars",
                    'volvo': "european cars",
                    'mini': "european cars",
                    'ford': "american cars",
                    'great wall': "asian cars",
                    'mg': "european cars",
                    'mable': "asian cars",
                    'haima': "asian cars",
                    'changy': "asian cars",
                    'audi': "european cars",
                    'mahindra': "others",
                    'livan': "asian cars",
                    'florida': "others",
                    'dodge': "american cars",
                    'zemex': "asian cars",
                    'dfsk': "asian cars",
                    'saweast': "asian cars",
                    'zoty': "asian cars",
                    'faw': "asian cars",
                    'hawtai': "asian cars",
                    'jaguar': "european cars",
                    'victory': "american cars"}

df = df.replace({'car_type': car_type_replace})

#df['car_type'] = df['car_type'].astype(str)
df['car_type'].value_counts()  

In [None]:
df['car_type'] = df.program_name.astype("category").cat.codes

In [None]:
df["car_type"] = df["car_type"].astype("category")
# https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html
df.car_type, df.dtypes

In [None]:
feat_type = ['Categorical' if x.name == 'category' else 'Numerical' for x in df.dtypes]
feat_type
# Then use feat_type in your class: cls.fit(X_train, y_train, X_test, y_test, feat_type=feat_type)

df

In [None]:
#As we have previously analyzed, the missing values for the variable "car_type" are random and are not related 
#with the user_type. Therefore, wow that we have converted the variable "car_type" into a categorical variable, 
#we can inpute the mode to complete the missing values: 

df['car_type'].fillna(df['car_type'].mode()[0], inplace = True)

### 13. Dropping the variables "account_number", "customer_id" and "reporting_date"

In [None]:
#Since the aim of the project is to find which variables are good for segmentation, variables that only provide
#identification of the client or account (account_number and customer_id) do not provide relevant information and
#therefore, we have decided to remove them from the dataset. 

#Moreover, the variable "reporting_date" has also been removed from the dataset as it does not provide usefull
#information about any specific segment of the population. 

del df['account_number']
del df['reporting_date']
del df['customer_id']

### 14. Creating buckets for the variable "outstanding"

In [None]:
#The variable "outstanding" is a continue variable which probably holds as many unique values as records we have
#in our dataset. In order to be able to calculate the Information Gain of this variable towards the target 
#variable (bucket), we shall reduce the number of possible values of the "outstanding" variable. We will do so
#using qcut function to define the number of quantiles and divide up the data. 

pd.qcut(df['outstanding'], q=50)

In [None]:
#Then, we will store the bin results back in the original dataframe 

df['outstanding_quantile'] = pd.qcut(df['outstanding'], q=50, precision=0)
df

In [None]:
#To be able to use this information for further analysis, we will transform the variable "outstanding_cuantile"
#into a categorical variable.

df['outstanding_quantile'] = df.program_name.astype("category").cat.codes
df["outstanding_quantile"] = df["outstanding_quantile"].astype("category")
df.program_name

In [None]:
#Lastly, we can remove the original variable "outstanding" from our dataset since we won't be using it.

del df['outstanding']

## 2. Risk based segmentation: Splitting the data based on the user_type

### 1. Business segmentation

In [None]:
#By analyzing the data, we have seen that there are some variables missing not at random for certain users.
#Precisely, we have seen that the dataset contains two types of users: individuals and corporates. We can easily
#identify Corporates since these records have common missing values related to demographic information: 
    #SEX 
    #BIRTH_DATE (now transformed to AGE)
    #PROFESSION
#Therefore, we conclude that the first criteria for segmentation (business segmentation or risk segmentation) 
#shall be based on the user_type: individuals vs. corporates.

dfIndividuals, dfCorporates = [x for _, x in df.groupby(df['user_type'] == '1')]

In [None]:
dfIndividuals

In [None]:
dfCorporates

# Computing information gain for each variable

In [None]:
from sklearn.model_selection import train_test_split
df_random_sample, _ = train_test_split(df, test_size=0.2)

In [None]:
df1 = df_random_sample

In [None]:
target = 'bucket'
descriptive_feature = ['program_name', 'original_booked_amount', 'profession', 'car_type', 'user_type', 'age', 'months_loan_opened', 'months_to_close_loan', 'months_client_opened', 'F', 'M', 'outstanding_quantile'],

In [None]:
import io
import requests

In [None]:
#Information gain is used for determining the best features/attributes that render maximum information about a 
#target variable.

import io
import requests

class InformationGain():
    
    def  __init__(self, target, descriptive_feature):
         self.target = target
         self.descriptive_feature = descriptive_feature
        
def compute_impurity(feature, impurity_criterion):
    
    #This function calculates impurity of a feature. Supported impurity criteria: 'entropy', 'gini'.
    
    probs = feature.value_counts(normalize=True)
    
    if impurity_criterion == 'entropy':
        impurity = -1 * np.sum(np.log2(probs) * probs)
    elif impurity_criterion == 'gini':
        impurity = 1 - np.sum(np.square(probs))
    else:
        raise ValueError('Unknown impurity criterion')
        
    return(round(impurity, 3))    

target_entropy = compute_impurity(df[target], 'entropy')
target_entropy

def comp_feature_information_gain(df1, target, descriptive_feature, split_criterion):
    
    # This function calculates information gain for splitting on a particular descriptive feature for a given 
    # dataset and a given impurity criteria.
        
    print('target variable:', target)
    print('descriptive_feature:', descriptive_feature)
    print('split criterion:', split_criterion)
            
    target_entropy = compute_impurity(df1[target], split_criterion)

    # Two lists have been defined:
        # entropy_list to store the entropy of each partition
        # weight_list to store the relative number of observations in each partition
    entropy_list = list()
    weight_list = list()
    
    # Loop over each level of the feature to partition the dataset with respect to that level and compute
    # the entropy and the weight of the level's partition
    
    for level in df[descriptive_feature].unique():
        df_feature_level = df1[df1[descriptive_feature] == level]
        entropy_level = compute_impurity(df_feature_level[target], split_criterion)
        entropy_list.append(round(entropy_level, 3))
        weight_level = len(df_feature_level) / len(df)
        weight_list.append(round(weight_level, 3))

    #print('impurity of partitions:', entropy_list)
    #print('weights of partitions:', weight_list)

    feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
    #print('remaining impurity:', feature_remaining_impurity)
    
    information_gain = target_entropy - feature_remaining_impurity
    print('information gain:', information_gain)
    if information_gain <0.1:
        print("Not good for segmentation. Information Gain ratio not significant for BUCKET variable.")
    else:
        print("Good for segmentation. Pending further analysis")
    
    print('====================')
    
    return(information_gain)

#I have determined that for a variable to be significant for the segmentation, its Information Gain Ratio should be >0.1
#Information gain method obtained from: https://www.kaggle.com/edouarddesprez/health-insurance-auc-0-857

In [None]:
information_gain_list = list()
descriptive_feature_list = list()

split_criteria = 'gini'
for feature in df.drop(columns='bucket').columns:
    feature_info_gain = comp_feature_information_gain(df1, 'bucket', feature, split_criteria)
    information_gain_list.append(feature_info_gain)
    descriptive_feature_list.append(feature)

# Logistic regression model - all variables

In [None]:
target = 'bucket'
all_variables = ['program_name', 'original_booked_amount', 'profession', 'car_type', 'user_type', 'age', 'months_loan_opened', 'months_to_close_loan', 'months_client_opened', 'F', 'M', 'outstanding_quantile']

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
splitter = train_test_split
"-----------------------"

df_train, df_test = splitter(df1, test_size = 0.2, random_state = 42)

In [None]:
X_train = df_train[all_variables]
y_train = df_train[target]

In [None]:
X_test = df_test[all_variables]
y_test = df_test[target]

In [None]:
from sklearn.linear_model import LogisticRegression
method = LogisticRegression(random_state=0)
fitted_full_model = method.fit(X_train, y_train)
y_pred = fitted_full_model.predict(X_test)

In [None]:
y_pred = fitted_full_model.predict_proba(X_test)[:,0]

In [None]:
y_pred

In [None]:
y_test

In [None]:
#GINI Coefficient
from sklearn.metrics import roc_curve, auc
fpr,tpr,thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr,tpr)
GINI = (2*roc_auc) -1
GINI

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
#X_train, X_test,y_train, y_test = train_test_split(x,y,test_size=0.2,random_state = 42)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
#Computing the accuracy of our model
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

## Risk Based Segmentation by variable: 

In [None]:
# Now, we will be analyzing the accuracy of our model dividing our dataset by the variables with higher information
# gain, as ocmputed in the step above. 

### Splitting by age 

In [None]:
for i in range(25, 85, 10):
#segment 1
    df_train_seg1, df_train_seg2 = [x for _, x in df.groupby(df['age'] < i)]
    df_test_seg1, df_test_seg2 = [x for _, x in df.groupby(df['age'] < i)]
    X_train_seg1 = df_train_seg1[all_variables]
    y_train_seg1 = df_train_seg1[target]
    X_test_seg1 = df_test_seg1[all_variables]
    y_test_seg1 = df_test_seg1[target]
    fitted_model_seg1 = method.fit(X_train_seg1, y_train_seg1)
    y_pred_seg1 = fitted_model_seg1.predict(X_test_seg1)
    y_pred_seg1_fullmodel = fitted_full_model.predict(X_test_seg1)

#segment 2
    X_train_seg2 = df_train_seg2[all_variables]
    y_train_seg2 = df_train_seg2[target]
    X_test_seg2 = df_test_seg2[all_variables]
    y_test_seg2 = df_test_seg2[target]
    fitted_model_seg2 = method.fit(X_train_seg2, y_train_seg2)
    y_pred_seg2 = fitted_model_seg2.predict(X_test_seg2)
    y_pred_seg2_fullmodel = fitted_full_model.predict(X_test_seg2)

#printing results
    print ("     ")
    print("Variable analyzed: AGE > or < to", i)
    print("     SEGMENT 1: Model Developed on Seg 1 (train sample) applied on Seg 1 (test sample):",accuracy_score(y_test_seg1, y_pred_seg1))
    print("     SEGMENT 1: Model Developed on Full Population (train sample) applied on Seg 1 (test sample):",accuracy_score(y_test_seg1, y_pred_seg1_fullmodel))
    print ("     ")
    print("     SEGMENT 2: Model Developed on Full Population (train sample) applied on Seg 2 (test sample):",accuracy_score(y_test_seg2, y_pred_seg2_fullmodel))
    print("     SEGMENT 2: Model Developed on Seg 2 (train sample) applied on Seg 2 (test sample):",accuracy_score(y_test_seg2, y_pred_seg2))
    print ("     ")
    print ("     ==============")

### Splitting by original_booked_amount 

In [None]:
for i in range(500000, 2000000, 500000):
#segment 1
    df_train_seg1, df_train_seg2 = [x for _, x in df.groupby(df['original_booked_amount'] < i)]
    df_test_seg1, df_test_seg2 = [x for _, x in df.groupby(df['original_booked_amount'] < i)]
    X_train_seg1 = df_train_seg1[all_variables]
    y_train_seg1 = df_train_seg1[target]
    X_test_seg1 = df_test_seg1[all_variables]
    y_test_seg1 = df_test_seg1[target]
    fitted_model_seg1 = method.fit(X_train_seg1, y_train_seg1)
    y_pred_seg1 = fitted_model_seg1.predict(X_test_seg1)
    y_pred_seg1_fullmodel = fitted_full_model.predict(X_test_seg1)

#segment 2
    X_train_seg2 = df_train_seg2[all_variables]
    y_train_seg2 = df_train_seg2[target]
    X_test_seg2 = df_test_seg2[all_variables]
    y_test_seg2 = df_test_seg2[target]
    fitted_model_seg2 = method.fit(X_train_seg2, y_train_seg2)
    y_pred_seg2 = fitted_model_seg2.predict(X_test_seg2)
    y_pred_seg2_fullmodel = fitted_full_model.predict(X_test_seg2)

#printing results
    print ("     ")
    print("Variable analyzed: ORIGINAL_BOOKED_AMOUNT > or < to", i)
    print("     SEGMENT 1: Model Developed on Seg 1 (train sample) applied on Seg 1 (test sample):",accuracy_score(y_test_seg1, y_pred_seg1))
    print("     SEGMENT 1: Model Developed on Full Population (train sample) applied on Seg 1 (test sample):",accuracy_score(y_test_seg1, y_pred_seg1_fullmodel))
    print ("     ")
    print("     SEGMENT 2: Model Developed on Full Population (train sample) applied on Seg 2 (test sample):",accuracy_score(y_test_seg2, y_pred_seg2_fullmodel))
    print("     SEGMENT 2: Model Developed on Seg 2 (train sample) applied on Seg 2 (test sample):",accuracy_score(y_test_seg2, y_pred_seg2))
    print ("     ")
    print ("     ==============")

### Splitting by months_loan_opened 

In [None]:
for i in range(12, 60, 12):
#segment 1
    df_train_seg1, df_train_seg2 = [x for _, x in df.groupby(df['months_loan_opened'] < i)]
    df_test_seg1, df_test_seg2 = [x for _, x in df.groupby(df['months_loan_opened'] < i)]
    X_train_seg1 = df_train_seg1[all_variables]
    y_train_seg1 = df_train_seg1[target]
    X_test_seg1 = df_test_seg1[all_variables]
    y_test_seg1 = df_test_seg1[target]
    fitted_model_seg1 = method.fit(X_train_seg1, y_train_seg1)
    y_pred_seg1 = fitted_model_seg1.predict(X_test_seg1)
    y_pred_seg1_fullmodel = fitted_full_model.predict(X_test_seg1)

#segment 2
    X_train_seg2 = df_train_seg2[all_variables]
    y_train_seg2 = df_train_seg2[target]
    X_test_seg2 = df_test_seg2[all_variables]
    y_test_seg2 = df_test_seg2[target]
    fitted_model_seg2 = method.fit(X_train_seg2, y_train_seg2)
    y_pred_seg2 = fitted_model_seg2.predict(X_test_seg2)
    y_pred_seg2_fullmodel = fitted_full_model.predict(X_test_seg2)

#printing results
    print ("     ")
    print("Variable analyzed: MONTHS_LOAN_OPENED > or < to", i)
    print("     SEGMENT 1: Model Developed on Seg 1 (train sample) applied on Seg 1 (test sample):",accuracy_score(y_test_seg1, y_pred_seg1))
    print("     SEGMENT 1: Model Developed on Full Population (train sample) applied on Seg 1 (test sample):",accuracy_score(y_test_seg1, y_pred_seg1_fullmodel))
    print ("     ")
    print("     SEGMENT 2: Model Developed on Full Population (train sample) applied on Seg 2 (test sample):",accuracy_score(y_test_seg2, y_pred_seg2_fullmodel))
    print("     SEGMENT 2: Model Developed on Seg 2 (train sample) applied on Seg 2 (test sample):",accuracy_score(y_test_seg2, y_pred_seg2))
    print ("     ")
    print ("     ==============")

### Splitting by months_client_opened 

In [None]:
for i in range(12, 60, 12):
#segment 1
    df_train_seg1, df_train_seg2 = [x for _, x in df.groupby(df['months_client_opened'] < i)]
    df_test_seg1, df_test_seg2 = [x for _, x in df.groupby(df['months_client_opened'] < i)]
    X_train_seg1 = df_train_seg1[all_variables]
    y_train_seg1 = df_train_seg1[target]
    X_test_seg1 = df_test_seg1[all_variables]
    y_test_seg1 = df_test_seg1[target]
    fitted_model_seg1 = method.fit(X_train_seg1, y_train_seg1)
    y_pred_seg1 = fitted_model_seg1.predict(X_test_seg1)
    y_pred_seg1_fullmodel = fitted_full_model.predict(X_test_seg1)

#segment 2
    X_train_seg2 = df_train_seg2[all_variables]
    y_train_seg2 = df_train_seg2[target]
    X_test_seg2 = df_test_seg2[all_variables]
    y_test_seg2 = df_test_seg2[target]
    fitted_model_seg2 = method.fit(X_train_seg2, y_train_seg2)
    y_pred_seg2 = fitted_model_seg2.predict(X_test_seg2)
    y_pred_seg2_fullmodel = fitted_full_model.predict(X_test_seg2)

#printing results
    print ("     ")
    print("Variable analyzed: MONTHS_CLIENT_OPENED > or < to", i)
    print("     SEGMENT 1: Model Developed on Seg 1 (train sample) applied on Seg 1 (test sample):",accuracy_score(y_test_seg1, y_pred_seg1))
    print("     SEGMENT 1: Model Developed on Full Population (train sample) applied on Seg 1 (test sample):",accuracy_score(y_test_seg1, y_pred_seg1_fullmodel))
    print ("     ")
    print("     SEGMENT 2: Model Developed on Full Population (train sample) applied on Seg 2 (test sample):",accuracy_score(y_test_seg2, y_pred_seg2_fullmodel))
    print("     SEGMENT 2: Model Developed on Seg 2 (train sample) applied on Seg 2 (test sample):",accuracy_score(y_test_seg2, y_pred_seg2))
    print ("     ")
    print ("     ==============")

# Execution summary report 

## Missing not at random report

In [None]:
print("The variables PROFESSION, SEX and BIRTHDATE (converted to AGE) seem Missing Not at Random, therefore we recommend:")
print('\033[1m Thin File Segment Variables: \033[0m',"REPORTING_DATE, ACCOUNT_NUMBER, CUSTOMER_ID, PROGRAM_NAME, LOAN_OPEN_DATE, EXPECTED_CLOSE_DATE, ORIGINAL_BOOKED_AMOUNT, OUTSTANDING, CUSTOMER_OPEN_DATE, CAR_TYPE")
print('     ')
print('\033[1m Full File Segment Variables: \033[0m',"REPORTING_DATE, ACCOUNT_NUMBER, CUSTOMER_ID, PROGRAM_NAME, LOAN_OPEN_DATE, EXPECTED_CLOSE_DATE, ORIGINAL_BOOKED_AMOUNT, OUTSTANDING, SEX, CUSTOMER_OPEN_DATE, BIRTH_DATE, PROFESSION, CAR_TYPE")


## Variable by Variable Risk Based Segmentation Analysis

In [None]:
print("\033[1m REPORTING_DATE \033[0m Not good for segmentation. Afer analysis, we did not find a good split using this variable.")
print ("     ")
print ("\033[1m ACCOUNT_NUMBER \033[0m Not good for segmentation. Afer analysis, we did not find a good split using this variable.")
print ("     ")
print ("\033[1m CUSTOMER_ID \033[0m Not good for segmentation. Afer analysis, we did not find a good split using this variable.")
print ("     ")
print("\033[1m PROGRAM_NAME \033[0m Not good for segmentation. Afer analysis, we did not find a good split using this variable.")
print ("     ")
print ("\033[1m LOAN_OPEN_DATE \033[0m Good for segmentation.")
print("     Segment1: MONTHS_LOAN_OPENED < '12' [Accuracy Full Model: 80% / Accuracy Segmented Model: 80%]")
print("     Segment2: MONTHS_LOAN_OPENED >= '12' [Accuracy Full Model: 88% / Accuracy Segmented Model: 88%]")
print ("     ")
print("\033[1m EXPECTED_CLOSE_DATE \033[0m Not good for segmentation. Afer analysis, we did not find a good split using this variable.")
print ("     ")
print ("\033[1m ORIGINAL_BOOKED_AMOUNT \033[0m Good for segmentation.")
print("     Segment1: ORIGINAL_BOOKED_AMOUNT < '1.000.000' [Accuracy Full Model: 99% / Accuracy Segmented Model: 99%]")
print("     Segment2: ORIGINAL_BOOKED_AMOUNT >= '1.000.000' [Accuracy Full Model: 83% / Accuracy Segmented Model: 83%]")
print ("     ")
print("\033[1m OUTSTANDING \033[0m Not good for segmentation. Afer analysis, we did not find a good split using this variable.")
print ("     ")
print ("\033[1m SEX \033[0m Not good for segmentation. Afer analysis, we did not find a good split using this variable.")
print ("     ")
print ("\033[1m CUSTOMER_OPEN_DATE \033[0m Good for segmentation.")
print("     Segment1: MONTHS_CLIENT_OPENED < '12' [Accuracy Full Model: 80% / Accuracy Segmented Model: 80%]")
print("     Segment2: MONTHS_CLIENT_OPENED >= '12' [Accuracy Full Model: 88% / Accuracy Segmented Model: 88%]")
print ("     ")
print ("\033[1m BIRTH_DATE \033[0m Good for segmentation.")
print("     Segment1: AGE < '65' [Accuracy Full Model: 87% / Accuracy Segmented Model: 87%]")
print("     Segment2: AGE >= '65' [Accuracy Full Model: 83% / Accuracy Segmented Model: 83%]")
print ("     ")
print("\033[1m PROFESSION \033[0m Not good for segmentation. Afer analysis, we did not find a good split using this variable.")
print ("     ")
print("\033[1m CAR_TYPE \033[0m Not good for segmentation. Afer analysis, we did not find a good split using this variable.")

# Decission Trees Test #1 for the Group assignment

In [None]:
target = 'bucket'
all_variables = ['program_name', 'original_booked_amount', 'profession', 'car_type', 'user_type', 'age', 'months_loan_opened', 'months_to_close_loan', 'months_client_opened', 'F', 'M', 'outstanding_quantile']

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
splitter = train_test_split
"-----------------------"

df_train, df_test = splitter(df, test_size = 0.2, random_state = 42)

In [None]:
from sklearn import tree
from matplotlib import pyplot as plt

X = df_train[all_variables]
Y = df_train[target]

#build decision tree
clf = tree.DecisionTreeClassifier(criterion='gini', max_depth=4,min_samples_leaf=4)
#max_depth represents max level allowed in each tree, min_samples_leaf minumum samples storable in leaf node

#fit the tree to iris dataset
clf.fit(X,Y)

#plot decision tree
fig, ax = plt.subplots(figsize=(100, 20)) #figsize value changes the size of plot
tree.plot_tree(clf,ax=ax,feature_names= ['program_name', 'original_booked_amount', 'profession', 'car_type', 'user_type', 'age', 'months_loan_opened', 'months_to_close_loan', 'months_client_opened', 'F', 'M', 'outstanding_quantile'])
plt.show()



In [None]:
#As we can see in the decission tree above, the most relevant variables that affect our target variable "BUCKET" are:
    #months loan opened (being the threshold 14.013)
    #age( being the threshold 39.088)
    #months_to_close_loan (being the threshold)
    #car_type -> does not make sense since the threshold is established in 4.5 but it should be a categorical variable
    #

### Splitting by months_loan_opened 

In [None]:
X_train = df_train[all_variables]
y_train = df_train[target]

In [None]:
X_test = df_test[all_variables]
y_test = df_test[target]

In [None]:
from sklearn.linear_model import LogisticRegression
method = LogisticRegression(random_state=0)
fitted_full_model = method.fit(X_train, y_train)
y_pred = fitted_full_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
df_train_seg1 = df_train[df['months_loan_opened'] <14.013]
df_train_seg2 = df_train[df['months_loan_opened'] >14.013]
df_test_seg1 = df_test[df['months_loan_opened'] <14.013]
df_test_seg2 = df_test[df['months_loan_opened'] >14.013]

In [None]:
X_train_seg1 = df_train_seg1[all_variables]
y_train_seg1 = df_train_seg1[target]
X_test_seg1 = df_test_seg1[all_variables]
y_test_seg1 = df_test_seg1[target]
fitted_model_seg1 = method.fit(X_train_seg1, y_train_seg1)

def GINI(y_test, y_pred_probadbility):
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_probadbility)
    roc_auc = auc(fpr, tpr)
    GINI = (2 * roc_auc) - 1
    return(GINI)

y_pred_seg1_proba = fitted_model_seg1.predict_proba(X_test_seg1)[:,1]
y_pred_seg1_fullmodel_proba = fitted_full_model.predict_proba(X_test_seg1)[:,1]

print("Segment1: months_loan_opened <14.013 [GINI Full Model: {:.4f}% / GINI Segmented Model: {:.4f}%]".format(
    GINI(y_test_seg1, y_pred_seg1_proba)*100,
    GINI(y_test_seg1, y_pred_seg1_fullmodel_proba)*100
)) 

In [None]:
X_train_seg2 = df_train_seg2[all_variables]
y_train_seg2 = df_train_seg2[target]
X_test_seg2 = df_test_seg2[all_variables]
y_test_seg2 = df_test_seg2[target]
fitted_model_seg2 = method.fit(X_train_seg2, y_train_seg2)

def GINI(y_test, y_pred_probadbility):
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_probadbility)
    roc_auc = auc(fpr, tpr)
    GINI = (2 * roc_auc) - 1
    return(GINI)

y_pred_seg2_proba = fitted_model_seg2.predict_proba(X_test_seg2)[:,1]
y_pred_seg2_fullmodel_proba = fitted_full_model.predict_proba(X_test_seg2)[:,1]

print("Segment2: months_loan_opened >14.013 [GINI Full Model: {:.4f}% / GINI Segmented Model: {:.4f}%]".format(
    GINI(y_test_seg2, y_pred_seg2_proba)*100,
    GINI(y_test_seg2, y_pred_seg2_fullmodel_proba)*100
)) 

### Splitting by age 

In [None]:
df_train_seg1 = df_train[df['age'] <39.088]
df_train_seg2 = df_train[df['age'] >39.088]
df_test_seg1 = df_test[df['age'] <39.088]
df_test_seg2 = df_test[df['age'] >39.088]

In [None]:
X_train_seg1 = df_train_seg1[all_variables]
y_train_seg1 = df_train_seg1[target]
X_test_seg1 = df_test_seg1[all_variables]
y_test_seg1 = df_test_seg1[target]
fitted_model_seg1 = method.fit(X_train_seg1, y_train_seg1)

def GINI(y_test, y_pred_probadbility):
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_probadbility)
    roc_auc = auc(fpr, tpr)
    GINI = (2 * roc_auc) - 1
    return(GINI)

y_pred_seg1_proba = fitted_model_seg1.predict_proba(X_test_seg1)[:,1]
y_pred_seg1_fullmodel_proba = fitted_full_model.predict_proba(X_test_seg1)[:,1]

print("Segment1: age <39 [GINI Full Model: {:.4f}% / GINI Segmented Model: {:.4f}%]".format(
    GINI(y_test_seg1, y_pred_seg1_proba)*100,
    GINI(y_test_seg1, y_pred_seg1_fullmodel_proba)*100
)) 

In [None]:
X_train_seg2 = df_train_seg2[all_variables]
y_train_seg2 = df_train_seg2[target]
X_test_seg2 = df_test_seg2[all_variables]
y_test_seg2 = df_test_seg2[target]
fitted_model_seg2 = method.fit(X_train_seg2, y_train_seg2)

def GINI(y_test, y_pred_probadbility):
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_probadbility)
    roc_auc = auc(fpr, tpr)
    GINI = (2 * roc_auc) - 1
    return(GINI)

y_pred_seg2_proba = fitted_model_seg2.predict_proba(X_test_seg2)[:,1]
y_pred_seg2_fullmodel_proba = fitted_full_model.predict_proba(X_test_seg2)[:,1]

print("Segment2: age >39 [GINI Full Model: {:.4f}% / GINI Segmented Model: {:.4f}%]".format(
    GINI(y_test_seg2, y_pred_seg2_proba)*100,
    GINI(y_test_seg2, y_pred_seg2_fullmodel_proba)*100
)) 