In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

# увеличиваем количество отображаемых столбцов
pd.set_option('display.max_columns', 900)
# pd.set_option('display.max_rows', 100000)

### __Список исходных переменных включает в себя:__
- __acc_now_delinq__ - Количество просроченных счетов заемщика
- __addr_state__ - Штат, указанный заемщиком в заявке на кредит
- __annual_inc__ - Ежегодный доход, заявленный заемщиком при регистрации.
- __chargeoff_within_12_mths__ - Количество списаний за 12 месяцев.
- __collections_12_mths_ex_med__ - Количество сборов за 12 месяцев без учета медицинских сборов
- __delinq_2yrs__ - Количество случаев просрочки платежа более чем на 30 дней в кредитной истории заемщика за последние 2 года.
- __dti__ - Коэффициент, рассчитанный с использованием общей суммы ежемесячных выплат заемщика по общей сумме долговых обязательств, исключая ипотеку и запрошенный кредит LC, разделенный на ежемесячный доход, о котором заемщик сообщает самостоятельно.
- __earliest_cr_line__ - Месяц, когда заемщику была открыта самая ранняя кредитная линия, о которой сообщалось.
- __emp_length__ - Стаж работы в годах. Возможные значения: от 0 до 10, где 0 означает менее одного года, а 10 означает десять или более лет.
- __fico_range_high__ - Верхний граничный диапазон, к которому принадлежит FICO заемщика при выдаче кредита.
- __fico_range_low__ - Нижний граничный диапазон, к которому принадлежит FICO заемщика при выдаче кредита.
- __funded_amnt__ - Общая сумма, выделенная на этот кредит на тот момент.
- __home_ownership__ - Статус домовладения, предоставленный заемщиком при регистрации или полученный из кредитного отчета. АРЕНДА, СОБСТВЕННОСТЬ, ИПОТЕКА, ДРУГОЕ
- __inq_last_12m__ - Количество кредитных запросов за последние 12 месяцев
- __installment__ - Ежемесячный платеж, причитающийся заемщику в случае выдачи кредита.
- __int_rate__ - Процентная ставка по кредиту
- __issue_d__ - Месяц, в котором был предоставлен кредит.
- __loan_amnt__ - Указанная сумма кредита, заявленная заемщиком. Если в какой-то момент кредитный отдел уменьшит сумму кредита, то это отразится на этой стоимости.
- __mort_acc__ - Количество ипотечных счетов.
- __mths_since_last_delinq__ - Количество месяцев с момента последней просрочки заемщика.
- __mths_since_recent_bc_dlq__ - Количество месяцев с момента последней просрочки по банковской карте.
- __mths_since_recent_inq__ - Прошло несколько месяцев с момента последнего запроса.
- __num_accts_ever_120_pd__ - Количество счетов, просроченных на 120 или более дней.
- __num_actv_bc_tl__ - Количество активных на данный момент счетов банковских карт.
- __num_rev_accts__ - Количество возобновляемых счетов.
- __num_sats__ - Количество удовлетворительных счетов
- __num_tl_120dpd_2m__ - Количество счетов с просрочкой платежа на 120 дней (обновлено за последние 2 месяца).
- __num_tl_30dpd__ - Количество счетов с просрочкой платежа на 30 дней (обновлено за последние 2 месяца).
- __num_tl_90g_dpd_24m__ - Количество счетов с просрочкой платежа на 90 или более дней за последние 24 месяца.
- __num_tl_op_past_12m__ - Количество счетов, открытых за последние 12 месяцев.
- __open_acc__ - Количество открытых кредитных линий в кредитном досье заемщика.
- __open_il_24m__ - Количество счетов в рассрочку, открытых за последние 24 месяца.
- __open_rv_24m__ - Количество возобновляемых сделок, открытых за последние 24 месяца.
- __percent_bc_gt_75__ - Процент всех счетов банковских карт > 75% от лимита.
- __pub_rec__ - Количество уничижительных публичных записей.
- __pub_rec_bankruptcies__ - Количество официально зарегистрированных банкротств
- __purpose__ - Категория, предоставленная заемщиком для запроса на кредит.
- __revol_util__ - Коэффициент использования возобновляемой линии или сумма кредита, которую заемщик использует по отношению ко всему доступному возобновляемому кредиту.
- __tax_liens__ - Количество налоговых залогов.
- __term__ - Количество платежей по кредиту. Значения указаны в месяцах и могут быть 36 или 60.
- __title__ - Название кредита, предоставленное заемщиком.
- __total_acc__ - Общее количество кредитных линий, находящихся в настоящее время в кредитном файле заемщика.
- __verification_status__ - Указывает, был ли доход подтвержден LC, не подтвержден или был ли подтвержден источник дохода.
- __zip_code;__ - Первые 3 цифры почтового индекса, указанного заемщиком в заявке на кредит.

## __Считывание CSV файла в DataFrame__

In [2]:
# загружаем файл с данными
X_train = pd.read_csv('./X_train.csv', encoding = 'utf8', sep = ',')
X_train.head(3)

Unnamed: 0,index,acc_now_delinq,addr_state,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,earliest_cr_line,emp_length,fico_range_high,fico_range_low,funded_amnt,home_ownership,inq_last_12m,installment,int_rate,issue_d,loan_amnt,mort_acc,mths_since_last_delinq,mths_since_recent_bc_dlq,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_rev_accts,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,open_acc,open_il_24m,open_rv_24m,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,purpose,revol_util,tax_liens,term,title,total_acc,verification_status,zip_code;
0,0,0.0,UT,80000.0,0.0,0.0,0.0,30.49,Oct-1996,4 years,689.0,685.0,16000.0,RENT,1.0,539.03,12.99%,Apr-2016,16000.0,5.0,50.0,65.0,6.0,3.0,4.0,17.0,14.0,0.0,0.0,0.0,3.0,14.0,2.0,3.0,100.0,1.0,1.0,credit_card,56.1%,0.0,36 months,Credit card refinancing,26.0,Source Verified,847xx;
1,1,0.0,CA,82000.0,0.0,0.0,0.0,7.0,Mar-1993,10+ years,689.0,685.0,6600.0,MORTGAGE,0.0,214.27,10.42%,Sep-2017,6600.0,2.0,,,,0.0,3.0,8.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,100.0,1.0,1.0,credit_card,88.3%,0.0,36 months,Credit card refinancing,11.0,Not Verified,900xx;
2,2,0.0,NV,46080.0,0.0,0.0,1.0,17.32,Nov-2000,3 years,674.0,670.0,10000.0,MORTGAGE,0.0,317.96,8.99%,Aug-2016,10000.0,3.0,10.0,,13.0,0.0,1.0,23.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,3.0,0.0,0.0,0.0,credit_card,18.1%,0.0,36 months,Credit card refinancing,31.0,Source Verified,895xx;


In [3]:
X_train.shape

(199999, 45)

In [4]:
# загружаем файл с данными
y_train = pd.read_csv('./y_train.csv', encoding = 'utf8', sep = ',')
y_train.head(10)

Unnamed: 0,index,loan_status
0,0,1.0
1,1,
2,2,1.0
3,3,
4,4,
5,5,
6,6,
7,7,
8,8,
9,9,


In [5]:
y_train.shape

(199999, 2)

## __ПЛАН ПРЕДВАРИТЕЛЬНОЙ ПОДГОТОВКИ ДАННЫХ__

__До разбиения набора данных на обучающую и контрольную выборки__
- Удаление очевидных бесполезных переменных (переменных у которых количество категорий совпадает с количеством наблюдений, или переменных с одним уникальным значением;
- Импутация пропусков, которую можно выполнить до разбиения на обучающую и контрольную выборки;
- Преобразование типов данных;
- Нормализация строковых значений (удаление лишних символов, приведение к одному и тому же регистру);
- Обработка дублирующих наблюдений;


In [6]:
# выводим информацию о количестве непропущенных наблюдений в переменных и типах переменных
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 45 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   index                       199999 non-null  object 
 1   acc_now_delinq              199873 non-null  float64
 2   addr_state                  199873 non-null  object 
 3   annual_inc                  199873 non-null  float64
 4   chargeoff_within_12_mths    199869 non-null  float64
 5   collections_12_mths_ex_med  199869 non-null  float64
 6   delinq_2yrs                 199873 non-null  float64
 7   dti                         199873 non-null  float64
 8   earliest_cr_line            199873 non-null  object 
 9   emp_length                  187808 non-null  object 
 10  fico_range_high             199873 non-null  float64
 11  fico_range_low              199873 non-null  float64
 12  funded_amnt                 199873 non-null  float64
 13  home_ownership

### __Удаление очевидных бесполезных переменных (переменных у которых количество категорий совпадает с количеством наблюдений, или переменных с одним уникальным значением)__

In [7]:
# Удаляем очевидную бесполезную переменную index (количество категорий совпадает с количеством наблюдений)
X_train.drop('index', axis = 1, inplace = True)

### __Изучение данных по каждой категории на наличие переменных с одним уникальным значением и правильности выбранного типа данных__

In [8]:
# выведем уникальные значения по переменной acc_now_delinq с помощью метода .unique()
X_train['acc_now_delinq'].unique()

array([ 0.,  1., nan,  2.,  3.,  5.])

In [9]:
# выведем уникальные значения по переменной addr_state 
X_train['addr_state'].unique()

# теперь с помощью метода .nunique() выведем количество уникальных значений переменной чтобы убедиться в отсутствии 
# неверных записей
X_train['addr_state'].nunique()

50

In [10]:
X_train['annual_inc'].unique()

array([  80000. ,   82000. ,   46080. , ..., 1121869. ,   75643.2,
         65245. ])

In [11]:
X_train['chargeoff_within_12_mths'].unique()

array([ 0.,  1., nan,  2.,  4.,  3.,  5.,  6., 10.,  7.])

In [12]:
X_train['collections_12_mths_ex_med'].unique()

array([ 0.,  1., nan,  2.,  3.,  4., 10.,  7.,  6.,  5., 20.])

In [13]:
X_train['collections_12_mths_ex_med'].unique()

array([ 0.,  1., nan,  2.,  3.,  4., 10.,  7.,  6.,  5., 20.])

In [14]:
X_train['delinq_2yrs'].unique()

array([ 0.,  1.,  2.,  4.,  3.,  5., nan,  6.,  8., 10.,  7.,  9., 11.,
       12., 13., 16., 14., 15., 21., 30., 18., 19., 17., 27.])

In [15]:
X_train['dti'].unique()

array([30.49,  7.  , 17.32, ..., 42.46, 40.73, 43.25])

In [16]:
X_train['earliest_cr_line'].unique()

array(['Oct-1996', 'Mar-1993', 'Nov-2000', 'Jan-2010', 'Jan-2002',
       'Apr-1993', 'Apr-2001', 'Sep-1995', 'Nov-1991', 'Sep-1996',
       'Aug-1998', 'May-1992', 'Apr-1989', 'Feb-2008', 'May-1999',
       'May-1995', 'Feb-2004', 'Sep-2010', 'Jun-2004', 'Mar-2001',
       'Mar-1997', 'Jan-2007', 'Jul-2012', 'Sep-2001', 'Dec-1997',
       'Mar-2000', 'May-1988', 'Apr-1986', 'Aug-2002', 'Dec-1996',
       'Nov-1994', 'Jul-1998', 'May-2002', 'Oct-2002', 'Sep-2003',
       'Dec-2004', 'Apr-1988', 'Sep-1999', 'Feb-2001', 'Sep-1993',
       'Apr-2005', 'May-2001', 'Aug-2000', 'Jan-2004', 'Mar-2005',
       'Oct-2004', 'Apr-1994', 'Oct-2001', 'Jan-2001', 'Feb-2005',
       'Sep-2009', 'Jan-2005', 'Jan-1986', 'Apr-2006', 'Jul-1993',
       'Jul-2000', 'Mar-1994', 'Nov-1985', 'Feb-2006', 'Nov-1997',
       'Aug-2012', 'Dec-1992', 'Mar-1998', 'Feb-1993', 'Oct-1983',
       'Dec-2001', 'Nov-1998', 'Jul-1984', 'Sep-2007', 'Nov-2002',
       'Nov-1995', 'Oct-2005', 'May-1986', 'Nov-1999', 'Oct-20

In [17]:
X_train['emp_length'].unique()

array(['4 years', '10+ years', '3 years', '< 1 year', '5 years', '1 year',
       '7 years', '8 years', '2 years', '9 years', nan, '6 years'],
      dtype=object)

In [18]:
X_train['fico_range_high'].unique()

array([689., 674., 694., 704., 664., 699., 679., 714., 684., 759., 669.,
       774., 754., 724., 729., 744., 749., 739., 709., 814., 764., 719.,
       779., 784., 794., 734., 789., 769., 799.,  nan, 834., 809., 804.,
       819., 824., 844., 829., 839., 850.])

In [19]:
X_train['fico_range_low'].unique()

array([685., 670., 690., 700., 660., 695., 675., 710., 680., 755., 665.,
       770., 750., 720., 725., 740., 745., 735., 705., 810., 760., 715.,
       775., 780., 790., 730., 785., 765., 795.,  nan, 830., 805., 800.,
       815., 820., 840., 825., 835., 845.])

In [20]:
X_train['funded_amnt'].unique()

array([16000.,  6600., 10000., ..., 35325., 39350., 37925.])

In [21]:
X_train['home_ownership'].unique()

array(['RENT', 'MORTGAGE', 'OWN', nan, 'OTHER', 'ANY', 'NONE'],
      dtype=object)

In [22]:
X_train['inq_last_12m'].unique()

array([ 1.,  0., nan,  2.,  3.,  7.,  4.,  6.,  5.,  8., 12., 11., 13.,
       10.,  9., 14., 18., 16., 31., 17., 27., 15., 21., 23., 20., 26.,
       22., 19., 34., 39., 24., 30., 45., 25., 28., 41., 29.])

In [23]:
X_train['installment'].unique()

array([539.03, 214.27, 317.96, ..., 233.13, 725.29, 968.42])

In [24]:
X_train['int_rate'].unique()

array([' 12.99%', ' 10.42%', '  8.99%', ' 11.53%', ' 26.30%', ' 12.12%',
       ' 14.65%', ' 13.59%', '  8.18%', ' 13.49%', ' 24.50%', ' 15.31%',
       ' 18.99%', ' 17.57%', ' 13.11%', '  6.24%', ' 13.33%', ' 11.99%',
       ' 13.67%', '  5.32%', '  6.03%', ' 14.49%', ' 14.99%', ' 13.56%',
       '  8.24%', ' 24.99%', '  9.49%', '  6.89%', ' 14.31%', ' 12.39%',
       '  9.16%', '  7.89%', ' 13.68%', '  9.17%', ' 12.49%', ' 10.64%',
       ' 11.49%', ' 14.09%', ' 12.85%', ' 13.98%', '  8.90%', ' 13.53%',
       ' 14.47%', ' 16.99%', '  8.46%', '  6.46%', ' 12.69%', ' 11.39%',
       ' 12.79%', ' 15.80%', ' 21.45%', ' 11.44%', ' 10.99%', ' 12.62%',
       ' 20.99%', ' 18.25%', '  8.67%', ' 15.41%', ' 11.47%', '  7.99%',
       ' 23.05%', ' 10.49%', '  8.59%', ' 14.08%', ' 15.05%', ' 10.16%',
       ' 22.45%', '  9.99%', ' 10.41%', ' 10.47%', ' 12.73%', '  7.26%',
       ' 16.02%', ' 15.59%', ' 11.71%', ' 12.35%', ' 13.99%', ' 22.99%',
       ' 22.39%', '  9.43%', ' 15.61%', ' 12.98%', 

In [25]:
X_train['issue_d'].unique()

array(['Apr-2016', 'Sep-2017', 'Aug-2016', 'Aug-2015', 'May-2013',
       'Mar-2015', 'May-2017', 'Oct-2015', 'Mar-2017', 'Oct-2014',
       'Mar-2016', 'Dec-2016', 'Apr-2015', 'Apr-2013', 'Sep-2015',
       'Feb-2015', 'Oct-2017', 'Oct-2013', 'Jun-2016', 'Sep-2013',
       'Apr-2017', 'Sep-2018', 'Jul-2014', 'Oct-2016', 'Nov-2014',
       'Jul-2015', 'Jan-2015', 'May-2014', 'Dec-2015', 'Feb-2013',
       'Dec-2013', 'Nov-2012', 'Jan-2014', 'Aug-2018', 'Jul-2019',
       'Jul-2016', 'Mar-2014', 'Jan-2017', 'Nov-2015', 'Aug-2014',
       'Dec-2018', 'Aug-2019', 'Dec-2014', 'Jun-2017', 'Nov-2016',
       'Sep-2012', 'Mar-2018', 'Jul-2018', 'Jun-2015', 'Jul-2017',
       'Dec-2011', 'Apr-2014', 'Feb-2018', 'Jun-2014', 'Mar-2013',
       'Sep-2016', 'Jun-2018', 'May-2019', 'Jul-2012', 'Nov-2017',
       'Aug-2017', 'Oct-2018', 'May-2016', 'Jan-2012', 'Jan-2016',
       'Feb-2016', 'Feb-2020', 'May-2018', 'Mar-2011', 'Feb-2017',
       'Feb-2014', 'Jul-2010', 'Jan-2019', 'Apr-2018', 'Apr-20

In [26]:
X_train['loan_amnt'].unique()

array([16000.,  6600., 10000., ..., 35325., 39350., 37925.])

In [27]:
X_train['mort_acc'].unique()

array([ 5.,  2.,  3.,  0.,  1.,  7.,  4.,  6.,  8., nan, 22., 11.,  9.,
       10., 19., 12., 17., 24., 13., 14., 15., 18., 16., 27., 20., 34.,
       21., 23.])

In [28]:
X_train['mths_since_last_delinq'].unique()

array([ 50.,  nan,  10.,  24.,  51.,  13.,  46.,  72.,   5.,  15.,  33.,
        27.,  32.,  37.,   7.,   9.,  11.,  23.,  22.,  59.,  76.,  78.,
        67.,  31.,  66.,   3.,   8.,  41.,   0.,  34.,  21.,  42.,  45.,
        53.,  62.,  35.,   2.,   4.,  17.,  14.,  48.,  71.,  74.,  36.,
        57.,  29.,  56.,  43.,   6.,  65.,  81.,  19.,  69.,  77.,  30.,
        60.,  47.,  61.,  12.,  54.,  18.,  16.,  52.,  82.,  79.,  28.,
        70.,  20.,  26.,  75.,  73.,  39.,  63.,  25.,  80.,  40.,  83.,
        68.,  44.,  55.,  38.,  49.,   1.,  64.,  96.,  58.,  84.,  91.,
       112.,  88., 158.,  87.,  85.,  98., 188., 102.,  97.,  93., 101.,
        94.,  86., 103., 122., 126.,  92., 110., 121., 130., 113., 108.,
       111., 120.,  90., 124., 105., 115.,  99., 114.,  95., 100., 140.,
       104.,  89., 133., 106., 131., 123., 149., 116., 125., 146., 135.,
       119., 118., 107., 109.])

In [29]:
X_train['mths_since_recent_bc_dlq'].unique()

array([ 65.,  nan,  41.,  46.,  69.,  16.,  67.,  37.,  11.,  23.,  66.,
         5.,  29.,  21.,  79.,  13.,  24.,  34.,  38.,  60.,   7.,  35.,
        39.,  95.,  14.,  62.,  51.,  36.,  56.,  32.,  81.,  80.,  43.,
        61.,  54.,  18.,  30.,  48.,  82.,   4.,  72.,  28.,  74.,  47.,
        17.,  19.,  10.,   3.,  78.,  15.,   9.,  64.,  57.,  40.,  22.,
        25.,  31.,  33.,  99.,  42.,  71.,   6.,  76.,  58.,  50.,  75.,
         8.,  12.,   0.,  45.,  83.,  27.,  68.,  63.,   2.,  55.,  49.,
        77.,  70.,  26.,  20.,  44.,  96.,  73.,  52.,  84.,  59.,  53.,
       124.,  98.,   1., 101.,  87.,  86.,  85.,  88., 158., 107., 120.,
       105., 112.,  97.,  93.,  94., 111., 194.,  89., 103.,  91., 162.,
       122., 126., 121., 102.,  90., 154.,  92., 109., 108., 100., 117.,
       114., 106., 113., 125., 110., 131., 119., 140., 116., 133., 104.,
       128., 123., 134., 130., 149., 156., 146., 135.])

In [30]:
X_train['mths_since_recent_inq'].unique()

array([ 6., nan, 13., 16., 15., 11.,  9.,  4., 23.,  1.,  0., 22.,  7.,
        5.,  8.,  2., 10., 12.,  3., 14., 21., 19., 24., 17., 18., 20.,
       25.])

In [31]:
X_train['num_accts_ever_120_pd'].unique()

array([ 3.,  0.,  1.,  2., nan,  8.,  4.,  5.,  9.,  6.,  7., 19., 11.,
       15., 12., 10., 13., 21., 14., 18., 29., 22., 26., 37., 17., 28.,
       16., 27., 20., 24., 30., 23., 34., 36., 38., 25., 32.])

In [32]:
X_train['num_actv_bc_tl'].unique()

array([ 4.,  3.,  1.,  2.,  6.,  5.,  9.,  0., 11., 12.,  7., 10.,  8.,
       nan, 14., 13., 16., 15., 32., 18., 17., 24., 19., 20., 22., 30.,
       21., 25., 23., 45., 27.])

In [33]:
X_train['num_rev_accts'].unique()

array([ 17.,   8.,  23.,  11.,   3.,   7.,  33.,  22.,  10.,  20.,   9.,
        19.,  41.,  18.,  14.,  12.,  29.,  25.,   6.,  48.,  15.,  16.,
        45.,   5.,  40.,  31.,  34.,  50.,  13.,  21.,  30.,  nan,   4.,
        49.,  26.,   2.,  35.,  28.,  27.,  24.,  32.,  36.,  52.,  38.,
        44.,  58.,  39.,  37.,  57.,  51.,  43.,  47.,   1.,  46.,  55.,
        42.,  53.,  60.,  67.,  56.,  61.,  64.,  62.,  59.,  54.,  72.,
        78.,  71.,  86.,  90.,  63.,  65.,  66.,  69.,  75., 101.,  83.,
        68.,  74.,  82., 112.,  76.,  70., 104.,  96.,  73., 103.,  89.,
        77.,  81.])

In [34]:
X_train['num_sats'].unique()

array([14.,  5.,  7., 19.,  4.,  6.,  8., 11., 10., 30., 16., 15.,  9.,
       12., 31., 17., 13., 21.,  3., 18., 22.,  2., 25., nan, 32., 20.,
       23., 26., 24., 29., 33., 27., 36., 28., 38., 34.,  1., 44., 35.,
       52., 41., 37., 40., 43., 45., 42., 59., 48., 39., 70., 58., 46.,
       57., 51.,  0., 49., 62., 47., 50., 79., 74., 53., 55., 76., 72.,
       78., 80., 81., 63., 61., 60., 56.])

In [35]:
X_train['num_tl_120dpd_2m'].unique()

array([ 0., nan,  1.,  2.])

In [36]:
X_train['num_tl_30dpd'].unique()

array([ 0.,  1., nan,  2.,  3.,  4.])

In [37]:
X_train['num_tl_90g_dpd_24m'].unique()

array([ 0.,  1., nan,  2.,  4.,  3., 10.,  5.,  6.,  7.,  8.,  9., 14.,
       11., 12., 13., 15., 20., 30., 17., 19., 18., 16., 26.])

In [38]:
X_train['num_tl_op_past_12m'].unique()

array([ 3.,  0.,  4.,  1.,  2.,  5.,  6., nan,  7.,  8.,  9., 10., 11.,
       15., 12., 13., 16., 14., 24., 29., 17., 18., 21., 19., 23., 25.,
       20.])

In [39]:
X_train['open_acc'].unique()

array([14.,  5.,  7., 19.,  4.,  8.,  6., 11., 10., 30., 13., 16., 15.,
        9., 12., 31., 17., 21., 37.,  3., 18., 23.,  2., 25., 22., 32.,
       20., 26., 24., 29., 33., 27., nan, 36., 28., 38., 35.,  1., 44.,
       52., 41., 40., 43., 34., 45., 42., 59., 48., 39., 70., 86., 46.,
       57., 51.,  0., 58., 49., 62., 47., 79., 74., 50., 53., 55., 76.,
       72., 78., 80., 81., 63., 61., 60., 56.])

In [40]:
X_train['open_il_24m'].unique()

array([ 2.,  0., nan,  3.,  1.,  4.,  5.,  7.,  6., 11.,  9.,  8., 12.,
       14., 10., 15., 13., 20., 17., 16., 18., 19., 26.])

In [41]:
X_train['open_rv_24m'].unique()

array([ 3.,  1., nan,  2.,  7.,  0.,  5., 14.,  8.,  4.,  6., 12.,  9.,
       10., 11., 24., 21., 13., 18., 15., 16., 20., 17., 19., 23., 25.,
       26., 22., 28., 30., 27., 38., 29., 34., 37., 33.])

In [42]:
X_train['percent_bc_gt_75'].unique()

array([100.  ,   0.  ,  33.3 ,  50.  ,  66.7 ,  62.5 ,  25.  ,  88.9 ,
        60.  ,  20.  ,  40.  ,  57.1 ,  28.6 ,  75.  ,  11.1 ,  42.9 ,
        21.4 ,   8.3 ,    nan,  71.4 ,  16.7 ,   6.7 ,  12.5 ,  55.6 ,
        90.9 ,  14.3 ,  22.2 ,  30.8 ,   6.3 ,  85.7 ,  83.3 ,  80.  ,
        84.6 ,   7.7 ,  90.  ,  87.5 ,  37.5 ,  77.8 ,  92.9 ,  38.5 ,
        18.2 ,  46.2 ,  36.4 ,  44.4 ,  81.8 ,  41.7 ,  10.  ,   5.  ,
        72.7 ,  58.3 ,  53.8 ,  23.5 ,   9.1 ,  15.4 ,  30.  ,  63.6 ,
        68.7 ,   7.1 ,   5.3 ,  27.3 ,  53.3 ,  31.2 ,  70.  ,  54.5 ,
         0.57,  45.5 ,   9.5 ,  31.3 ,  69.2 ,  86.7 ,   0.5 ,  23.1 ,
        78.6 ,  61.5 ,   0.6 ,  13.3 ,  76.5 ,  94.4 ,  11.8 ,  26.7 ,
        92.3 ,  73.3 ,  38.9 ,  11.5 ,   5.6 ,   5.9 ,  21.1 ,   4.5 ,
        31.6 ,  46.7 ,  10.5 ,  29.4 ,  18.8 ,  91.7 ,  52.9 ,   1.  ,
        26.3 ,  35.3 ,  41.2 ,  17.6 ,  43.8 ,  15.  ,  76.9 ,   0.33,
        36.8 ,  35.7 ,  82.4 ,  64.3 ,   6.2 ,  22.7 ,  26.1 ,  17.4 ,
      

In [43]:
X_train['pub_rec'].unique()

array([ 1.,  0.,  2.,  3., nan,  4.,  5., 14.,  8.,  6.,  7., 11., 10.,
        9., 15., 28., 12., 20., 13.])

In [44]:
X_train['pub_rec_bankruptcies'].unique()

array([ 1.,  0.,  2., nan,  3.,  7.,  4.,  5.,  6.,  9.])

In [45]:
X_train['purpose'].unique()

array(['credit_card', 'car', 'other', 'debt_consolidation',
       'major_purchase', 'small_business', 'home_improvement', 'moving',
       'medical', 'wedding', 'vacation', 'house', nan, 'renewable_energy',
       'educational'], dtype=object)

In [46]:
X_train['revol_util'].unique()

array(['56.1%', '88.3%', '18.1%', ..., '103.1%', '173.2%', '109.8%'],
      dtype=object)

In [47]:
X_train['tax_liens'].unique()

array([ 0.,  1.,  2., nan,  3.,  4.,  5., 13.,  7.,  6.,  8., 10.,  9.,
       15., 27., 11., 12., 18.])

In [48]:
X_train['term'].unique()

array([' 36 months', ' 60 months', nan], dtype=object)

In [49]:
X_train['title'].unique()

array(['Credit card refinancing', 'Car financing', 'Other', ...,
       'The Next Step', 'credit cards debt consolidation',
       'Car and Credit Card debt consolidate'], dtype=object)

In [50]:
X_train['total_acc'].unique()

array([ 26.,  11.,  31.,  24.,  16.,  21.,  29.,   7.,  23.,  46.,  34.,
        19.,  25.,  18.,  13.,  20.,  15.,   6.,  14.,  12.,  30.,  50.,
        28.,  32.,  38.,  76.,  10.,  33.,  17.,   5.,  40.,  35.,  36.,
        43.,   8.,  27.,  22.,  42.,   4.,  44.,  52.,  47.,  37.,   9.,
        66.,  41.,  57.,   3.,  63.,  53.,   2.,  45.,  39.,  49.,  70.,
        56.,  69.,  61.,  64.,  54.,  48.,  nan,  51.,  58.,  85.,  55.,
        59.,  65.,  62.,  60.,  84.,  73.,  67.,  74.,  68.,  78.,  82.,
        75.,  77.,  72.,  96.,  81.,  86.,  71.,  79.,  92.,  93.,  80.,
        88.,  95.,  90., 135.,  91.,  83.,  94.,  89., 105.,  87., 109.,
       100., 138.,  97., 112., 157., 102., 123., 125., 106., 101., 108.,
       120., 104., 107.,  98.])

In [51]:
X_train['verification_status'].unique()

array(['Source Verified', 'Not Verified', 'Verified', nan], dtype=object)

In [52]:
X_train['zip_code;'].unique()

array(['847xx;', '900xx;', '895xx;', '853xx;', '453xx;', '767xx;',
       '983xx;', '322xx;', '019xx;', '211xx;', '225xx;', '775xx;',
       '917xx;', '460xx;', '328xx;', '376xx;', '985xx;', '303xx;',
       '207xx;', '038xx;', '786xx;', '294xx;', '079xx;', '282xx;',
       '554xx;', '550xx;', '331xx;', '381xx;', '359xx;', '152xx;',
       '310xx;', '336xx;', '910xx;', '070xx;', '774xx;', '945xx;',
       '441xx;', '921xx;', '370xx;', '442xx;', '212xx;', '665xx;',
       '750xx;', '873xx;', '172xx;', '104xx;', '423xx;', '404xx;',
       '481xx;', '953xx;', '891xx;', '274xx;', '982xx;', '805xx;',
       '972xx;', '799xx;', '933xx;', '087xx;', '200xx;', '773xx;',
       '302xx;', '013xx;', '606xx;', '371xx;', '462xx;', '080xx;',
       '493xx;', '727xx;', '905xx;', '430xx;', '030xx;', '787xx;',
       '926xx;', '560xx;', '956xx;', '472xx;', '711xx;', '936xx;',
       '357xx;', '103xx;', '601xx;', '078xx;', '064xx;', '100xx;',
       '844xx;', '208xx;', '461xx;', '495xx;', '054xx;', '800x

In [53]:
X_train['zip_code;'].nunique()

891

## __Импутация пропусков__

In [54]:
# заполняем пропуски в переменных с типом float медианами
for i in ['acc_now_delinq', 'annual_inc', 'chargeoff_within_12_mths', 'collections_12_mths_ex_med', 'delinq_2yrs', 
          'dti', 'fico_range_high', 'fico_range_low', 'funded_amnt', 'inq_last_12m', 'installment', 'loan_amnt', 
          'mort_acc', 'mths_since_last_delinq', 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 
          'num_actv_bc_tl', 'num_rev_accts', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 
          'num_tl_op_past_12m', 'open_acc', 'open_il_24m', 'open_rv_24m', 'percent_bc_gt_75', 'pub_rec', 
          'pub_rec_bankruptcies', 'tax_liens', 'total_acc']:
    X_train[i].fillna(X_train[i].median(), inplace = True)

In [55]:
# заполняем пропуски в переменных с типом object  с помощью метода fillna
X_train['addr_state'].fillna(method='ffill', inplace=True)
X_train['earliest_cr_line'].fillna(method='ffill', inplace=True)
X_train['emp_length'].fillna(method='ffill', inplace=True)
X_train['home_ownership'].fillna(method='ffill', inplace=True)
X_train['int_rate'].fillna(method='ffill', inplace=True)
X_train['issue_d'].fillna(method='ffill', inplace=True)
X_train['purpose'].fillna(method='ffill', inplace=True)
X_train['revol_util'].fillna(method='ffill', inplace=True)
X_train['term'].fillna(method='ffill', inplace=True)
X_train['title'].fillna(method='ffill', inplace=True)
X_train['verification_status'].fillna(method='ffill', inplace=True)
X_train['zip_code;'].fillna(method='ffill', inplace=True)

In [56]:
# проверяем правильно ли прошла импутация, не пояились ли неверные значения переменных
X_train['addr_state'].unique()

array(['UT', 'CA', 'NV', 'AZ', 'OH', 'TX', 'WA', 'FL', 'MA', 'MD', 'VA',
       'IN', 'TN', 'GA', 'NH', 'SC', 'NJ', 'NC', 'MN', 'AL', 'PA', 'KS',
       'NM', 'NY', 'KY', 'MI', 'CO', 'OR', 'DC', 'IL', 'AR', 'LA', 'CT',
       'VT', 'WI', 'OK', 'MS', 'MO', 'WY', 'RI', 'WV', 'SD', 'DE', 'HI',
       'MT', 'AK', 'ID', 'NE', 'ND', 'ME'], dtype=object)

In [57]:
X_train['addr_state'].nunique()

50

In [58]:
# проверяем датафрейм на наличие пропусков
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 44 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   acc_now_delinq              199999 non-null  float64
 1   addr_state                  199999 non-null  object 
 2   annual_inc                  199999 non-null  float64
 3   chargeoff_within_12_mths    199999 non-null  float64
 4   collections_12_mths_ex_med  199999 non-null  float64
 5   delinq_2yrs                 199999 non-null  float64
 6   dti                         199999 non-null  float64
 7   earliest_cr_line            199999 non-null  object 
 8   emp_length                  199999 non-null  object 
 9   fico_range_high             199999 non-null  float64
 10  fico_range_low              199999 non-null  float64
 11  funded_amnt                 199999 non-null  float64
 12  home_ownership              199999 non-null  object 
 13  inq_last_12m  

## __Преобразование типов данных__

In [59]:
# многие переменные неверно записаны как float64, так как не имеют после плавающей точки значений. С целью оптимизации
# памяти преобразуем их в тип int 
for i in ['acc_now_delinq', 'chargeoff_within_12_mths', 'collections_12_mths_ex_med', 'delinq_2yrs', 'fico_range_high', 
          'fico_range_low', 'funded_amnt', 'inq_last_12m', 'loan_amnt', 'mort_acc', 'mths_since_last_delinq',
          'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 
          'num_rev_accts', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 
          'num_tl_op_past_12m', 'open_acc', 'open_il_24m', 'open_rv_24m', 'pub_rec', 'pub_rec_bankruptcies', 
          'tax_liens', 'total_acc']:
    X_train[i] = X_train[i].astype('int')

In [60]:
# сначала избавимся от лишнего символа % в переменных  int_rate и revol_util
X_train['int_rate'] = X_train['int_rate'].str.rstrip('%')
X_train['revol_util'] = X_train['revol_util'].str.rstrip('%')

In [61]:
# X_train['int_rate'].unique()

In [62]:
# X_train['revol_util'].unique()

In [63]:
# изменяем тип данных в переменных int_rate и revol_util в тип float 

for i in ['int_rate', 'revol_util']:
    X_train[i] = X_train[i].astype('float')

In [64]:
# изменяем тип данных в переменных 'earliest_cr_line', 'issue_d' в тип DateTime 

X_train['earliest_cr_line'] = pd.to_datetime(X_train['earliest_cr_line'], format = '%b-%Y')
X_train['issue_d'] = pd.to_datetime(X_train['issue_d'], format='%b-%Y') 
                     

In [65]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 44 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   acc_now_delinq              199999 non-null  int32         
 1   addr_state                  199999 non-null  object        
 2   annual_inc                  199999 non-null  float64       
 3   chargeoff_within_12_mths    199999 non-null  int32         
 4   collections_12_mths_ex_med  199999 non-null  int32         
 5   delinq_2yrs                 199999 non-null  int32         
 6   dti                         199999 non-null  float64       
 7   earliest_cr_line            199999 non-null  datetime64[ns]
 8   emp_length                  199999 non-null  object        
 9   fico_range_high             199999 non-null  int32         
 10  fico_range_low              199999 non-null  int32         
 11  funded_amnt                 199999 non-

## __Обработка редких категорий__

Редкие категории являются источником шума в данных, который негативно повлияет на качество модели.

In [66]:
# выводим частоты категорий по каждой категориальной переменной
categorical_columns = [c for c in X_train.columns if X_train[c].dtype.name == 'object']
for c in categorical_columns:
    print(X_train[c].value_counts())

addr_state
CA    28461
NY    16649
TX    16332
FL    14533
IL     7804
NJ     7274
PA     6633
OH     6630
GA     6443
NC     5699
VA     5484
MI     5083
MD     4802
AZ     4780
MA     4620
CO     4294
WA     4255
MN     3578
IN     3162
MO     3113
NV     3086
CT     3038
TN     3037
WI     2598
AL     2500
OR     2444
SC     2443
LA     2218
KY     1947
OK     1850
KS     1639
AR     1491
UT     1468
NM     1083
MS     1052
HI      969
NH      962
RI      905
WV      686
NE      612
MT      592
DE      538
DC      509
AK      448
SD      439
WY      435
VT      416
ME      356
ID      334
ND      275
Name: count, dtype: int64
emp_length
10+ years    70507
2 years      19221
3 years      17171
< 1 year     16926
1 year       14345
5 years      13322
4 years      12732
6 years       9710
7 years       9118
8 years       9095
9 years       7852
Name: count, dtype: int64
home_ownership
MORTGAGE    97157
RENT        80311
OWN         22377
ANY           138
OTHER          13
NONE        

In [67]:
# видим, что переменная home_ownership не содержит пропусков. Все категории с частотой менее 100 наблюдений объеденим 
# в отдельную категорию ANY 
X_train['home_ownership'] = X_train['home_ownership'].replace(['OTHER', 'NONE'], 'ANY')

In [68]:
print(X_train['home_ownership'].value_counts(dropna = False))

home_ownership
MORTGAGE    97157
RENT        80311
OWN         22377
ANY           154
Name: count, dtype: int64


In [69]:
# видим, что переменная purpose не содержит пропусков. Все категории с частотой менее 100 наблюдений объеденим 
# в отдельную категорию other 
X_train['purpose'] = X_train['purpose'].replace(['wedding', 'renewable_energy', 'educational'], 'other')

In [70]:
print(X_train['purpose'].value_counts(dropna = False))

purpose
debt_consolidation    114124
credit_card            45020
home_improvement       13161
other                  12419
major_purchase          4419
medical                 2406
small_business          2242
car                     2212
moving                  1404
vacation                1403
house                   1189
Name: count, dtype: int64


In [71]:
print(X_train['title'].value_counts(dropna = False).head(100))

title
Debt consolidation         101852
Credit card refinancing     40255
Home improvement            11870
Other                       10942
Major purchase               3866
                            ...  
debt free                      23
Pay Off                        23
Credit Card Pay Off            23
credit card pay off            23
refinance                      23
Name: count, Length: 100, dtype: int64


Теперь необходимо определится с пороговой относительной частотой. В данном случае мы хотим объединить категории с частотой 50 наблюдений и меньше в категорию other. мы делим 50 наблюдений на общее количество наблюдений в наборе данных (200 000 наблюдений), умножаем на 100 и получаем пороговую относительную частоту 0,025. С помощью программного кода, приведенного ниже, мы делим частоту каждой категории на общее количество наблюдений в наборе данных умножаем на 100 и получаем пороговую относительную частоту 0,025. Если относительная частота категории меньше 0,025, возвращаем значение True если больше возвращаем False

In [72]:
title_series = X_train['title'].value_counts()
mask = (title_series/title_series.sum() * 100).lt(0.025)
mask.head()

title
Debt consolidation         False
Credit card refinancing    False
Home improvement           False
Other                      False
Major purchase             False
Name: count, dtype: bool

In [73]:
# теперь с помощью функции np.where() мы все категории по которым получили TRUE заменим на категорию OTHER 
X_train['title'] = np.where(X_train['title'].isin(title_series[mask].index), 'Other', X_train['title'])
print(X_train['title'].value_counts(dropna = False).head())

title
Debt consolidation         101852
Credit card refinancing     40255
Other                       25622
Home improvement            11870
Major purchase               3866
Name: count, dtype: int64


In [74]:
print(X_train['title'].value_counts(dropna = False).tail())

title
Refinance                    53
credit card consolidation    53
credit card refinance        53
CC Consolidation             52
Credit card consolidation    51
Name: count, dtype: int64


In [75]:
print(X_train['zip_code;'].value_counts(dropna = False).tail())

zip_code;
771xx;    1
682xx;    1
017xx     1
513xx;    1
502xx;    1
Name: count, dtype: int64


In [76]:
# осуществляем обработку редких категорий аналогично title 
zip_series = X_train['zip_code;'].value_counts()
mask = (zip_series/zip_series.sum() * 100).lt(0.025)
mask.head()

zip_code;
945xx;    False
112xx;    False
750xx;    False
606xx;    False
300xx;    False
Name: count, dtype: bool

In [77]:
X_train['zip_code;'] = np.where(X_train['zip_code;'].isin(zip_series[mask].index), 'Other', X_train['zip_code;'])
print(X_train['zip_code;'].value_counts(dropna = False).tail())

zip_code;
491xx;    51
991xx;    51
764xx;    50
743xx;    50
897xx;    50
Name: count, dtype: int64


In [78]:
X_train.head(3)

Unnamed: 0,acc_now_delinq,addr_state,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,earliest_cr_line,emp_length,fico_range_high,fico_range_low,funded_amnt,home_ownership,inq_last_12m,installment,int_rate,issue_d,loan_amnt,mort_acc,mths_since_last_delinq,mths_since_recent_bc_dlq,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_rev_accts,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,open_acc,open_il_24m,open_rv_24m,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,purpose,revol_util,tax_liens,term,title,total_acc,verification_status,zip_code;
0,0,UT,80000.0,0,0,0,30.49,1996-10-01,4 years,689,685,16000,RENT,1,539.03,12.99,2016-04-01,16000,5,50,65,6,3,4,17,14,0,0,0,3,14,2,3,100.0,1,1,credit_card,56.1,0,36 months,Credit card refinancing,26,Source Verified,847xx;
1,0,CA,82000.0,0,0,0,7.0,1993-03-01,10+ years,689,685,6600,MORTGAGE,0,214.27,10.42,2017-09-01,6600,2,31,38,5,0,3,8,5,0,0,0,0,5,0,1,100.0,1,1,credit_card,88.3,0,36 months,Credit card refinancing,11,Not Verified,900xx;
2,0,NV,46080.0,0,0,1,17.32,2000-11-01,3 years,674,670,10000,MORTGAGE,0,317.96,8.99,2016-08-01,10000,3,10,38,13,0,1,23,7,0,0,0,0,7,0,3,0.0,0,0,credit_card,18.1,0,36 months,Credit card refinancing,31,Source Verified,895xx;


In [79]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 44 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   acc_now_delinq              199999 non-null  int32         
 1   addr_state                  199999 non-null  object        
 2   annual_inc                  199999 non-null  float64       
 3   chargeoff_within_12_mths    199999 non-null  int32         
 4   collections_12_mths_ex_med  199999 non-null  int32         
 5   delinq_2yrs                 199999 non-null  int32         
 6   dti                         199999 non-null  float64       
 7   earliest_cr_line            199999 non-null  datetime64[ns]
 8   emp_length                  199999 non-null  object        
 9   fico_range_high             199999 non-null  int32         
 10  fico_range_low              199999 non-null  int32         
 11  funded_amnt                 199999 non-

# __Подготовка X_test__

In [80]:
# загружаем файл с данными
X_test = pd.read_csv('./X_test.csv', encoding = 'utf8', sep = ',')
X_test.head(3)

Unnamed: 0,index,acc_now_delinq,addr_state,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,earliest_cr_line,emp_length,fico_range_high,fico_range_low,funded_amnt,home_ownership,inq_last_12m,installment,int_rate,issue_d,loan_amnt,mort_acc,mths_since_last_delinq,mths_since_recent_bc_dlq,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_rev_accts,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,open_acc,open_il_24m,open_rv_24m,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,purpose,revol_util,tax_liens,term,title,total_acc,verification_status,zip_code;
0,0,0.0,GA,46209.0,0.0,0.0,0.0,34.98,Apr-2009,8 years,669.0,665.0,15700.0,MORTGAGE,6.0,437.65,22.45%,Jan-2016,15700.0,3.0,48.0,48.0,2.0,1.0,4.0,9.0,11.0,0.0,0.0,0.0,4.0,11.0,3.0,3.0,75.0,0.0,0.0,debt_consolidation,61.7%,0.0,60 months,Debt consolidation,20.0,Verified,302xx;
1,1,0.0,PA,43500.0,0.0,0.0,0.0,20.25,Jun-1982,5 years,704.0,700.0,7200.0,OWN,,166.79,13.80%,Apr-2011,7200.0,,,,,,,,,,,,,14.0,,,,0.0,0.0,debt_consolidation,58.6%,0.0,60 months,POFF,32.0,Not Verified,151xx;
2,2,0.0,OH,72000.0,0.0,0.0,0.0,15.98,Nov-1997,3 years,699.0,695.0,24000.0,MORTGAGE,,847.21,16.29%,Nov-2012,24000.0,2.0,63.0,63.0,2.0,5.0,3.0,23.0,18.0,0.0,0.0,0.0,1.0,18.0,,,40.0,0.0,0.0,credit_card,46.2%,0.0,36 months,credit crd,36.0,Verified,440xx;


In [81]:
X_test.shape

(199999, 45)

In [82]:
# Удаляем очевидную бесполезную переменную index (количество категорий совпадает с количеством наблюдений)
X_test.drop('index', axis = 1, inplace = True)

In [83]:
for i in ['acc_now_delinq', 'annual_inc', 'chargeoff_within_12_mths', 'collections_12_mths_ex_med', 'delinq_2yrs', 
          'dti', 'fico_range_high', 'fico_range_low', 'funded_amnt', 'inq_last_12m', 'installment', 'loan_amnt', 
          'mort_acc', 'mths_since_last_delinq', 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 
          'num_actv_bc_tl', 'num_rev_accts', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 
          'num_tl_op_past_12m', 'open_acc', 'open_il_24m', 'open_rv_24m', 'percent_bc_gt_75', 'pub_rec', 
          'pub_rec_bankruptcies', 'tax_liens', 'total_acc']:
    X_test[i].fillna(X_test[i].median(), inplace = True)

In [84]:
# заполняем пропуски в переменных с типом object  с помощью метода fillna
X_test['addr_state'].fillna(method='ffill', inplace=True)
X_test['earliest_cr_line'].fillna(method='ffill', inplace=True)
X_test['emp_length'].fillna(method='ffill', inplace=True)
X_test['home_ownership'].fillna(method='ffill', inplace=True)
X_test['int_rate'].fillna(method='ffill', inplace=True)
X_test['issue_d'].fillna(method='ffill', inplace=True)
X_test['purpose'].fillna(method='ffill', inplace=True)
X_test['revol_util'].fillna(method='ffill', inplace=True)
X_test['term'].fillna(method='ffill', inplace=True)
X_test['title'].fillna(method='ffill', inplace=True)
X_test['verification_status'].fillna(method='ffill', inplace=True)
X_test['zip_code;'].fillna(method='ffill', inplace=True)

In [85]:
# многие переменные неверно записаны как float64, так как не имеют после плавающей точки значений. С целью оптимизации
# памяти преобразуем их в тип int 
for i in ['acc_now_delinq', 'chargeoff_within_12_mths', 'collections_12_mths_ex_med', 'delinq_2yrs', 'fico_range_high', 
          'fico_range_low', 'funded_amnt', 'inq_last_12m', 'loan_amnt', 'mort_acc', 'mths_since_last_delinq',
          'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 
          'num_rev_accts', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 
          'num_tl_op_past_12m', 'open_acc', 'open_il_24m', 'open_rv_24m', 'pub_rec', 'pub_rec_bankruptcies', 
          'tax_liens', 'total_acc']:
    X_test[i] = X_test[i].astype('int')

In [86]:
# сначала избавимся от лишнего символа % в переменных  int_rate и revol_util
X_test['int_rate'] = X_test['int_rate'].str.rstrip('%')
X_test['revol_util'] = X_test['revol_util'].str.rstrip('%')

In [87]:
# изменяем тип данных в переменных int_rate и revol_util в тип float 

for i in ['int_rate', 'revol_util']:
    X_test[i] = X_test[i].astype('float')

In [88]:
# изменяем тип данных в переменных 'earliest_cr_line', 'issue_d' в тип DateTime 

X_test['earliest_cr_line'] = pd.to_datetime(X_test['earliest_cr_line'], format = '%b-%Y')
X_test['issue_d'] = pd.to_datetime(X_test['issue_d'], format='%b-%Y')

In [89]:
# видим, что переменная home_ownership не содержит пропусков. Все категории с частотой менее 100 наблюдений объеденим 
# в отдельную категорию ANY 
X_test['home_ownership'] = X_test['home_ownership'].replace(['OTHER', 'NONE'], 'ANY')

In [90]:
# видим, что переменная purpose не содержит пропусков. Все категории с частотой менее 100 наблюдений объеденим 
# в отдельную категорию other 
X_test['purpose'] = X_test['purpose'].replace(['wedding', 'renewable_energy', 'educational'], 'other')

In [91]:
title_series = X_test['title'].value_counts()
mask = (title_series/title_series.sum() * 100).lt(0.025)

In [92]:
# теперь с помощью функции np.where() мы все категории по которым получили TRUE заменим на категорию OTHER 
X_test['title'] = np.where(X_test['title'].isin(title_series[mask].index), 'Other', X_test['title'])
print(X_test['title'].value_counts(dropna = False).head())

title
Debt consolidation         102355
Credit card refinancing     39745
Other                       25474
Home improvement            11778
Major purchase               3951
Name: count, dtype: int64


In [93]:
# осуществляем обработку редких категорий аналогично zip_code 
zip_series = X_test['zip_code;'].value_counts()
mask = (zip_series/zip_series.sum() * 100).lt(0.025)


In [94]:
X_test['zip_code;'] = np.where(X_test['zip_code;'].isin(zip_series[mask].index), 'Other', X_test['zip_code;'])
print(X_test['zip_code;'].value_counts(dropna = False).tail())

zip_code;
397xx;    51
728xx;    51
838xx;    50
215xx;    50
743xx;    50
Name: count, dtype: int64


In [95]:
X_test.head(3)

Unnamed: 0,acc_now_delinq,addr_state,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,earliest_cr_line,emp_length,fico_range_high,fico_range_low,funded_amnt,home_ownership,inq_last_12m,installment,int_rate,issue_d,loan_amnt,mort_acc,mths_since_last_delinq,mths_since_recent_bc_dlq,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_rev_accts,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,open_acc,open_il_24m,open_rv_24m,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,purpose,revol_util,tax_liens,term,title,total_acc,verification_status,zip_code;
0,0,GA,46209.0,0,0,0,34.98,2009-04-01,8 years,669,665,15700,MORTGAGE,6,437.65,22.45,2016-01-01,15700,3,48,48,2,1,4,9,11,0,0,0,4,11,3,3,75.0,0,0,debt_consolidation,61.7,0,60 months,Debt consolidation,20,Verified,302xx;
1,0,PA,43500.0,0,0,0,20.25,1982-06-01,5 years,704,700,7200,OWN,2,166.79,13.8,2011-04-01,7200,1,31,37,5,0,3,13,11,0,0,0,2,14,1,2,40.0,0,0,debt_consolidation,58.6,0,60 months,Other,32,Not Verified,151xx;
2,0,OH,72000.0,0,0,0,15.98,1997-11-01,3 years,699,695,24000,MORTGAGE,2,847.21,16.29,2012-11-01,24000,2,63,63,2,5,3,23,18,0,0,0,1,18,1,2,40.0,0,0,credit_card,46.2,0,36 months,Other,36,Verified,440xx;


# __Подготовка y_train__

In [96]:
y_train.head()

Unnamed: 0,index,loan_status
0,0,1.0
1,1,
2,2,1.0
3,3,
4,4,


In [97]:
# удаляем index
y_train.drop('index', axis = 1, inplace = True)
y_train.head()

Unnamed: 0,loan_status
0,1.0
1,
2,1.0
3,
4,


In [98]:
# заменяем значения Nan значением 0
y_train['loan_status'] = np.where(y_train['loan_status'].isnull(), 0, y_train['loan_status'])
y_train.head()

Unnamed: 0,loan_status
0,1.0
1,0.0
2,1.0
3,0.0
4,0.0


### __OneHotEncoding__

In [99]:
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = onehot_encoder.fit_transform(X_train[['addr_state', 'emp_length', 'home_ownership',
                                                       'purpose', 'term', 'title', 'verification_status', 'zip_code;']])
   

In [100]:
onehot_X_train = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out(['addr_state', 'emp_length', 
                            'home_ownership', 'purpose', 'term', 'title', 'verification_status', 'zip_code;']))
# print(onehot_X_train)

In [101]:
X_train = pd.concat([X_train, onehot_X_train], axis=1).drop(['addr_state', 'emp_length', 'home_ownership',
                            'purpose', 'term', 'title', 'verification_status', 'zip_code;'], axis=1)

In [102]:
X_train.head(1)

Unnamed: 0,acc_now_delinq,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,earliest_cr_line,fico_range_high,fico_range_low,funded_amnt,inq_last_12m,installment,int_rate,issue_d,loan_amnt,mort_acc,mths_since_last_delinq,mths_since_recent_bc_dlq,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_rev_accts,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,open_acc,open_il_24m,open_rv_24m,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,revol_util,tax_liens,total_acc,addr_state_AK,addr_state_AL,addr_state_AR,addr_state_AZ,addr_state_CA,addr_state_CO,addr_state_CT,addr_state_DC,addr_state_DE,addr_state_FL,addr_state_GA,addr_state_HI,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,emp_length_1 year,emp_length_10+ years,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,term_ 36 months,term_ 60 months,title_Business,title_CC Consolidation,title_Car financing,title_Consolidate,title_Consolidation,title_Consolidation Loan,title_Credit Card,title_Credit Card Consolidation,title_Credit Card Loan,title_Credit Card Payoff,title_Credit Card Refinance,title_Credit Cards,title_Credit card consolidation,title_Credit card refinancing,title_DEBT CONSOLIDATION,title_Debt,title_Debt Consolidation,title_Debt Consolidation.1,title_Debt Consolidation Loan,title_Debt Free,title_Debt Loan,title_Debt consolidation,title_Freedom,title_Green loan,title_Home Improvement,title_Home Improvement Loan,title_Home buying,title_Home improvement,title_Loan,title_Major purchase,title_Medical expenses,title_Moving and relocation,title_My Loan,title_Other,title_Payoff,title_Personal,title_Personal Loan,title_Refinance,title_Small Business Loan,title_Vacation,title_consolidate,title_consolidation,title_credit card consolidation,title_credit card payoff,title_credit card refinance,title_debt consolidation,title_home improvement,title_loan,title_payoff,title_personal,title_personal loan,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,zip_code;_010xx;,zip_code;_011xx;,zip_code;_012xx;,zip_code;_013xx;,zip_code;_014xx;,zip_code;_015xx;,zip_code;_016xx;,zip_code;_017xx;,zip_code;_018xx;,zip_code;_019xx;,zip_code;_020xx;,zip_code;_021xx;,zip_code;_023xx;,zip_code;_024xx;,zip_code;_025xx;,zip_code;_026xx;,zip_code;_027xx;,zip_code;_028xx;,zip_code;_029xx;,zip_code;_030xx;,zip_code;_031xx;,zip_code;_032xx;,zip_code;_038xx;,zip_code;_040xx;,zip_code;_042xx;,zip_code;_050xx;,zip_code;_054xx;,zip_code;_056xx;,zip_code;_060xx;,zip_code;_061xx;,zip_code;_062xx;,zip_code;_063xx;,zip_code;_064xx;,zip_code;_065xx;,zip_code;_066xx;,zip_code;_067xx;,zip_code;_068xx;,zip_code;_069xx;,zip_code;_070xx;,zip_code;_071xx;,zip_code;_072xx;,zip_code;_073xx;,zip_code;_074xx;,zip_code;_075xx;,zip_code;_076xx;,zip_code;_077xx;,zip_code;_078xx;,zip_code;_079xx;,zip_code;_080xx;,zip_code;_081xx;,zip_code;_082xx;,zip_code;_083xx;,zip_code;_085xx;,zip_code;_086xx;,zip_code;_087xx;,zip_code;_088xx;,zip_code;_089xx;,zip_code;_100xx;,zip_code;_101xx;,zip_code;_103xx;,zip_code;_104xx;,zip_code;_105xx;,zip_code;_106xx;,zip_code;_107xx;,zip_code;_108xx;,zip_code;_109xx;,zip_code;_110xx;,zip_code;_111xx;,zip_code;_112xx;,zip_code;_113xx;,zip_code;_114xx;,zip_code;_115xx;,zip_code;_116xx;,zip_code;_117xx;,zip_code;_118xx;,zip_code;_119xx;,zip_code;_120xx;,zip_code;_121xx;,zip_code;_122xx;,zip_code;_123xx;,zip_code;_124xx;,zip_code;_125xx;,zip_code;_127xx;,zip_code;_128xx;,zip_code;_129xx;,zip_code;_130xx;,zip_code;_131xx;,zip_code;_132xx;,zip_code;_133xx;,zip_code;_134xx;,zip_code;_136xx;,zip_code;_137xx;,zip_code;_138xx;,zip_code;_140xx;,zip_code;_141xx;,zip_code;_142xx;,zip_code;_144xx;,zip_code;_145xx;,zip_code;_146xx;,zip_code;_147xx;,zip_code;_148xx;,zip_code;_150xx;,zip_code;_151xx;,zip_code;_152xx;,zip_code;_153xx;,zip_code;_154xx;,zip_code;_156xx;,zip_code;_159xx;,zip_code;_160xx;,zip_code;_161xx;,zip_code;_163xx;,zip_code;_165xx;,zip_code;_166xx;,zip_code;_168xx;,zip_code;_170xx;,zip_code;_171xx;,zip_code;_172xx;,zip_code;_173xx;,zip_code;_174xx;,zip_code;_175xx;,zip_code;_176xx;,zip_code;_177xx;,zip_code;_178xx;,zip_code;_179xx;,zip_code;_180xx;,zip_code;_181xx;,zip_code;_182xx;,zip_code;_183xx;,zip_code;_184xx;,zip_code;_185xx;,zip_code;_186xx;,zip_code;_187xx;,zip_code;_189xx;,zip_code;_190xx;,zip_code;_191xx;,zip_code;_193xx;,zip_code;_194xx;,zip_code;_195xx;,zip_code;_196xx;,zip_code;_197xx;,zip_code;_198xx;,zip_code;_199xx;,zip_code;_200xx;,zip_code;_201xx;,zip_code;_206xx;,zip_code;_207xx;,zip_code;_208xx;,zip_code;_209xx;,zip_code;_210xx;,zip_code;_211xx;,zip_code;_212xx;,zip_code;_214xx;,zip_code;_216xx;,zip_code;_217xx;,zip_code;_218xx;,zip_code;_219xx;,zip_code;_220xx;,zip_code;_221xx;,zip_code;_222xx;,zip_code;_223xx;,zip_code;_224xx;,zip_code;_225xx;,zip_code;_226xx;,zip_code;_227xx;,zip_code;_228xx;,zip_code;_229xx;,zip_code;_230xx;,zip_code;_231xx;,zip_code;_232xx;,zip_code;_233xx;,zip_code;_234xx;,zip_code;_235xx;,zip_code;_236xx;,zip_code;_238xx;,zip_code;_240xx;,zip_code;_241xx;,zip_code;_242xx;,zip_code;_243xx;,zip_code;_244xx;,zip_code;_245xx;,zip_code;_254xx;,zip_code;_260xx;,zip_code;_261xx;,zip_code;_265xx;,zip_code;_270xx;,zip_code;_271xx;,zip_code;_272xx;,zip_code;_273xx;,zip_code;_274xx;,zip_code;_275xx;,zip_code;_276xx;,zip_code;_277xx;,zip_code;_278xx;,zip_code;_279xx;,zip_code;_280xx;,zip_code;_281xx;,zip_code;_282xx;,zip_code;_283xx;,zip_code;_284xx;,zip_code;_285xx;,zip_code;_286xx;,zip_code;_287xx;,zip_code;_288xx;,zip_code;_290xx;,zip_code;_291xx;,zip_code;_292xx;,zip_code;_293xx;,zip_code;_294xx;,zip_code;_295xx;,zip_code;_296xx;,zip_code;_297xx;,zip_code;_298xx;,zip_code;_299xx;,zip_code;_300xx;,zip_code;_301xx;,zip_code;_302xx;,zip_code;_303xx;,zip_code;_304xx;,zip_code;_305xx;,zip_code;_306xx;,zip_code;_307xx;,zip_code;_308xx;,zip_code;_309xx;,zip_code;_310xx;,zip_code;_312xx;,zip_code;_313xx;,zip_code;_314xx;,zip_code;_315xx;,zip_code;_316xx;,zip_code;_317xx;,zip_code;_318xx;,zip_code;_319xx;,zip_code;_320xx;,zip_code;_321xx;,zip_code;_322xx;,zip_code;_323xx;,zip_code;_324xx;,zip_code;_325xx;,zip_code;_326xx;,zip_code;_327xx;,zip_code;_328xx;,zip_code;_329xx;,zip_code;_330xx;,zip_code;_331xx;,zip_code;_333xx;,zip_code;_334xx;,zip_code;_335xx;,zip_code;_336xx;,zip_code;_337xx;,zip_code;_338xx;,zip_code;_339xx;,zip_code;_341xx;,zip_code;_342xx;,zip_code;_344xx;,zip_code;_346xx;,zip_code;_347xx;,zip_code;_349xx;,zip_code;_350xx;,zip_code;_351xx;,zip_code;_352xx;,zip_code;_354xx;,zip_code;_355xx;,zip_code;_356xx;,zip_code;_357xx;,zip_code;_358xx;,zip_code;_359xx;,zip_code;_360xx;,zip_code;_361xx;,zip_code;_362xx;,zip_code;_363xx;,zip_code;_365xx;,zip_code;_366xx;,zip_code;_368xx;,zip_code;_370xx;,zip_code;_371xx;,zip_code;_372xx;,zip_code;_373xx;,zip_code;_374xx;,zip_code;_376xx;,zip_code;_377xx;,zip_code;_378xx;,zip_code;_379xx;,zip_code;_380xx;,zip_code;_381xx;,zip_code;_383xx;,zip_code;_384xx;,zip_code;_385xx;,zip_code;_386xx;,zip_code;_388xx;,zip_code;_390xx;,zip_code;_391xx;,zip_code;_392xx;,zip_code;_394xx;,zip_code;_395xx;,zip_code;_400xx;,zip_code;_401xx;,zip_code;_402xx;,zip_code;_403xx;,zip_code;_404xx;,zip_code;_405xx;,zip_code;_410xx;,zip_code;_420xx;,zip_code;_421xx;,zip_code;_423xx;,zip_code;_427xx;,zip_code;_430xx;,zip_code;_431xx;,zip_code;_432xx;,zip_code;_433xx;,zip_code;_434xx;,zip_code;_435xx;,zip_code;_436xx;,zip_code;_437xx;,zip_code;_439xx;,zip_code;_440xx;,zip_code;_441xx;,zip_code;_442xx;,zip_code;_443xx;,zip_code;_444xx;,zip_code;_445xx;,zip_code;_446xx;,zip_code;_447xx;,zip_code;_448xx;,zip_code;_449xx;,zip_code;_450xx;,zip_code;_451xx;,zip_code;_452xx;,zip_code;_453xx;,zip_code;_454xx;,zip_code;_456xx;,zip_code;_457xx;,zip_code;_458xx;,zip_code;_460xx;,zip_code;_461xx;,zip_code;_462xx;,zip_code;_463xx;,zip_code;_464xx;,zip_code;_465xx;,zip_code;_466xx;,zip_code;_467xx;,zip_code;_468xx;,zip_code;_469xx;,zip_code;_470xx;,zip_code;_471xx;,zip_code;_472xx;,zip_code;_473xx;,zip_code;_474xx;,zip_code;_476xx;,zip_code;_478xx;,zip_code;_479xx;,zip_code;_480xx;,zip_code;_481xx;,zip_code;_482xx;,zip_code;_483xx;,zip_code;_484xx;,zip_code;_485xx;,zip_code;_486xx;,zip_code;_487xx;,zip_code;_488xx;,zip_code;_489xx;,zip_code;_490xx;,zip_code;_491xx;,zip_code;_492xx;,zip_code;_493xx;,zip_code;_494xx;,zip_code;_495xx;,zip_code;_496xx;,zip_code;_497xx;,zip_code;_498xx;,zip_code;_530xx;,zip_code;_531xx;,zip_code;_532xx;,zip_code;_534xx;,zip_code;_535xx;,zip_code;_537xx;,zip_code;_539xx;,zip_code;_540xx;,zip_code;_541xx;,zip_code;_543xx;,zip_code;_544xx;,zip_code;_546xx;,zip_code;_547xx;,zip_code;_548xx;,zip_code;_549xx;,zip_code;_550xx;,zip_code;_551xx;,zip_code;_553xx;,zip_code;_554xx;,zip_code;_557xx;,zip_code;_558xx;,zip_code;_559xx;,zip_code;_560xx;,zip_code;_562xx;,zip_code;_563xx;,zip_code;_564xx;,zip_code;_565xx;,zip_code;_570xx;,zip_code;_571xx;,zip_code;_577xx;,zip_code;_591xx;,zip_code;_594xx;,zip_code;_596xx;,zip_code;_597xx;,zip_code;_598xx;,zip_code;_599xx;,zip_code;_600xx;,zip_code;_601xx;,zip_code;_604xx;,zip_code;_605xx;,zip_code;_606xx;,zip_code;_607xx;,zip_code;_608xx;,zip_code;_609xx;,zip_code;_610xx;,zip_code;_611xx;,zip_code;_612xx;,zip_code;_613xx;,zip_code;_614xx;,zip_code;_615xx;,zip_code;_616xx;,zip_code;_617xx;,zip_code;_618xx;,zip_code;_620xx;,zip_code;_622xx;,zip_code;_625xx;,zip_code;_626xx;,zip_code;_627xx;,zip_code;_628xx;,zip_code;_629xx;,zip_code;_630xx;,zip_code;_631xx;,zip_code;_633xx;,zip_code;_637xx;,zip_code;_640xx;,zip_code;_641xx;,zip_code;_647xx;,zip_code;_648xx;,zip_code;_650xx;,zip_code;_652xx;,zip_code;_655xx;,zip_code;_656xx;,zip_code;_657xx;,zip_code;_658xx;,zip_code;_660xx;,zip_code;_661xx;,zip_code;_662xx;,zip_code;_665xx;,zip_code;_666xx;,zip_code;_667xx;,zip_code;_670xx;,zip_code;_671xx;,zip_code;_672xx;,zip_code;_674xx;,zip_code;_678xx;,zip_code;_680xx;,zip_code;_681xx;,zip_code;_685xx;,zip_code;_688xx;,zip_code;_700xx;,zip_code;_701xx;,zip_code;_703xx;,zip_code;_704xx;,zip_code;_705xx;,zip_code;_706xx;,zip_code;_707xx;,zip_code;_708xx;,zip_code;_710xx;,zip_code;_711xx;,zip_code;_712xx;,zip_code;_713xx;,zip_code;_714xx;,zip_code;_716xx;,zip_code;_718xx;,zip_code;_719xx;,zip_code;_720xx;,zip_code;_721xx;,zip_code;_722xx;,zip_code;_723xx;,zip_code;_724xx;,zip_code;_726xx;,zip_code;_727xx;,zip_code;_728xx;,zip_code;_729xx;,zip_code;_730xx;,zip_code;_731xx;,zip_code;_735xx;,zip_code;_740xx;,zip_code;_741xx;,zip_code;_743xx;,zip_code;_744xx;,zip_code;_748xx;,zip_code;_750xx;,zip_code;_751xx;,zip_code;_752xx;,zip_code;_754xx;,zip_code;_755xx;,zip_code;_756xx;,zip_code;_757xx;,zip_code;_759xx;,zip_code;_760xx;,zip_code;_761xx;,zip_code;_762xx;,zip_code;_763xx;,zip_code;_764xx;,zip_code;_765xx;,zip_code;_766xx;,zip_code;_767xx;,zip_code;_769xx;,zip_code;_770xx;,zip_code;_773xx;,zip_code;_774xx;,zip_code;_775xx;,zip_code;_776xx;,zip_code;_778xx;,zip_code;_779xx;,zip_code;_780xx;,zip_code;_781xx;,zip_code;_782xx;,zip_code;_783xx;,zip_code;_784xx;,zip_code;_785xx;,zip_code;_786xx;,zip_code;_787xx;,zip_code;_788xx;,zip_code;_790xx;,zip_code;_791xx;,zip_code;_793xx;,zip_code;_794xx;,zip_code;_796xx;,zip_code;_797xx;,zip_code;_799xx;,zip_code;_800xx;,zip_code;_801xx;,zip_code;_802xx;,zip_code;_803xx;,zip_code;_804xx;,zip_code;_805xx;,zip_code;_806xx;,zip_code;_808xx;,zip_code;_809xx;,zip_code;_810xx;,zip_code;_812xx;,zip_code;_815xx;,zip_code;_816xx;,zip_code;_820xx;,zip_code;_826xx;,zip_code;_827xx;,zip_code;_829xx;,zip_code;_836xx;,zip_code;_837xx;,zip_code;_838xx;,zip_code;_840xx;,zip_code;_841xx;,zip_code;_843xx;,zip_code;_844xx;,zip_code;_846xx;,zip_code;_847xx;,zip_code;_850xx;,zip_code;_851xx;,zip_code;_852xx;,zip_code;_853xx;,zip_code;_855xx;,zip_code;_856xx;,zip_code;_857xx;,zip_code;_860xx;,zip_code;_863xx;,zip_code;_864xx;,zip_code;_870xx;,zip_code;_871xx;,zip_code;_874xx;,zip_code;_875xx;,zip_code;_880xx;,zip_code;_882xx;,zip_code;_883xx;,zip_code;_890xx;,zip_code;_891xx;,zip_code;_894xx;,zip_code;_895xx;,zip_code;_897xx;,zip_code;_898xx;,zip_code;_900xx;,zip_code;_902xx;,zip_code;_903xx;,zip_code;_904xx;,zip_code;_905xx;,zip_code;_906xx;,zip_code;_907xx;,zip_code;_908xx;,zip_code;_910xx;,zip_code;_911xx;,zip_code;_912xx;,zip_code;_913xx;,zip_code;_914xx;,zip_code;_915xx;,zip_code;_916xx;,zip_code;_917xx;,zip_code;_918xx;,zip_code;_919xx;,zip_code;_920xx;,zip_code;_921xx;,zip_code;_922xx;,zip_code;_923xx;,zip_code;_924xx;,zip_code;_925xx;,zip_code;_926xx;,zip_code;_927xx;,zip_code;_928xx;,zip_code;_930xx;,zip_code;_931xx;,zip_code;_932xx;,zip_code;_933xx;,zip_code;_934xx;,zip_code;_935xx;,zip_code;_936xx;,zip_code;_937xx;,zip_code;_939xx;,zip_code;_940xx;,zip_code;_941xx;,zip_code;_944xx;,zip_code;_945xx;,zip_code;_946xx;,zip_code;_947xx;,zip_code;_948xx;,zip_code;_949xx;,zip_code;_950xx;,zip_code;_951xx;,zip_code;_952xx;,zip_code;_953xx;,zip_code;_954xx;,zip_code;_955xx;,zip_code;_956xx;,zip_code;_957xx;,zip_code;_958xx;,zip_code;_959xx;,zip_code;_960xx;,zip_code;_961xx;,zip_code;_967xx;,zip_code;_968xx;,zip_code;_970xx;,zip_code;_971xx;,zip_code;_972xx;,zip_code;_973xx;,zip_code;_974xx;,zip_code;_975xx;,zip_code;_977xx;,zip_code;_978xx;,zip_code;_980xx;,zip_code;_981xx;,zip_code;_982xx;,zip_code;_983xx;,zip_code;_984xx;,zip_code;_985xx;,zip_code;_986xx;,zip_code;_988xx;,zip_code;_989xx;,zip_code;_990xx;,zip_code;_991xx;,zip_code;_992xx;,zip_code;_993xx;,zip_code;_995xx;,zip_code;_996xx;,zip_code;_997xx;,zip_code;_Other
0,0,80000.0,0,0,0,30.49,1996-10-01,689,685,16000,1,539.03,12.99,2016-04-01,16000,5,50,65,6,3,4,17,14,0,0,0,3,14,2,3,100.0,1,1,56.1,0,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
# Повторяем аналогичные действия для X_test
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = onehot_encoder.fit_transform(X_test[['addr_state', 'emp_length', 'home_ownership',
                                                       'purpose', 'term', 'title', 'verification_status', 'zip_code;']])

onehot_X_test = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out(['addr_state', 'emp_length', 
                            'home_ownership', 'purpose', 'term', 'title', 'verification_status', 'zip_code;']))
# print(onehot_X_test)

X_test = pd.concat([X_test, onehot_X_test], axis=1).drop(['addr_state', 'emp_length', 
                            'home_ownership', 'purpose', 'term', 'title', 'verification_status', 'zip_code;'], axis=1)

X_test.head(1)

Unnamed: 0,acc_now_delinq,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,earliest_cr_line,fico_range_high,fico_range_low,funded_amnt,inq_last_12m,installment,int_rate,issue_d,loan_amnt,mort_acc,mths_since_last_delinq,mths_since_recent_bc_dlq,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_rev_accts,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,open_acc,open_il_24m,open_rv_24m,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,revol_util,tax_liens,total_acc,addr_state_AK,addr_state_AL,addr_state_AR,addr_state_AZ,addr_state_CA,addr_state_CO,addr_state_CT,addr_state_DC,addr_state_DE,addr_state_FL,addr_state_GA,addr_state_HI,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,emp_length_1 year,emp_length_10+ years,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,term_ 36 months,term_ 60 months,title_Business,title_CC Consolidation,title_Car financing,title_Consolidate,title_Consolidation,title_Consolidation Loan,title_Credit Card,title_Credit Card Consolidation,title_Credit Card Loan,title_Credit Card Payoff,title_Credit Card Refinance,title_Credit Cards,title_Credit Consolidation,title_Credit card consolidation,title_Credit card refinancing,title_DEBT CONSOLIDATION,title_Debt,title_Debt Consolidation,title_Debt Consolidation.1,title_Debt Consolidation Loan,title_Debt Free,title_Debt consolidation,title_Freedom,title_Green loan,title_Home Improvement,title_Home Improvement Loan,title_Home buying,title_Home improvement,title_Loan,title_Major purchase,title_Medical expenses,title_Moving and relocation,title_My Loan,title_Other,title_Payoff,title_Personal,title_Personal Loan,title_Refinance,title_Small Business Loan,title_Vacation,title_Wedding Loan,title_consolidate,title_consolidation,title_consolidation loan,title_credit card consolidation,title_credit card payoff,title_credit card refinance,title_debt,title_debt consolidation,title_freedom,title_home improvement,title_loan,title_payoff,title_personal,title_personal loan,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,zip_code;_010xx;,zip_code;_011xx;,zip_code;_012xx;,zip_code;_014xx;,zip_code;_015xx;,zip_code;_016xx;,zip_code;_017xx;,zip_code;_018xx;,zip_code;_019xx;,zip_code;_020xx;,zip_code;_021xx;,zip_code;_023xx;,zip_code;_024xx;,zip_code;_025xx;,zip_code;_026xx;,zip_code;_027xx;,zip_code;_028xx;,zip_code;_029xx;,zip_code;_030xx;,zip_code;_031xx;,zip_code;_032xx;,zip_code;_034xx;,zip_code;_038xx;,zip_code;_040xx;,zip_code;_054xx;,zip_code;_056xx;,zip_code;_060xx;,zip_code;_061xx;,zip_code;_062xx;,zip_code;_063xx;,zip_code;_064xx;,zip_code;_065xx;,zip_code;_066xx;,zip_code;_067xx;,zip_code;_068xx;,zip_code;_069xx;,zip_code;_070xx;,zip_code;_071xx;,zip_code;_072xx;,zip_code;_073xx;,zip_code;_074xx;,zip_code;_075xx;,zip_code;_076xx;,zip_code;_077xx;,zip_code;_078xx;,zip_code;_079xx;,zip_code;_080xx;,zip_code;_081xx;,zip_code;_082xx;,zip_code;_083xx;,zip_code;_085xx;,zip_code;_086xx;,zip_code;_087xx;,zip_code;_088xx;,zip_code;_089xx;,zip_code;_100xx;,zip_code;_101xx;,zip_code;_103xx;,zip_code;_104xx;,zip_code;_105xx;,zip_code;_106xx;,zip_code;_107xx;,zip_code;_108xx;,zip_code;_109xx;,zip_code;_110xx;,zip_code;_111xx;,zip_code;_112xx;,zip_code;_113xx;,zip_code;_114xx;,zip_code;_115xx;,zip_code;_116xx;,zip_code;_117xx;,zip_code;_118xx;,zip_code;_119xx;,zip_code;_120xx;,zip_code;_121xx;,zip_code;_122xx;,zip_code;_123xx;,zip_code;_124xx;,zip_code;_125xx;,zip_code;_126xx;,zip_code;_127xx;,zip_code;_128xx;,zip_code;_129xx;,zip_code;_130xx;,zip_code;_131xx;,zip_code;_132xx;,zip_code;_133xx;,zip_code;_134xx;,zip_code;_136xx;,zip_code;_137xx;,zip_code;_138xx;,zip_code;_140xx;,zip_code;_141xx;,zip_code;_142xx;,zip_code;_144xx;,zip_code;_145xx;,zip_code;_146xx;,zip_code;_147xx;,zip_code;_148xx;,zip_code;_150xx;,zip_code;_151xx;,zip_code;_152xx;,zip_code;_153xx;,zip_code;_154xx;,zip_code;_156xx;,zip_code;_160xx;,zip_code;_161xx;,zip_code;_163xx;,zip_code;_164xx;,zip_code;_165xx;,zip_code;_166xx;,zip_code;_168xx;,zip_code;_170xx;,zip_code;_171xx;,zip_code;_172xx;,zip_code;_173xx;,zip_code;_174xx;,zip_code;_175xx;,zip_code;_176xx;,zip_code;_177xx;,zip_code;_178xx;,zip_code;_179xx;,zip_code;_180xx;,zip_code;_181xx;,zip_code;_182xx;,zip_code;_183xx;,zip_code;_184xx;,zip_code;_185xx;,zip_code;_186xx;,zip_code;_187xx;,zip_code;_189xx;,zip_code;_190xx;,zip_code;_191xx;,zip_code;_193xx;,zip_code;_194xx;,zip_code;_195xx;,zip_code;_196xx;,zip_code;_197xx;,zip_code;_198xx;,zip_code;_199xx;,zip_code;_200xx;,zip_code;_201xx;,zip_code;_206xx;,zip_code;_207xx;,zip_code;_208xx;,zip_code;_209xx;,zip_code;_210xx;,zip_code;_211xx;,zip_code;_212xx;,zip_code;_214xx;,zip_code;_215xx;,zip_code;_216xx;,zip_code;_217xx;,zip_code;_218xx;,zip_code;_219xx;,zip_code;_220xx;,zip_code;_221xx;,zip_code;_222xx;,zip_code;_223xx;,zip_code;_224xx;,zip_code;_225xx;,zip_code;_226xx;,zip_code;_227xx;,zip_code;_228xx;,zip_code;_229xx;,zip_code;_230xx;,zip_code;_231xx;,zip_code;_232xx;,zip_code;_233xx;,zip_code;_234xx;,zip_code;_235xx;,zip_code;_236xx;,zip_code;_238xx;,zip_code;_240xx;,zip_code;_241xx;,zip_code;_242xx;,zip_code;_243xx;,zip_code;_244xx;,zip_code;_245xx;,zip_code;_254xx;,zip_code;_260xx;,zip_code;_265xx;,zip_code;_270xx;,zip_code;_271xx;,zip_code;_272xx;,zip_code;_273xx;,zip_code;_274xx;,zip_code;_275xx;,zip_code;_276xx;,zip_code;_277xx;,zip_code;_278xx;,zip_code;_279xx;,zip_code;_280xx;,zip_code;_281xx;,zip_code;_282xx;,zip_code;_283xx;,zip_code;_284xx;,zip_code;_285xx;,zip_code;_286xx;,zip_code;_287xx;,zip_code;_288xx;,zip_code;_290xx;,zip_code;_291xx;,zip_code;_292xx;,zip_code;_293xx;,zip_code;_294xx;,zip_code;_295xx;,zip_code;_296xx;,zip_code;_297xx;,zip_code;_298xx;,zip_code;_299xx;,zip_code;_300xx;,zip_code;_301xx;,zip_code;_302xx;,zip_code;_303xx;,zip_code;_304xx;,zip_code;_305xx;,zip_code;_306xx;,zip_code;_307xx;,zip_code;_308xx;,zip_code;_309xx;,zip_code;_310xx;,zip_code;_312xx;,zip_code;_313xx;,zip_code;_314xx;,zip_code;_315xx;,zip_code;_316xx;,zip_code;_317xx;,zip_code;_318xx;,zip_code;_319xx;,zip_code;_320xx;,zip_code;_321xx;,zip_code;_322xx;,zip_code;_323xx;,zip_code;_324xx;,zip_code;_325xx;,zip_code;_326xx;,zip_code;_327xx;,zip_code;_328xx;,zip_code;_329xx;,zip_code;_330xx;,zip_code;_331xx;,zip_code;_333xx;,zip_code;_334xx;,zip_code;_335xx;,zip_code;_336xx;,zip_code;_337xx;,zip_code;_338xx;,zip_code;_339xx;,zip_code;_341xx;,zip_code;_342xx;,zip_code;_344xx;,zip_code;_346xx;,zip_code;_347xx;,zip_code;_349xx;,zip_code;_350xx;,zip_code;_351xx;,zip_code;_352xx;,zip_code;_354xx;,zip_code;_355xx;,zip_code;_356xx;,zip_code;_357xx;,zip_code;_358xx;,zip_code;_359xx;,zip_code;_360xx;,zip_code;_361xx;,zip_code;_362xx;,zip_code;_363xx;,zip_code;_365xx;,zip_code;_366xx;,zip_code;_368xx;,zip_code;_370xx;,zip_code;_371xx;,zip_code;_372xx;,zip_code;_373xx;,zip_code;_374xx;,zip_code;_376xx;,zip_code;_377xx;,zip_code;_378xx;,zip_code;_379xx;,zip_code;_380xx;,zip_code;_381xx;,zip_code;_383xx;,zip_code;_384xx;,zip_code;_385xx;,zip_code;_386xx;,zip_code;_388xx;,zip_code;_390xx;,zip_code;_391xx;,zip_code;_392xx;,zip_code;_393xx;,zip_code;_394xx;,zip_code;_395xx;,zip_code;_397xx;,zip_code;_400xx;,zip_code;_402xx;,zip_code;_403xx;,zip_code;_404xx;,zip_code;_405xx;,zip_code;_410xx;,zip_code;_420xx;,zip_code;_421xx;,zip_code;_423xx;,zip_code;_424xx;,zip_code;_427xx;,zip_code;_430xx;,zip_code;_431xx;,zip_code;_432xx;,zip_code;_433xx;,zip_code;_434xx;,zip_code;_435xx;,zip_code;_436xx;,zip_code;_437xx;,zip_code;_439xx;,zip_code;_440xx;,zip_code;_441xx;,zip_code;_442xx;,zip_code;_443xx;,zip_code;_444xx;,zip_code;_445xx;,zip_code;_446xx;,zip_code;_447xx;,zip_code;_448xx;,zip_code;_450xx;,zip_code;_451xx;,zip_code;_452xx;,zip_code;_453xx;,zip_code;_454xx;,zip_code;_456xx;,zip_code;_457xx;,zip_code;_458xx;,zip_code;_460xx;,zip_code;_461xx;,zip_code;_462xx;,zip_code;_463xx;,zip_code;_464xx;,zip_code;_465xx;,zip_code;_466xx;,zip_code;_467xx;,zip_code;_468xx;,zip_code;_469xx;,zip_code;_470xx;,zip_code;_471xx;,zip_code;_472xx;,zip_code;_473xx;,zip_code;_474xx;,zip_code;_475xx;,zip_code;_476xx;,zip_code;_477xx;,zip_code;_478xx;,zip_code;_479xx;,zip_code;_480xx;,zip_code;_481xx;,zip_code;_482xx;,zip_code;_483xx;,zip_code;_484xx;,zip_code;_485xx;,zip_code;_486xx;,zip_code;_487xx;,zip_code;_488xx;,zip_code;_489xx;,zip_code;_490xx;,zip_code;_492xx;,zip_code;_493xx;,zip_code;_494xx;,zip_code;_495xx;,zip_code;_496xx;,zip_code;_497xx;,zip_code;_498xx;,zip_code;_530xx;,zip_code;_531xx;,zip_code;_532xx;,zip_code;_534xx;,zip_code;_535xx;,zip_code;_537xx;,zip_code;_539xx;,zip_code;_540xx;,zip_code;_541xx;,zip_code;_543xx;,zip_code;_544xx;,zip_code;_546xx;,zip_code;_547xx;,zip_code;_548xx;,zip_code;_549xx;,zip_code;_550xx;,zip_code;_551xx;,zip_code;_553xx;,zip_code;_554xx;,zip_code;_557xx;,zip_code;_558xx;,zip_code;_559xx;,zip_code;_560xx;,zip_code;_562xx;,zip_code;_563xx;,zip_code;_564xx;,zip_code;_565xx;,zip_code;_570xx;,zip_code;_571xx;,zip_code;_577xx;,zip_code;_581xx;,zip_code;_587xx;,zip_code;_591xx;,zip_code;_594xx;,zip_code;_597xx;,zip_code;_598xx;,zip_code;_599xx;,zip_code;_600xx;,zip_code;_601xx;,zip_code;_604xx;,zip_code;_605xx;,zip_code;_606xx;,zip_code;_607xx;,zip_code;_608xx;,zip_code;_609xx;,zip_code;_610xx;,zip_code;_611xx;,zip_code;_612xx;,zip_code;_613xx;,zip_code;_615xx;,zip_code;_616xx;,zip_code;_617xx;,zip_code;_618xx;,zip_code;_620xx;,zip_code;_622xx;,zip_code;_624xx;,zip_code;_625xx;,zip_code;_626xx;,zip_code;_627xx;,zip_code;_628xx;,zip_code;_629xx;,zip_code;_630xx;,zip_code;_631xx;,zip_code;_633xx;,zip_code;_637xx;,zip_code;_640xx;,zip_code;_641xx;,zip_code;_645xx;,zip_code;_647xx;,zip_code;_648xx;,zip_code;_650xx;,zip_code;_652xx;,zip_code;_656xx;,zip_code;_657xx;,zip_code;_658xx;,zip_code;_660xx;,zip_code;_661xx;,zip_code;_662xx;,zip_code;_665xx;,zip_code;_666xx;,zip_code;_667xx;,zip_code;_670xx;,zip_code;_672xx;,zip_code;_674xx;,zip_code;_680xx;,zip_code;_681xx;,zip_code;_685xx;,zip_code;_700xx;,zip_code;_701xx;,zip_code;_703xx;,zip_code;_704xx;,zip_code;_705xx;,zip_code;_706xx;,zip_code;_707xx;,zip_code;_708xx;,zip_code;_710xx;,zip_code;_711xx;,zip_code;_712xx;,zip_code;_713xx;,zip_code;_714xx;,zip_code;_716xx;,zip_code;_719xx;,zip_code;_720xx;,zip_code;_721xx;,zip_code;_722xx;,zip_code;_723xx;,zip_code;_724xx;,zip_code;_725xx;,zip_code;_726xx;,zip_code;_727xx;,zip_code;_728xx;,zip_code;_729xx;,zip_code;_730xx;,zip_code;_731xx;,zip_code;_735xx;,zip_code;_740xx;,zip_code;_741xx;,zip_code;_743xx;,zip_code;_744xx;,zip_code;_748xx;,zip_code;_750xx;,zip_code;_751xx;,zip_code;_752xx;,zip_code;_754xx;,zip_code;_755xx;,zip_code;_756xx;,zip_code;_757xx;,zip_code;_759xx;,zip_code;_760xx;,zip_code;_761xx;,zip_code;_762xx;,zip_code;_763xx;,zip_code;_765xx;,zip_code;_766xx;,zip_code;_767xx;,zip_code;_769xx;,zip_code;_770xx;,zip_code;_773xx;,zip_code;_774xx;,zip_code;_775xx;,zip_code;_776xx;,zip_code;_777xx;,zip_code;_778xx;,zip_code;_779xx;,zip_code;_780xx;,zip_code;_781xx;,zip_code;_782xx;,zip_code;_783xx;,zip_code;_784xx;,zip_code;_785xx;,zip_code;_786xx;,zip_code;_787xx;,zip_code;_788xx;,zip_code;_790xx;,zip_code;_791xx;,zip_code;_793xx;,zip_code;_794xx;,zip_code;_795xx;,zip_code;_796xx;,zip_code;_797xx;,zip_code;_799xx;,zip_code;_800xx;,zip_code;_801xx;,zip_code;_802xx;,zip_code;_803xx;,zip_code;_804xx;,zip_code;_805xx;,zip_code;_806xx;,zip_code;_808xx;,zip_code;_809xx;,zip_code;_810xx;,zip_code;_812xx;,zip_code;_813xx;,zip_code;_815xx;,zip_code;_816xx;,zip_code;_820xx;,zip_code;_826xx;,zip_code;_827xx;,zip_code;_829xx;,zip_code;_836xx;,zip_code;_837xx;,zip_code;_838xx;,zip_code;_840xx;,zip_code;_841xx;,zip_code;_843xx;,zip_code;_844xx;,zip_code;_846xx;,zip_code;_847xx;,zip_code;_850xx;,zip_code;_851xx;,zip_code;_852xx;,zip_code;_853xx;,zip_code;_856xx;,zip_code;_857xx;,zip_code;_860xx;,zip_code;_863xx;,zip_code;_864xx;,zip_code;_870xx;,zip_code;_871xx;,zip_code;_874xx;,zip_code;_875xx;,zip_code;_880xx;,zip_code;_882xx;,zip_code;_890xx;,zip_code;_891xx;,zip_code;_894xx;,zip_code;_895xx;,zip_code;_897xx;,zip_code;_898xx;,zip_code;_900xx;,zip_code;_902xx;,zip_code;_903xx;,zip_code;_904xx;,zip_code;_905xx;,zip_code;_906xx;,zip_code;_907xx;,zip_code;_908xx;,zip_code;_910xx;,zip_code;_911xx;,zip_code;_912xx;,zip_code;_913xx;,zip_code;_914xx;,zip_code;_915xx;,zip_code;_916xx;,zip_code;_917xx;,zip_code;_918xx;,zip_code;_919xx;,zip_code;_920xx;,zip_code;_921xx;,zip_code;_922xx;,zip_code;_923xx;,zip_code;_924xx;,zip_code;_925xx;,zip_code;_926xx;,zip_code;_927xx;,zip_code;_928xx;,zip_code;_930xx;,zip_code;_931xx;,zip_code;_932xx;,zip_code;_933xx;,zip_code;_934xx;,zip_code;_935xx;,zip_code;_936xx;,zip_code;_937xx;,zip_code;_939xx;,zip_code;_940xx;,zip_code;_941xx;,zip_code;_944xx;,zip_code;_945xx;,zip_code;_946xx;,zip_code;_947xx;,zip_code;_948xx;,zip_code;_949xx;,zip_code;_950xx;,zip_code;_951xx;,zip_code;_952xx;,zip_code;_953xx;,zip_code;_954xx;,zip_code;_955xx;,zip_code;_956xx;,zip_code;_957xx;,zip_code;_958xx;,zip_code;_959xx;,zip_code;_960xx;,zip_code;_961xx;,zip_code;_967xx;,zip_code;_968xx;,zip_code;_970xx;,zip_code;_971xx;,zip_code;_972xx;,zip_code;_973xx;,zip_code;_974xx;,zip_code;_975xx;,zip_code;_977xx;,zip_code;_978xx;,zip_code;_980xx;,zip_code;_981xx;,zip_code;_982xx;,zip_code;_983xx;,zip_code;_984xx;,zip_code;_985xx;,zip_code;_986xx;,zip_code;_988xx;,zip_code;_989xx;,zip_code;_990xx;,zip_code;_991xx;,zip_code;_992xx;,zip_code;_993xx;,zip_code;_995xx;,zip_code;_996xx;,zip_code;_997xx;,zip_code;_Other
0,0,46209.0,0,0,0,34.98,2009-04-01,669,665,15700,6,437.65,22.45,2016-01-01,15700,3,48,48,2,1,4,9,11,0,0,0,4,11,3,3,75.0,0,0,61.7,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### __Преобразуем даты в числа__

In [104]:
X_train['earliest_cr_line'] = pd.to_numeric(X_train['earliest_cr_line'])
X_train['issue_d'] = pd.to_numeric(X_train['issue_d'])


In [105]:
X_test['earliest_cr_line'] = pd.to_numeric(X_test['earliest_cr_line'])
X_test['issue_d'] = pd.to_numeric(X_test['issue_d'])

### __Проверим готовность данных для использования в модели__

In [106]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Columns: 831 entries, acc_now_delinq to zip_code;_Other
dtypes: float64(801), int32(28), int64(2)
memory usage: 1.2 GB


In [107]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Columns: 834 entries, acc_now_delinq to zip_code;_Other
dtypes: float64(804), int32(28), int64(2)
memory usage: 1.2 GB


__В датафреймах после OneHotEncoding получилось разное количество признаков. Необходимо вывести разницу в данных и удалить, для приведение выборок в одинаковое состояние__

In [108]:
columns_X_train = set(X_train.columns)
columns_X_test = set(X_test.columns)
columns_not_in_X_test = columns_X_train - columns_X_test
columns_not_in_X_train = columns_X_test - columns_X_train

In [109]:
columns_not_in_X_train

{'title_Credit Consolidation',
 'title_Wedding Loan',
 'title_consolidation loan',
 'title_debt',
 'title_freedom',
 'zip_code;_034xx;',
 'zip_code;_126xx;',
 'zip_code;_164xx;',
 'zip_code;_215xx;',
 'zip_code;_393xx;',
 'zip_code;_397xx;',
 'zip_code;_424xx;',
 'zip_code;_475xx;',
 'zip_code;_477xx;',
 'zip_code;_581xx;',
 'zip_code;_587xx;',
 'zip_code;_624xx;',
 'zip_code;_645xx;',
 'zip_code;_725xx;',
 'zip_code;_777xx;',
 'zip_code;_795xx;',
 'zip_code;_813xx;'}

In [110]:
columns_not_in_X_test

{'title_Debt Loan',
 'zip_code;_013xx;',
 'zip_code;_042xx;',
 'zip_code;_050xx;',
 'zip_code;_159xx;',
 'zip_code;_261xx;',
 'zip_code;_401xx;',
 'zip_code;_449xx;',
 'zip_code;_491xx;',
 'zip_code;_596xx;',
 'zip_code;_614xx;',
 'zip_code;_655xx;',
 'zip_code;_671xx;',
 'zip_code;_678xx;',
 'zip_code;_688xx;',
 'zip_code;_718xx;',
 'zip_code;_764xx;',
 'zip_code;_855xx;',
 'zip_code;_883xx;'}

In [111]:
X_test.drop(['title_Credit Consolidation', 'title_Wedding Loan', 'title_consolidation loan', 'title_debt', 'title_freedom',
            'zip_code;_034xx;', 'zip_code;_126xx;', 'zip_code;_164xx;', 'zip_code;_215xx;', 'zip_code;_393xx;',
            'zip_code;_397xx;', 'zip_code;_424xx;', 'zip_code;_475xx;', 'zip_code;_477xx;', 'zip_code;_581xx;',
            'zip_code;_587xx;', 'zip_code;_624xx;', 'zip_code;_645xx;', 'zip_code;_725xx;', 'zip_code;_777xx;',
            'zip_code;_795xx;', 'zip_code;_813xx;'], axis = 1, inplace = True)

In [112]:
X_train.drop(['title_Debt Loan', 'zip_code;_013xx;', 'zip_code;_042xx;', 'zip_code;_050xx;', 'zip_code;_159xx;',
              'zip_code;_261xx;', 'zip_code;_401xx;', 'zip_code;_449xx;', 'zip_code;_491xx;', 'zip_code;_596xx;',
              'zip_code;_614xx;', 'zip_code;_655xx;', 'zip_code;_671xx;', 'zip_code;_678xx;', 'zip_code;_688xx;',
              'zip_code;_718xx;', 'zip_code;_764xx;', 'zip_code;_855xx;', 'zip_code;_883xx;'], axis = 1, inplace = True)

In [113]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Columns: 812 entries, acc_now_delinq to zip_code;_Other
dtypes: float64(782), int32(28), int64(2)
memory usage: 1.2 GB


In [114]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Columns: 812 entries, acc_now_delinq to zip_code;_Other
dtypes: float64(782), int32(28), int64(2)
memory usage: 1.2 GB


# __ПОДБОР МОДЕЛИ__

In [134]:
# Создание и обучение модели RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=21, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Получение предсказанных вероятностей
y_train_pred = rf_model.predict(X_train)

# Установка порогового значения
threshold = 0.5

# Преобразование вероятностей в бинарные предсказания
y_train_pred_binary = (y_train_pred > threshold).astype(int)



  rf_model.fit(X_train, y_train)


In [136]:
# Вычисление точности
accuracy_train = accuracy_score(y_train, y_train_pred_binary)
print('Точность accuracy на обучающей выборке: {:.3f}'.format(accuracy_train))

roc_auc_train = roc_auc_score(y_train, y_train_pred)
print('ROC AUC на обучающей выборке: {:.3f}'.format(roc_auc_train))

Точность accuracy на обучающей выборке: 0.879
ROC AUC на обучающей выборке: 0.941


In [137]:
# Вычисление по требуемой метрике Gini
gini_train = 2 * roc_auc_train - 1
print('Gini на обучающей выборке: {:.3f}'.format(gini_train))

Gini на обучающей выборке: 0.882


In [126]:
y_train_pred_binary = pd.DataFrame(y_train_pred_binary, columns=['loan_status'])

In [128]:
y_train_pred_binary

Unnamed: 0,loan_status
0,0
1,0
2,0
3,0
4,0
...,...
199994,0
199995,0
199996,0
199997,0


In [130]:
y_train_pred_binary.to_csv('./predict.csv')