# Описание задачи
## В задачи проекта входит:
- Обработка предоставленных данных: нахождение и заполнение пропусков, кодировка, нормализация признаков;
- Создание новых признаков с использованием предоставленных данных;
- Отбор признаков;
- Обучение и тестирование модели на полученных признаках;
- Оптимизация размера набора данных и подбор гиперпараметров модели на оптимизированном наборе;
- Получение предсказанных моделью значений, подготовка и отправка submission

## Описание датасета
Первоначальная версия датасета состоит из 14-ти столбцов, содержащих следующую информацию:

- **client_id** - идентификатор клиента
- **education** - уровень образования
- **sex** - пол заёмщика
- **age** - возраст заёмщика
- **car** - флаг наличия автомобиля
- **car_type** - флаг автомобиля-иномарки
- **decline_app_cnt** - количество отказанных прошлых заявок
- **good_work** - флаг наличия «хорошей» работы
- **bki_request_cnt** - количество запросов в БКИ
- **home_address** - категоризатор домашнего адреса
- **work_address** - категоризатор рабочего адреса
- **income** - доход заёмщика
- **foreign_passport** - наличие загранпаспорта
- **default** - наличие дефолта

# Импорт библиотек, установка параметров, определение функций

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_auc_score, roc_curve

import warnings
warnings.filterwarnings("ignore")

import os

In [127]:
# helper functions used throughout the notebook
def df_describe(df):
    """Build a per-column summary table for ``df``.

    Starts from ``df.describe(include='all', percentiles=[0.5])`` transposed,
    so that every row of the result describes one column of ``df``, and then
    adds or overwrites the following columns:

    * ``type``    - Python type of the first value of the column
                    (``None`` when ``df`` has no rows);
    * ``NaN``     - number of missing values;
    * ``not NaN`` - number of present values;
    * ``unique``  - number of distinct non-null values;
    * ``top``     - first mode of the column;
    * ``freq``    - count of the most frequent value
                    (NaN when the column has no non-null values).

    Parameters:
        df: the pandas DataFrame to summarise.

    Returns:
        A DataFrame indexed by the column names of ``df``.
    """
    desc = df.describe(include='all', percentiles=[0.5]).T
    has_rows = len(df) > 0

    # Read the type column by column: a row slice (df.iloc[0]) would upcast
    # mixed numeric dtypes to a common one and report the wrong type.
    desc['type'] = [type(col.iloc[0]) if has_rows else None for _, col in df.items()]
    desc['NaN'] = df.isna().sum()
    desc['not NaN'] = df.notna().sum()

    # Bracket assignment is required here: for an all-numeric frame describe()
    # yields no unique/top/freq columns, and attribute assignment would then
    # create a plain attribute instead of a column.
    desc['unique'] = df.nunique()

    modes = df.mode(axis=0)
    desc['top'] = modes.iloc[0] if len(modes) else float('nan')

    top_counts = []
    for _, col in df.items():
        counts = col.value_counts()
        # value_counts() is empty for a column without non-null values
        top_counts.append(counts.iloc[0] if len(counts) else float('nan'))
    desc['freq'] = top_counts
    return desc

def num_vis(row):
    """Placeholder, not implemented yet: ignores ``row`` and returns None."""
    return None

def nums_vis(columns_name):
    """Placeholder, not implemented yet: ignores ``columns_name`` and returns None."""
    return None

def pre_process(df):
    """Placeholder, not implemented yet: ignores ``df`` and returns None."""
    return None





In [64]:
# notebook-wide settings
# NOTE(review): %pylab is an IPython magic (this cell only runs under IPython/Jupyter);
# it populates the interactive namespace from numpy and matplotlib.
%pylab inline

pd.set_option('display.max_rows', 50) # show more rows
pd.set_option('display.max_columns', 30) # show more columns

# alternative data locations: Kaggle vs. local (DO NOT FORGET TO SWITCH!)
path = './Project_4_data/'
# path = '/kaggle/input/'

# seed constant; presumably meant for random_state arguments further down -
# it is not used in the visible part of the notebook (TODO confirm)
RANDOM_SEED = 42

Populating the interactive namespace from numpy and matplotlib


In [65]:
# sanity check of the data location: print every file found under `path`
for root, _subdirs, files in os.walk(path):
    for name in files:
        print(os.path.join(root, name))

./Project_4_data/train.csv


In [66]:
# load the data

# alternative inputs, kept for reference:
# df_train = pd.read_csv(f'{path}main_task.csv')
# df_test = pd.read_csv(f'{path}kaggle_task.csv')
# sample_submission = pd.read_csv(f'{path}sample_submission.csv')

# os.path.join works whether or not `path` ends with a separator,
# unlike plain string concatenation
data = pd.read_csv(os.path.join(path, 'train.csv'))

# Ознакомление с данными

In [128]:
# short summary of the frame (index range, column count, dtypes, memory usage)
data.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73799 entries, 0 to 73798
Columns: 14 entries, client_id to default
dtypes: int64(9), object(5)
memory usage: 7.9+ MB


In [129]:
# per-column summary table built by the df_describe helper
df_describe(data)

Unnamed: 0,count,unique,top,freq,mean,std,min,50%,max,type,NaN,not NaN
client_id,73799,73799,1,1,55138.0,31841.9,1.0,55274.0,110147.0,<class 'numpy.int64'>,0,73799
education,73492,5,SCH,38860,,,,,,<class 'str'>,307,73492
sex,73799,2,F,41562,,,,,,<class 'str'>,0,73799
age,73799,52,31,2727,39.2806,11.5204,21.0,37.0,72.0,<class 'numpy.int64'>,0,73799
car,73799,2,N,49832,,,,,,<class 'str'>,0,73799
car_type,73799,2,N,59791,,,,,,<class 'str'>,0,73799
decline_app_cnt,73799,21,0,61214,0.275749,0.804272,0.0,0.0,33.0,<class 'numpy.int64'>,0,73799
good_work,73799,2,0,61630,0.164894,0.371087,0.0,0.0,1.0,<class 'numpy.int64'>,0,73799
bki_request_cnt,73799,38,0,19381,2.00034,2.25207,0.0,1.0,53.0,<class 'numpy.int64'>,0,73799
home_address,73799,3,2,39956,1.57551,0.527631,1.0,2.0,3.0,<class 'numpy.int64'>,0,73799


Как видим, 9 признаков представлены числовым типом данных

In [8]:
# peek at the first three rows
data.head(3)

Unnamed: 0,client_id,education,sex,age,car,car_type,decline_app_cnt,good_work,bki_request_cnt,home_address,work_address,income,foreign_passport,default
0,25905,SCH,M,62,Y,Y,0,0,1,1,2,18000,N,0
1,63161,SCH,F,59,N,N,0,0,3,2,3,19000,N,0
2,25887,SCH,M,25,Y,N,2,0,1,1,2,30000,Y,0


In [11]:
# number of distinct values in every column
data.nunique()

client_id           73799
education               5
sex                     2
age                    52
car                     2
car_type                2
decline_app_cnt        21
good_work               2
bki_request_cnt        38
home_address            3
work_address            3
income                966
foreign_passport        2
default                 2
dtype: int64

In [57]:
# Summary table: describe() statistics extended with the type of the first
# value, the NaN count and the non-NaN count of every column.
desc = data.describe(include='all', percentiles=[0.5])

feature_type = pd.Series([type(x) for x in data.iloc[0]], index=data.columns, name='type')
feature_missings = data.isna().sum().rename('NaN')
feature_fillings = data.notna().sum().rename('not NaN')

# DataFrame.append was removed in pandas 2.0: stack the three extra rows
# with pd.concat instead, then transpose so that each feature is a row.
pd.concat([desc, pd.DataFrame([feature_type, feature_missings, feature_fillings])]).T

Unnamed: 0,count,unique,top,freq,mean,std,min,50%,max,type,NaN,not NaN
client_id,73799,,,,55138.0,31841.9,1.0,55274.0,110147.0,<class 'numpy.int64'>,0,73799
education,73492,5.0,SCH,38860.0,,,,,,<class 'str'>,307,73492
sex,73799,2.0,F,41562.0,,,,,,<class 'str'>,0,73799
age,73799,,,,39.2806,11.5204,21.0,37.0,72.0,<class 'numpy.int64'>,0,73799
car,73799,2.0,N,49832.0,,,,,,<class 'str'>,0,73799
car_type,73799,2.0,N,59791.0,,,,,,<class 'str'>,0,73799
decline_app_cnt,73799,,,,0.275749,0.804272,0.0,0.0,33.0,<class 'numpy.int64'>,0,73799
good_work,73799,,,,0.164894,0.371087,0.0,0.0,1.0,<class 'numpy.int64'>,0,73799
bki_request_cnt,73799,,,,2.00034,2.25207,0.0,1.0,53.0,<class 'numpy.int64'>,0,73799
home_address,73799,,,,1.57551,0.527631,1.0,2.0,3.0,<class 'numpy.int64'>,0,73799


client_id           73799
education           73492
sex                 73799
age                 73799
car                 73799
car_type            73799
decline_app_cnt     73799
good_work           73799
bki_request_cnt     73799
home_address        73799
work_address        73799
income              73799
foreign_passport    73799
default             73799
dtype: int64

In [27]:

# NOTE(review): `aa` is not defined anywhere in the visible part of the notebook;
# judging by the output below it presumably held a describe(include='all') result - confirm or drop this cell
aa

Unnamed: 0,client_id,education,sex,age,car,car_type,decline_app_cnt,good_work,bki_request_cnt,home_address,work_address,income,foreign_passport,default
count,73799.0,73492,73799,73799.0,73799,73799,73799.0,73799.0,73799.0,73799.0,73799.0,73799.0,73799,73799.0
unique,,5,2,,2,2,,,,,,,2,
top,,SCH,F,,N,N,,,,,,,N,
freq,,38860,41562,,49832,59791,,,,,,,62733,
mean,55137.965094,,,39.280641,,,0.275749,0.164894,2.000339,1.575509,2.49628,41099.775417,,0.126994
std,31841.921168,,,11.520378,,,0.804272,0.371087,2.252073,0.527631,0.689818,46166.3224,,0.332968
min,1.0,,,21.0,,,0.0,0.0,0.0,1.0,1.0,1000.0,,0.0
50%,55274.0,,,37.0,,,0.0,0.0,1.0,2.0,3.0,30000.0,,0.0
max,110147.0,,,72.0,,,33.0,1.0,53.0,3.0,3.0,1000000.0,,1.0


In [10]:
# Group the columns by the number of distinct values they hold:
# exactly 2 -> binary, 3 to 6 -> categorical, 7 or more -> numeric.
num_uniq = data.nunique()
bin_cols = data.columns[num_uniq.eq(2)]
cat_cols = data.columns[num_uniq.between(3, 6)]
num_cols = data.columns[num_uniq.ge(7)]

# Работа с предоставленными данными

In [14]:
# short summary of the frame again, at the start of the data-processing section
data.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73799 entries, 0 to 73798
Columns: 14 entries, client_id to default
dtypes: int64(9), object(5)
memory usage: 7.9+ MB
None
