# Заливка данных

In [1]:
import pandas as pd
import numpy as np

### Чтение csv-файла

In [47]:
# Load the dataset from the CSV file into a DataFrame.
csv_path = 'german-credit.csv'
df = pd.read_csv(csv_path)

In [48]:
# Render floats with ',' as the thousands separator and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format

In [50]:
# Preview the first rows, using ',' as the thousands separator for all numbers.
preview = df.head()
preview.style.format(thousands=',')

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,default
0,0,67,male,2,own,,little,1169,6,radio/TV,0
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,1
2,2,49,male,1,own,little,,2096,12,education,0
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,4,53,male,2,free,little,little,4870,24,car,1


# Предобработка

### Удаление столбца

In [31]:
# Remove the 'Unnamed: 0' column from df in place.
del df['Unnamed: 0']

### Переименование столбцов

In [33]:
def to_snake_case(label):
    """Return the column label lower-cased, with spaces replaced by underscores."""
    return label.lower().replace(' ', '_')


# Normalise every column name, e.g. 'Credit amount' -> 'credit_amount'.
df = df.rename(columns=to_snake_case)

In [41]:
# Show the first five rows to check the renamed columns.
df.head(n=5)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default
0,67,male,2,own,,little,1169,6,radio/TV,0
1,22,female,2,own,little,moderate,5951,48,radio/TV,1
2,49,male,1,own,little,,2096,12,education,0
3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,53,male,2,free,little,little,4870,24,car,1


### Агрегирование

#### встроенные функции агрегирования

In [42]:
# Per (checking_account, sex) group: mean and count of `default`,
# count and median of `credit_amount`. Dict order fixes the column order.
agg_spec = {
    'default': ['mean', 'count'],
    'credit_amount': ['count', 'median'],
}
grouped_cols = df.groupby(['checking_account', 'sex'])[['credit_amount', 'default']]
grouped_cols.agg(agg_spec)

Unnamed: 0_level_0,Unnamed: 1_level_0,default,default,credit_amount,credit_amount
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,count,median
checking_account,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
little,female,0.52,88,88,1838.5
little,male,0.48,186,186,2589.5
moderate,female,0.48,86,86,2651.0
moderate,male,0.35,183,183,2520.0
rich,female,0.2,20,20,1400.5
rich,male,0.23,43,43,1925.0


Базовые математические функции агрегирования: количество (count), сумма (sum), среднее значение (mean), медиана (median), минимум (min), максимум (max), стандартное отклонение (std), дисперсия (var), среднее абсолютное отклонение (mad) и произведение (prod).

In [43]:
def mad(values):
    """Return the mean absolute deviation of *values* around their mean.

    Replaces the 'mad' string alias: Series.mad / DataFrame.mad were
    deprecated in pandas 1.5 and removed in pandas 2.0, so the alias no
    longer resolves. The function is deliberately named ``mad`` so that the
    resulting column keeps the label 'mad'.
    """
    return (values - values.mean()).abs().mean()


# List of aggregation functions to apply to a specific column.
# NOTE: 'prod' easily exceeds the float range on large groups and is then
# reported as inf.
agg_func_math = {
    'credit_amount': [
        'count', 'sum', 'mean', 'median', 'min', 'max', 'std', 'var', mad, 'prod',
    ],
}

In [45]:
# dropna=False keeps rows whose checking_account is NaN as a separate group.
by_account_with_nan = df.groupby('checking_account', dropna=False)
by_account_with_nan.agg(agg_func_math).round(2)

Unnamed: 0_level_0,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount
Unnamed: 0_level_1,count,sum,mean,median,min,max,std,var,mad,prod
checking_account,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
little,274,870010,3175.22,2353.5,338,15857,2636.38,6950520.91,1919.64,inf
moderate,269,1029614,3827.56,2622.0,276,18424,3465.2,12007639.4,2595.81,inf
rich,63,137192,2177.65,1881.0,392,6289,1343.19,1804156.2,1070.15,1.8952449088117201e+205
,394,1234442,3133.1,2248.0,250,15653,2554.16,6523755.47,1878.22,inf


In [55]:
# Label missing checking_account values explicitly as 'no_info'.
missing_account = df['checking_account'].isna()
df.loc[missing_account, 'checking_account'] = 'no_info'

In [56]:
# Aggregate credit_amount per checking_account group, rounded to 2 decimals.
math_stats = df.groupby('checking_account').agg(agg_func_math)
math_stats.round(2)

Unnamed: 0_level_0,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount
Unnamed: 0_level_1,count,sum,mean,median,min,max,std,var,mad,prod
checking_account,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
little,274,870010,3175.22,2353.5,338,15857,2636.38,6950520.91,1919.64,inf
moderate,269,1029614,3827.56,2622.0,276,18424,3465.2,12007639.4,2595.81,inf
no_info,394,1234442,3133.1,2248.0,250,15653,2554.16,6523755.47,1878.22,inf
rich,63,137192,2177.65,1881.0,392,6289,1343.19,1804156.2,1070.15,1.8952450000000002e+205


In [41]:
# 'describe' produces the descriptive statistics
# (count, mean, std, min, quartiles, max) for the column.
agg_func_describe = dict(credit_amount=['describe'])

In [69]:
# Descriptive statistics of credit_amount per checking_account group.
describe_stats = df.groupby('checking_account').agg(agg_func_describe)
describe_stats.round(2)

Unnamed: 0_level_0,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount
Unnamed: 0_level_1,describe,describe,describe,describe,describe,describe,describe,describe
Unnamed: 0_level_2,count,mean,std,min,25%,50%,75%,max
checking_account,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
little,274.0,3175.22,2636.38,338.0,1353.5,2353.5,3954.0,15857.0
moderate,269.0,3827.56,3465.2,276.0,1391.0,2622.0,5084.0,18424.0
no_info,394.0,3133.1,2554.16,250.0,1414.25,2248.0,3804.0,15653.0
rich,63.0,2177.65,1343.19,392.0,1275.0,1881.0,2969.5,6289.0
