# Заливка данных

In [1]:
import pandas as pd
import numpy as np

### Чтение csv-файла

In [2]:
# Load the German credit dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='german-credit.csv')

In [3]:
# Show floats with ',' as the thousands separator and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format

In [4]:
# Render the first five rows, using ',' as the thousands separator for all numbers.
df.head(n=5).style.format(thousands=',')

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,default
0,0,67,male,2,own,,little,1169,6,radio/TV,0
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,1
2,2,49,male,1,own,little,,2096,12,education,0
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,4,53,male,2,free,little,little,4870,24,car,1


# Предобработка

### Удаление столбца

In [5]:
# Remove the 'Unnamed: 0' column from df in place.
df.drop(columns='Unnamed: 0', inplace=True)

### Переименование столбцов

In [6]:
# Normalise column labels: lower-case them and replace spaces with underscores.
df = df.rename(
    columns={label: label.lower().replace(' ', '_') for label in df.columns}
)

In [7]:
# Preview the first five rows after the column rename.
df.head(n=5)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,default
0,67,male,2,own,,little,1169,6,radio/TV,0
1,22,female,2,own,little,moderate,5951,48,radio/TV,1
2,49,male,1,own,little,,2096,12,education,0
3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,53,male,2,free,little,little,4870,24,car,1


In [13]:
# Age statistics (count, mean, median, min, max) for every (housing, sex) pair.
t = (
    df.groupby(['housing', 'sex'])['age']
    .agg(['count', 'mean', 'median', 'min', 'max'])
)

In [16]:
# Show the first five groups of the age summary.
t.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,median,min,max
housing,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
free,female,19,48.11,48.0,24,74
free,male,89,42.9,42.0,22,75
own,female,196,33.67,30.0,20,75
own,male,517,36.32,34.0,20,74
rent,female,95,27.96,24.0,19,59


In [15]:
# Total of the per-group row counts.
t.loc[:, 'count'].sum()

1000

### Агрегирование

#### встроенные функции агрегирования

In [42]:
# For each (checking_account, sex) group: mean and count of `default`,
# then count and median of `credit_amount`.
df.groupby(['checking_account', 'sex'])[['credit_amount', 'default']].agg(
    {
        'default': ['mean', 'count'],
        'credit_amount': ['count', 'median'],
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,default,default,credit_amount,credit_amount
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,count,median
checking_account,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
little,female,0.52,88,88,1838.5
little,male,0.48,186,186,2589.5
moderate,female,0.48,86,86,2651.0
moderate,male,0.35,183,183,2520.0
rich,female,0.2,20,20,1400.5
rich,male,0.23,43,43,1925.0


Базовые математические функции: количество (count), сумма (sum), среднее значение (mean), медианное значение (median), минимум (minimum), максимум (maximum), стандартное отклонение (standard deviation), дисперсия (variance), среднее абсолютное отклонение (mean absolute deviation) и произведение (product).

In [35]:
def mad(values):
    """Return the mean absolute deviation of *values* around their mean.

    Stands in for the 'mad' string aggregation, which pandas removed in 2.0.
    The function is deliberately named ``mad`` so the resulting column keeps
    the label 'mad'.
    """
    return (values - values.mean()).abs().mean()


# Aggregations to apply to the credit_amount column.
agg_func_math = {
    'credit_amount': [
        'count', 'sum', 'mean', 'median', 'min', 'max', 'std', 'var', mad, 'prod',
    ]
}

In [36]:
# dropna=False keeps the rows whose checking_account is NaN as a separate group.
(
    df.groupby(by='checking_account', dropna=False)
    .agg(agg_func_math)
    .round(decimals=2)
)

Unnamed: 0_level_0,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount
Unnamed: 0_level_1,count,sum,mean,median,min,max,std,var,mad,prod
checking_account,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
little,274,870010,3175.22,2353.5,338,15857,2636.38,6950520.91,1919.64,inf
moderate,269,1029614,3827.56,2622.0,276,18424,3465.2,12007639.4,2595.81,inf
rich,63,137192,2177.65,1881.0,392,6289,1343.19,1804156.2,1070.15,"18,952,449,088,117,196,686,161,985,600,512,499,..."
,394,1234442,3133.1,2248.0,250,15653,2554.16,6523755.47,1878.22,inf


In [37]:
# Give missing checking_account values the explicit label 'no_info'.
checking_account = df['checking_account']
df['checking_account'] = checking_account.fillna(value='no_info')

In [56]:
# The same credit_amount aggregations per checking_account, rounded to 2 decimals.
(
    df.groupby(by='checking_account')
    .agg(agg_func_math)
    .round(decimals=2)
)

Unnamed: 0_level_0,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount
Unnamed: 0_level_1,count,sum,mean,median,min,max,std,var,mad,prod
checking_account,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
little,274,870010,3175.22,2353.5,338,15857,2636.38,6950520.91,1919.64,inf
moderate,269,1029614,3827.56,2622.0,276,18424,3465.2,12007639.4,2595.81,inf
no_info,394,1234442,3133.1,2248.0,250,15653,2554.16,6523755.47,1878.22,inf
rich,63,137192,2177.65,1881.0,392,6289,1343.19,1804156.2,1070.15,1.8952450000000002e+205


In [41]:
# 'describe' requests the descriptive-statistics summary for credit_amount.
agg_func_describe = dict(credit_amount=['describe'])

In [69]:
# Descriptive statistics of credit_amount per checking_account, rounded to 2 decimals.
(
    df.groupby(by='checking_account')
    .agg(agg_func_describe)
    .round(decimals=2)
)

Unnamed: 0_level_0,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount,credit_amount
Unnamed: 0_level_1,describe,describe,describe,describe,describe,describe,describe,describe
Unnamed: 0_level_2,count,mean,std,min,25%,50%,75%,max
checking_account,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
little,274.0,3175.22,2636.38,338.0,1353.5,2353.5,3954.0,15857.0
moderate,269.0,3827.56,3465.2,276.0,1391.0,2622.0,5084.0,18424.0
no_info,394.0,3133.1,2554.16,250.0,1414.25,2248.0,3804.0,15653.0
rich,63.0,2177.65,1343.19,392.0,1275.0,1881.0,2969.5,6289.0


In [30]:
# Count and mean of `default` and `credit_amount`, with checking_account as
# rows and sex as columns; margins=True adds the 'All' totals.
pd.pivot_table(
    df,
    values=['default', 'credit_amount'],
    index='checking_account',
    columns='sex',
    aggfunc=['count', 'mean'],
    margins=True,
)

Unnamed: 0_level_0,count,count,count,count,count,count,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,credit_amount,credit_amount,credit_amount,default,default,default,credit_amount,credit_amount,credit_amount,default,default,default
sex,female,male,All,female,male,All,female,male,All,female,male,All
checking_account,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
little,88,186,274,88,186,274,2564.22,3464.3,3175.22,0.52,0.48,0.49
moderate,86,183,269,86,183,269,3720.03,3878.09,3827.56,0.48,0.35,0.39
rich,20,43,63,20,43,63,1616.05,2438.86,2177.65,0.2,0.23,0.22
All,194,412,606,194,412,606,2978.84,3541.07,3361.08,0.47,0.4,0.42


In [39]:
# Row count and mean of `default` for each checking_account value.
(
    df.groupby('checking_account')['default']
    .agg(['count', 'mean'])
)

Unnamed: 0_level_0,count,mean
checking_account,Unnamed: 1_level_1,Unnamed: 2_level_1
little,274,0.49
moderate,269,0.39
no_info,394,0.12
rich,63,0.22


In [42]:
def _default_rate_color(rate):
    """Return the CSS background for one cell of the 'mean' column.

    Values above 0.3 are painted green, values in (0.2, 0.3] blue, and
    anything else (including NaN) is left unstyled.
    """
    if rate > 0.3:
        return 'background-color : green'
    if rate > 0.2:
        return 'background-color : blue'
    return ''


# Row count and mean of `default` per checking_account, largest groups first.
_styled = (
    df.groupby('checking_account')['default']
    .agg(['count', 'mean'])
    .sort_values('count', ascending=False)
    .style
    # '.2f' pins two decimal places; the previous '.2' meant two significant
    # digits, so e.g. 0.3 was rendered as '0.3' rather than '0.30'.
    .format({'mean': '{:,.2f}'.format})
)
# Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever this
# pandas version provides so the cell runs without a deprecation warning.
_cellwise = getattr(_styled, 'map', None) or _styled.applymap
_cellwise(_default_rate_color, subset=['mean'])

Unnamed: 0_level_0,count,mean
checking_account,Unnamed: 1_level_1,Unnamed: 2_level_1
no_info,394,0.12
little,274,0.49
moderate,269,0.39
rich,63,0.22


In [43]:
# Keep the pivot (count and mean of `default` and `credit_amount` by
# checking_account x sex, with 'All' margins) in t.
t = pd.pivot_table(
    df,
    values=['default', 'credit_amount'],
    index='checking_account',
    columns='sex',
    aggfunc=['count', 'mean'],
    margins=True,
)

In [44]:
# Collapse each multi-level column key into a single '_'-joined label.
t.columns = t.columns.map(lambda levels: '_'.join(levels).strip())

In [45]:
# Display the pivot table held in t.
t

Unnamed: 0_level_0,count_credit_amount_female,count_credit_amount_male,count_credit_amount_All,count_default_female,count_default_male,count_default_All,mean_credit_amount_female,mean_credit_amount_male,mean_credit_amount_All,mean_default_female,mean_default_male,mean_default_All
checking_account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
little,88,186,274,88,186,274,2564.22,3464.3,3175.22,0.52,0.48,0.49
moderate,86,183,269,86,183,269,3720.03,3878.09,3827.56,0.48,0.35,0.39
no_info,116,278,394,116,278,394,2708.75,3310.17,3133.1,0.16,0.1,0.12
rich,20,43,63,20,43,63,1616.05,2438.86,2177.65,0.2,0.23,0.22
All,310,690,1000,310,690,1000,2877.77,3448.04,3271.26,0.35,0.28,0.3
