# Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler

In [3]:
# Load the prepared dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('cleanest_data.csv')

## Попередній огляд даних

In [4]:
# Bare last expression: the notebook renders the frame as a table.
data

Unnamed: 0,is_successful,create_date,order_amount,order_messages,order_changes,partner_success_rate,partner_total_orders,partner_order_age_days,partner_avg_amount,partner_success_avg_amount,...,partner_success_avg_changes,partner_fail_avg_changes,day_of_week,month,quarter,hour_of_day,order_lines_count,discount_total,salesperson,source
0,1,2017-07-29 07:48:26.812523,5235.66,25,22,0.000000,0,0,0.000000,0.000000,...,0.000000,0.000,Saturday,July,3,7,6,0.0,user-1-76,False
1,1,2017-07-29 07:54:09.954757,876.96,10,5,0.000000,0,0,0.000000,0.000000,...,0.000000,0.000,Saturday,July,3,7,3,0.0,user-1-76,False
2,1,2017-07-29 08:04:13.162858,3012.77,7,4,0.000000,0,0,0.000000,0.000000,...,0.000000,0.000,Saturday,July,3,8,4,0.0,user-1-9,False
3,1,2017-07-29 08:11:38.086709,621.34,10,6,0.000000,0,0,0.000000,0.000000,...,0.000000,0.000,Saturday,July,3,8,4,0.0,user-1-2,False
4,1,2017-07-29 08:15:05.548616,813.12,6,3,0.000000,0,0,0.000000,0.000000,...,0.000000,0.000,Saturday,July,3,8,3,0.0,user-1-9,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86789,1,2024-12-11 11:09:57.124395,0.53,5,2,11.111111,9,57,71388.001111,0.000000,...,4.000000,3.375,Wednesday,December,4,11,1,0.0,user-1-49,False
86790,0,2024-12-16 08:38:35.387458,697.37,2,0,100.000000,24,752,530.264583,530.264583,...,2.458333,0.000,Monday,December,4,8,3,0.0,user-1-113,False
86791,1,2024-12-19 13:59:08.130686,129.96,2,1,0.000000,0,0,0.000000,0.000000,...,0.000000,0.000,Thursday,December,4,13,1,0.0,user-1-2,False
86792,1,2025-01-02 08:33:33.424152,129.96,2,1,100.000000,1,13,129.960000,129.960000,...,1.000000,0.000,Thursday,January,1,8,1,0.0,user-1-2,False


## ДОСЛІДЖЕННЯ РОЗПОДІЛУ

*Перевірка на асиметрію*:

In [5]:
# Skewness of every numeric column (non-numeric columns are skipped via numeric_only=True).
data.skew(numeric_only=True)

is_successful                    -0.542771
order_amount                     63.597978
order_messages                    5.475379
order_changes                     8.201835
partner_success_rate             -0.708427
partner_total_orders              3.626058
partner_order_age_days            0.570445
partner_avg_amount               92.806970
partner_success_avg_amount       18.085258
partner_fail_avg_amount          46.378813
partner_total_messages            3.568156
partner_success_avg_messages      1.976863
partner_fail_avg_messages         4.583495
partner_avg_changes               4.765490
partner_success_avg_changes       2.553038
partner_fail_avg_changes          7.842901
quarter                           0.015067
hour_of_day                       0.331366
order_lines_count               175.787705
discount_total                  113.972894
dtype: float64

### Статистичні тести нормальності розподілу
**Тест Шапіро-Уілка** (scipy.stats.shapiro)
- Дуже точний, але для малих вибірок (до 5000 записів).

**Тест Д'Агостіно-Пірсона** (scipy.stats.normaltest)
- Підходить для великих вибірок.

**Тест Андерсона-Дарлінга** (scipy.stats.anderson)
- Ще один строгий тест для нормальності.

In [6]:
from scipy.stats import normaltest, anderson

In [7]:
# Restrict the analysis to the numeric columns
numeric_cols = data.select_dtypes(include=[np.number]).columns

# One record per column is collected here and turned into a table afterwards
results = []

for col in numeric_cols:
    series = data[col]

    # D'Agostino-Pearson omnibus test
    dagostino = normaltest(series)

    # Anderson-Darling test; entry 2 of critical_values is the 5% significance threshold
    ad = anderson(series)
    ad_critical_5 = ad.critical_values[2]

    # A column counts as "normal" when the test does not reject normality at 5%
    results.append({
        'column': col,
        'dagostino_stat': dagostino.statistic,
        'dagostino_p': dagostino.pvalue,
        'dagostino_is_normal': dagostino.pvalue > 0.05,
        'anderson_stat': ad.statistic,
        'anderson_critical_5%': ad_critical_5,
        'anderson_is_normal': ad.statistic < ad_critical_5
    })

# Build the summary table from the collected records
normality_results = pd.DataFrame(results)

In [8]:
# Show the per-column normality test summary built in the previous cell.
normality_results

Unnamed: 0,column,dagostino_stat,dagostino_p,dagostino_is_normal,anderson_stat,anderson_critical_5%,anderson_is_normal
0,is_successful,374718.8,0.0,False,17025.011072,0.787,False
1,order_amount,297068.2,0.0,False,26444.435265,0.787,False
2,order_messages,96401.25,0.0,False,7509.998823,0.787,False
3,order_changes,127814.4,0.0,False,7624.119674,0.787,False
4,partner_success_rate,8396.103,0.0,False,2632.940401,0.787,False
5,partner_total_orders,66082.09,0.0,False,11546.87962,0.787,False
6,partner_order_age_days,10442.13,0.0,False,1867.569814,0.787,False
7,partner_avg_amount,337083.1,0.0,False,22123.840104,0.787,False
8,partner_success_avg_amount,181832.4,0.0,False,13989.165778,0.787,False
9,partner_fail_avg_amount,266663.3,0.0,False,21310.127781,0.787,False


Пояснення показників:

| Показник           | Що означає                                                               |
| :----------------- | :----------------------------------------------------------------------- |
| dagostino\_stat    | Статистика тесту Д’Агостіно-Пірсона (скошеність + ексцес).                 |
| dagostino\_p       | P-значення тесту Д’Агостіно-Пірсона.                                     |
| dagostino\_is\_normal | Чи приймаємо нормальність за результатами Д’Агостіно-Пірсона?           |
| anderson\_stat     | Статистика тесту Андерсона-Дарлінга.                                     |
| anderson\_critical\_5% | Критичне значення тесту Андерсона-Дарлінга при рівні 5%.                |
| anderson\_is\_normal | Чи приймаємо нормальність за результатами Андерсона-Дарлінга?           |

**Висновок:** *всі дані мають ненормальний розподіл*

### ДОСЛІДЖЕННЯ ЧИСЛОВИХ СТАТИСТИК

In [9]:
def numeric_statistic(data):
    """Summarise every numeric column of ``data`` with basic location statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column, with the columns ``column``, ``min``,
        ``max``, ``mean``, ``median`` and ``mode``. ``mode`` is the first
        value returned by ``Series.mode()``, or ``None`` when it returns
        nothing. When ``data`` has no numeric columns the result has zero
        rows but still carries these six columns.
    """
    stat_names = ['column', 'min', 'max', 'mean', 'median', 'mode']

    records = []
    for col in data.select_dtypes(include=['number']).columns:
        series = data[col]

        # mode() may return several values or none at all; compute it once
        # and keep only the first entry.
        modes = series.mode()

        records.append({
            'column': col,
            'min': series.min(),
            'max': series.max(),
            'mean': series.mean(),
            'median': series.median(),
            'mode': modes.iloc[0] if not modes.empty else None
        })

    # Passing the column list keeps the schema stable even with zero records.
    return pd.DataFrame(records, columns=stat_names)

In [10]:
# Compute min / max / mean / median / mode for every numeric column of data.
num_stat = numeric_statistic(data)

In [11]:
# Show the per-column statistics table.
num_stat

Unnamed: 0,column,min,max,mean,median,mode
0,is_successful,0.0,1.0,0.630954,1.0,1.0
1,order_amount,-6242.4,4140000.0,3824.406985,615.985,0.0
2,order_messages,1.0,282.0,9.408542,7.0,6.0
3,order_changes,0.0,274.0,3.904521,3.0,2.0
4,partner_success_rate,0.0,100.0,59.600411,66.666667,0.0
5,partner_total_orders,0.0,1307.0,83.696258,26.0,0.0
6,partner_order_age_days,0.0,2685.0,899.232136,764.0,0.0
7,partner_avg_amount,0.0,2700000.0,2914.208745,1127.518158,0.0
8,partner_success_avg_amount,0.0,119310.0,1597.019479,933.96,0.0
9,partner_fail_avg_amount,0.0,2700000.0,4112.98543,821.6,0.0


#### Заміна від'ємних на 0

In [12]:
# Select only the numeric columns
numeric_columns = data.select_dtypes(include=['number']).columns

# Replace negative values with 0 in the numeric columns.
# Done with a vectorised Series.where instead of a Python-level lambda per
# element: strictly positive values are kept and everything else is set to 0.
# NaN compares False against 0, so it is also set to 0, which is the same
# result the previous element-wise max(0, x) produced.
for col in numeric_columns:
    data[col] = data[col].where(data[col] > 0, 0)

####  Виключення даних за 2025 рік

In [13]:
# Parse 'create_date' into datetime values
data['create_date'] = pd.to_datetime(data['create_date'])

# Keep only the rows created before 2025; .copy() detaches the result from the original frame
created_before_2025 = data['create_date'].dt.year < 2025
data = data.loc[created_before_2025].copy()

#### Нормалізація `partner_success_rate`

In [14]:
# Divide the rate by 100 (re-running this cell divides again).
data['partner_success_rate'] = data['partner_success_rate'].div(100)

### МАСШТАБУВАННЯ

In [15]:
# Columns that will be passed through the scaler
columns_to_scale = [
    'order_amount',
    'order_messages',
    'order_changes',
    'partner_total_orders',
    'partner_order_age_days',
    'partner_avg_amount',
    'partner_success_avg_amount',
    'partner_fail_avg_amount',
    'partner_total_messages',
    'partner_success_avg_messages',
    'partner_fail_avg_messages',
    'partner_avg_changes',
    'partner_success_avg_changes',
    'partner_fail_avg_changes',
    'order_lines_count',
    'discount_total'
]

In [16]:
from sklearn.preprocessing import RobustScaler

In [17]:
# Initialise RobustScaler with its default settings
scaler = RobustScaler()

# Fit on, and transform, only the columns listed in columns_to_scale; the result is a plain array.
# NOTE(review): the scaler is fitted on every row of data. If a train/test or K-fold split
# happens later, fitting before the split leaks statistics from held-out rows — confirm intended.
scaled_array = scaler.fit_transform(data[columns_to_scale])

In [18]:
# Convert the scaled array back into a DataFrame.
# Reuse data's index: data was row-filtered earlier without resetting its index,
# and the later assignment back into data aligns on index labels. With a default
# RangeIndex here, scaled values would land on the wrong rows (or become NaN)
# wherever the labels differ.
scaled_df = pd.DataFrame(scaled_array, columns=list(columns_to_scale), index=data.index)

In [19]:
# Replace the original columns in data with their scaled values.
# NOTE(review): DataFrame-to-DataFrame assignment aligns on index labels, not on position,
# so scaled_df must carry the same index as data for the rows to match up — verify.
data[columns_to_scale] = scaled_df[columns_to_scale]

In [20]:
# Display the frame after scaling.
data

Unnamed: 0,is_successful,create_date,order_amount,order_messages,order_changes,partner_success_rate,partner_total_orders,partner_order_age_days,partner_avg_amount,partner_success_avg_amount,...,partner_success_avg_changes,partner_fail_avg_changes,day_of_week,month,quarter,hour_of_day,order_lines_count,discount_total,salesperson,source
0,1,2017-07-29 07:48:26.812523,2.421132,3.000000,9.5,0.000000,-0.305882,-0.630884,-0.544190,-0.687509,...,-1.755274,-1.775,Saturday,July,3,7,1.5,0.0,user-1-76,False
1,1,2017-07-29 07:54:09.954757,0.136770,0.500000,1.0,0.000000,-0.305882,-0.630884,-0.544190,-0.687509,...,-1.755274,-1.775,Saturday,July,3,7,0.0,0.0,user-1-76,False
2,1,2017-07-29 08:04:13.162858,1.256132,0.000000,0.5,0.000000,-0.305882,-0.630884,-0.544190,-0.687509,...,-1.755274,-1.775,Saturday,July,3,8,0.5,0.0,user-1-9,False
3,1,2017-07-29 08:11:38.086709,0.002801,0.500000,1.5,0.000000,-0.305882,-0.630884,-0.544190,-0.687509,...,-1.755274,-1.775,Saturday,July,3,8,0.5,0.0,user-1-2,False
4,1,2017-07-29 08:15:05.548616,0.103312,-0.166667,0.0,0.000000,-0.305882,-0.630884,-0.544190,-0.687509,...,-1.755274,-1.775,Saturday,July,3,8,0.0,0.0,user-1-9,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86787,1,2024-12-09 10:22:13.166600,-0.215944,-0.833333,-1.0,0.816514,0.976471,1.586292,2.705002,3.408798,...,1.121130,1.495,Monday,December,4,10,-0.5,0.0,user-1-10,False
86788,1,2024-12-10 11:07:58.049169,-0.266079,-0.333333,0.5,0.000000,-0.305882,-0.630884,-0.544190,-0.687509,...,-1.755274,-1.775,Tuesday,December,4,11,-1.0,0.0,user-1-39,False
86789,1,2024-12-11 11:09:57.124395,-0.322561,-0.333333,-0.5,0.111111,-0.200000,-0.583815,33.907794,-0.687509,...,-0.135021,0.250,Wednesday,December,4,11,-1.0,0.0,user-1-49,False
86790,0,2024-12-16 08:38:35.387458,0.042648,-0.833333,-1.5,1.000000,-0.023529,-0.009909,-0.288283,-0.297169,...,-0.759494,-1.775,Monday,December,4,8,0.0,0.0,user-1-113,False
