# Предполагаемые шаги

1. просуммировать помесячно количество денег и попробовать предсказать на основе этого
2. Оставить деньги на ежедневной основе
3. Попробовать проработать идею с MCC-кодом: он классифицирует основные транзакции, а значит, может передавать некоторый паттерн поведения клиента

# Импорт необходимых библиотек 

In [4]:
!pip install xgboost -q
!pip install lightgbm -q

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import phik

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import (StratifiedKFold, cross_validate, cross_val_score, train_test_split, GridSearchCV)
from sklearn.metrics import make_scorer, f1_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Seed passed as random_state to the CatBoost models configured further down.
RANDOM_STATE = 413241

# Чтение файлов

In [8]:
# Load the raw transaction log (df) and the train table (train) from parquet.
# train is later joined on client_num; presumably it carries the target
# label -- confirm against the file schema.
df = pd.read_parquet("df_transaction.pa")
train = pd.read_parquet("train.pa")

In [9]:
# First iteration works on independent copies: only the client id, timestamp
# and amount columns of the transactions, plus a copy of the train table.
data_iter_1 = df[['client_num', 'date_time', 'amount']].copy()
train_1 = train.copy()

In [10]:
# Replace each timestamp with its month number; the column keeps the name
# 'date_time' even though it now holds integers.
data_iter_1['date_time'] = pd.to_datetime(data_iter_1['date_time']).dt.month

In [11]:
# Preview the first rows after the month conversion.
data_iter_1.head()

Unnamed: 0,client_num,date_time,amount
0,0,7,2900
1,0,7,455
2,0,7,1003
3,0,7,1480
4,0,7,88


In [12]:
# Monthly spend per client: one row per client_num, one column per month
# number. Naming values='amount' explicitly keeps the columns single-level,
# so the id column is labelled 'client_num' straight after reset_index() and
# no other numeric column can leak into the aggregation.
pivot_data_iter_1 = data_iter_1.pivot_table(
    index='client_num',
    columns='date_time',
    values='amount',
    aggfunc='sum',
).reset_index()

In [13]:
# Make sure the client id column is labelled 'client_num'; rename() is a
# no-op when no column carries the empty-string label.
pivot_data_iter_1 = pivot_data_iter_1.rename(columns={'' : 'client_num'})

In [14]:
# Inspect dtypes and non-null counts of the monthly columns.
pivot_data_iter_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109143 entries, 0 to 109142
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   client_num  109143 non-null  int64  
 1   7           95267 non-null   float64
 2   8           101294 non-null  float64
 3   9           102025 non-null  float64
 4   10          4 non-null       float64
dtypes: float64(4), int64(1)
memory usage: 4.2 MB


In [15]:
# Month 10 has only 4 non-null values in the info() output, so it is dropped.
pivot_data_iter_1 = pivot_data_iter_1.drop(columns=[10])

In [16]:
# Preview the monthly totals that remain (months 7, 8 and 9).
pivot_data_iter_1.head()

date_time,client_num,7,8,9
0,0,7261.0,56962.0,42712.0
1,1,422749.0,236673.0,204456.0
2,2,114647.0,80550.0,148911.0
3,3,1483913.0,68103.0,69809.0
4,4,91422.0,63710.0,44664.0


# Вторая итерация 
Проверяем следующую информацию по шагам. 
1. смотрим по датам
2. к датам добавляем месячную трату
3. каким-то образом пытаемся внедрить CVM (как именно — пока открытый вопрос)

In [18]:
# Transactions of the clients that are listed in the train table.
second_train = df[df['client_num'].isin(train['client_num'])].reset_index(drop = True)

In [19]:
# Transactions of every other client; these form the test subset.
second_test = df[~df['client_num'].isin(train['client_num'])].reset_index(drop = True)

In [20]:
# Split check: distinct clients in the two subsets must add up to the number
# of distinct clients in the full transaction log.
(
    second_train['client_num'].nunique(dropna=False)
    + second_test['client_num'].nunique(dropna=False)
) == df['client_num'].nunique(dropna=False)

True

In [21]:
# The daily-amount matrix built next only needs client id, timestamp and
# amount, so the merchant attributes are removed from both subsets.
second_train = second_train.drop(columns=['mcc_code', 'merchant_name'])
second_test = second_test.drop(columns=['mcc_code', 'merchant_name'])

In [22]:
# Upper bound on a single transaction amount. Rows at or above it are removed
# from the train subset only; second_test is left unfiltered.
amount_border = 2000000

second_train = second_train[second_train['amount'] < amount_border]

In [23]:
def preparation_matrix_second(df, start='2024-07-01', end='2024-09-30'):
    """Build a wide per-client matrix of daily transaction totals.

    Args:
        df: Transactions with columns ``client_num``, ``date_time``
            (datetime-like) and ``amount``. The frame is not modified.
        start: First calendar day of the output grid (inclusive).
        end: Last calendar day of the output grid (inclusive).

    Returns:
        DataFrame with one row per client found in ``df``: a ``client_num``
        column followed by one column per day in ``[start, end]`` (labelled
        with ``datetime.date`` objects) holding the summed amount of that
        day. Days without transactions are 0. Transactions outside the range
        are ignored, but their clients still get a row.
    """
    # Truncate timestamps to calendar days on a copy, so the caller's
    # 'date_time' column is left untouched.
    daily = df[['client_num', 'amount']].assign(
        date_time=pd.to_datetime(df['date_time']).dt.date
    )
    grouped = (
        daily.groupby(['client_num', 'date_time'])['amount']
        .sum()
        .reset_index()
    )

    # Full client x day grid: every client gets every day of the range even
    # when nothing was spent on it.
    client_nums = grouped['client_num'].unique()
    date_range = pd.date_range(start=start, end=end, freq='D')
    index = pd.MultiIndex.from_product(
        [client_nums, date_range], names=['client_num', 'date_time']
    )
    grid = pd.DataFrame(index=index).reset_index()
    # Same key type (datetime.date) on both sides of the merge.
    grid['date_time'] = grid['date_time'].dt.date
    merged = grid.merge(grouped, on=['client_num', 'date_time'], how='left')

    # The sum over an all-NaN cell is 0, which fills the days with no spending.
    wide = merged.pivot_table(
        columns='date_time', index='client_num', values='amount', aggfunc='sum'
    ).reset_index()

    return wide

In [24]:
# Per-client daily amount matrix for the train clients.
second_train_prepered = preparation_matrix_second(second_train)

In [25]:
# Per-client daily amount matrix for the test clients.
second_test_prepered = preparation_matrix_second(second_test)

In [26]:
# Append the monthly totals (columns 7, 8, 9) to the daily matrices.
# fillna(0) zeroes the months in which a client has no transactions.
# NOTE(review): pivot_data_iter_1 is built from the unfiltered df, so the
# monthly totals of train clients still include amounts >= amount_border.
second_train_prepered = second_train_prepered.merge(pivot_data_iter_1, on='client_num', how='left').fillna(0)
second_test_prepered = second_test_prepered.merge(pivot_data_iter_1, on='client_num', how='left').fillna(0)

Сделаем таблицу с поиском по MCC кодам

In [28]:
# Work on a copy so the month conversion applied next does not alter df.
mcc_code = df.copy()

In [29]:
# Replace each timestamp with its month number.
mcc_code['date_time'] = mcc_code['date_time'].dt.month

In [30]:
# Per client, count the non-null merchant_name values for every
# (month, MCC code) pair; combinations that never occur become 0.
# The columns are two-level (month, mcc_code). reset_index() moves client_num
# into a column and leaves a positional 0..N-1 row index.
mcc_table = mcc_code.pivot_table(columns=['date_time', 'mcc_code'], index='client_num', values='merchant_name', aggfunc='count').reset_index().fillna(0)

In [31]:
# Select the per-month sub-tables by the first column level (month number).
# Each slice holds only the MCC-code columns of that month; the client_num
# column is not part of these slices.
jule = mcc_table[7]
august = mcc_table[8]
september = mcc_table[9]

In [32]:
# Prefix every MCC column with its month so the three tables can sit side by
# side without name clashes (the concatenation assumes string column labels).
jule.columns = ['jule_' + col for col in jule.columns]
august.columns = ['august_' + col for col in august.columns]
september.columns = ['september_' + col for col in september.columns]

# reset_index() turns the row index into a column named 'index'.
# NOTE(review): that index is the row position inherited from mcc_table,
# not the client id.
jule = jule.reset_index()
august = august.reset_index()
september = september.reset_index()

In [33]:
# Client ids in row order, taken before mcc_table is overwritten. ravel()
# covers both a Series and a one-column frame coming back from the lookup.
client_ids = mcc_table['client_num'].to_numpy().ravel()

# Join the three monthly tables on the shared row-position key.
mcc_table = jule.merge(august, on='index', how='left')
mcc_table = mcc_table.merge(september, on='index', how='left')

# 'index' holds row positions, which equal the client id only when ids are
# exactly 0..N-1. Map the positions back to the real ids before renaming.
mcc_table['index'] = client_ids[mcc_table['index'].to_numpy()]
mcc_table = mcc_table.rename(columns={'index': 'client_num'})

In [34]:
# Attach the per-month MCC counts to the daily/monthly amount features.
second_train_prepered_mcc = second_train_prepered.merge(mcc_table, on='client_num', how='left')
second_test_prepered_mcc = second_test_prepered.merge(mcc_table, on='client_num', how='left')

Подготовили таблицы, объединяем их и после этого смотрим на метрику на кросс-валидации

In [36]:
# Join the train table onto the feature matrix by client id; the 'target'
# column used below is expected to come from train.
second_XY_train_prepered = second_train_prepered_mcc.merge(train, on ='client_num', how='left')

In [37]:
# Split the joined table into the label vector and the feature matrix
# (features = everything except the client id and the label).
y_train_2 = second_XY_train_prepered['target']
X_train_2 = second_XY_train_prepered.drop(columns=['client_num', 'target'])

In [38]:
# Test features: the same table without the client id.
X_test_2 = second_test_prepered_mcc.drop(columns=['client_num'])

In [39]:
# Column labels are a mix of date objects, month integers and prefixed MCC
# strings; cast them all to str so every feature name has the same type.
X_train_2.columns = X_train_2.columns.astype(str)
X_test_2.columns = X_test_2.columns.astype(str)

In [40]:
# Standardise the features: the scaler is fitted on the train matrix only
# and the same transformation is then applied to train and test.
scaler = StandardScaler().fit(X_train_2)
X_train_scaled = scaler.transform(X_train_2)
X_test_scaled = scaler.transform(X_test_2)

# Wrap the scaled arrays back into DataFrames, keeping the original column
# labels and row index.
X_train_2 = pd.DataFrame(
    data=X_train_scaled,
    index=X_train_2.index,
    columns=X_train_2.columns,
)
X_test_2 = pd.DataFrame(
    data=X_test_scaled,
    index=X_test_2.index,
    columns=X_test_2.columns,
)

In [41]:
# Display the scaled train feature matrix.
X_train_2

Unnamed: 0,2024-07-01,2024-07-02,2024-07-03,2024-07-04,2024-07-05,2024-07-06,2024-07-07,2024-07-08,2024-07-09,2024-07-10,...,september_8734,september_8911,september_8931,september_8999,september_9222,september_9311,september_9390,september_9399,september_9402,september_9406
0,0.356359,3.115925,-0.047016,0.018094,0.347422,-0.100429,-0.091886,-0.045316,1.528993,-0.020553,...,-0.008452,-0.010102,-0.005415,-0.112449,1.844571,-0.129656,-0.091038,-0.05679,-0.058675,-0.018844
1,-0.036699,-0.045660,0.064789,-0.039551,-0.054191,0.113408,0.016169,0.085083,-0.039546,0.027655,...,-0.008452,-0.010102,-0.005415,-0.112449,-0.045469,-0.129656,-0.091038,-0.05679,-0.058675,-0.018844
2,-0.022344,-0.076703,-0.097328,0.078691,29.093719,-0.014843,0.076782,-0.052948,-0.061843,-0.067405,...,-0.008452,-0.010102,-0.005415,-0.112449,7.514691,4.076061,-0.091038,-0.05679,-0.058675,-0.018844
3,-0.085369,-0.064401,0.008857,-0.072179,1.160383,-0.100429,-0.091886,-0.046129,-0.054771,-0.049589,...,-0.008452,-0.010102,-0.005415,-0.112449,-0.045469,-0.129656,-0.091038,-0.05679,-0.058675,-0.018844
4,-0.032581,-0.067802,-0.094652,-0.080196,-0.043312,-0.062771,-0.091886,-0.056870,-0.066345,-0.059472,...,-0.008452,-0.010102,-0.005415,3.196254,-0.045469,-0.129656,-0.091038,-0.05679,-0.058675,-0.018844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,-0.085369,-0.082733,-0.097328,0.068715,-0.078282,-0.100429,-0.091886,-0.057401,-0.067619,-0.067405,...,-0.008452,-0.010102,-0.005415,-0.112449,-0.045469,-0.129656,-0.091038,-0.05679,-0.058675,-0.018844
69996,-0.085369,-0.082733,-0.097328,-0.080959,-0.081827,-0.100429,-0.091886,-0.057401,-0.067619,-0.067405,...,-0.008452,-0.010102,-0.005415,-0.112449,-0.045469,-0.129656,-0.091038,-0.05679,-0.058675,-0.018844
69997,-0.085369,-0.082733,-0.097328,-0.080959,-0.081827,-0.100429,-0.091886,-0.057401,-0.067619,-0.067405,...,-0.008452,-0.010102,-0.005415,-0.112449,-0.045469,-0.129656,-0.091038,-0.05679,-0.058675,-0.018844
69998,-0.085369,-0.082733,-0.097328,-0.080959,-0.081827,-0.100429,-0.091886,0.136935,-0.067619,-0.067405,...,-0.008452,-0.010102,-0.005415,-0.112449,-0.045469,-0.129656,-0.091038,-0.05679,-0.058675,-0.018844


In [42]:
# Single-step pipeline. The 'model' step set here is a placeholder: the grid
# below supplies the estimator that is actually fitted.
pipeline = Pipeline(
    [
        #('imputer', SimpleImputer(strategy='mean')),
        ('model', CatBoostClassifier(random_state = RANDOM_STATE))
    ])

# Parameter grid with one candidate: CatBoost with 4000 iterations.
# The commented-out entries are hyper-parameter ranges that are currently
# not searched.
params = [
    { 
       'model': [CatBoostClassifier(random_state=RANDOM_STATE, iterations = 4000)]#, 
        #'model__iterations': [100, 500, 1000, 2000, 3000],
        # #'model__depth': [6, 8, 10],
        # 'model__learning_rate': [0.01, 0.05, 0.1],
        # #'model__l2_leaf_reg': [1, 3, 5, 7],
        #'model__loss_function': ['MultiClass']
    }
]

In [None]:
# 3-fold cross-validated search over `params`, scored with weighted F1 and
# run on all available cores (n_jobs=-1).
GridSearch = GridSearchCV (estimator = pipeline,
                         param_grid = params,
                         cv = 3,
                         scoring = 'f1_weighted',
                         n_jobs = -1)

GridSearch.fit(X_train_2, y_train_2)

In [None]:
# Collect the cross-validation results into a table, one row per candidate.
results = pd.DataFrame(GridSearch.cv_results_)

In [None]:
# Keep only the candidate description, its score statistics and its timings,
# then display the table.
results = results.loc[
    :,
    [
        'param_model',
        'mean_test_score',
        'std_test_score',
        'mean_fit_time',
        'mean_score_time',
        'params',
    ],
]
results

In [None]:
# Report the winning pipeline and its score. best_score_ is the mean
# cross-validated value of the search metric, which is weighted F1
# (scoring='f1_weighted'), not ROC-AUC on the train set.
print(f'Лучшая модель: {GridSearch.best_estimator_}')
print(f'Метрика F1 (weighted) на кросс-валидации: {GridSearch.best_score_}')

In [None]:
# Best pipeline found by the search; used for the test predictions.
model_2 = GridSearch.best_estimator_

In [None]:
# model_2 = CatBoostClassifier(border_count = None)

# # Оценка модели с использованием кросс-валидации и метрики F1
# f1_scores = cross_val_score(model_2, X_train_2, y_train_2, cv=5, scoring='f1_weighted')

# # Выводим метрики F1 для каждой итерации кросс-валидации
# print(f"F1 Scores for each fold: {f1_scores}")
# print(f"Mean F1 Score: {np.mean(f1_scores)}")

In [None]:
# model_2.fit(X_train_2, y_train_2)

In [None]:
# f1_weighted = f1_score(y_train_2, model_2.predict(X_train_2), average='weighted')
# print(f1_weighted)
# # Прогнозирование на тестовом наборе данных
# y_pred_2 = model_2.predict(X_test_2)

In [None]:
# Predict labels for the test clients, flatten them into a 1-D 'predictions'
# series and place it next to the test feature table. The concat aligns on
# the row index.
y_pred_2 = pd.concat(
    [
        second_test_prepered,
        pd.Series(model_2.predict(X_test_2).flatten(), name='predictions'),
    ],
    axis=1,
)

In [None]:
# Keep only the client id and its predicted label.
y_pred_2 = y_pred_2[['client_num', 'predictions']]

In [None]:
# Rename the prediction column to 'target' for the output file.
y_pred_2 = y_pred_2.rename(columns = {'predictions' : 'target'})

In [None]:
# Write the predictions to CSV without the row index.
y_pred_2.to_csv("3_full_table_everyday_plus_month_cmm.csv", index=False, encoding='utf-8')

In [None]:
         RandomForestClassifier(random_state=413241)	0.254422	0.021893	127.396442	10.457033	{'model': RandomForestClassifier(random_state=...
1	<catboost.core.CatBoostClassifier object at 0x...	0.288678	0.021375	490.235836	2.073135	{'model': <catboost.core.CatBoostClassifier ob...
2	XGBClassifier(base_score=None, booster=None, c...	0.278350	0.017236	254.088340	11.657960	{'model': XGBClassifier(base_score=None, boost...
3	LGBMClassifier(objective='multiclass', random_...	0.280033	0.019517	448.554455	3.825516	{'model': LGBMClassifier(objective='multiclass...


# Выводы

1. По первичному анализу можно сделать вывод, что люди с категорией 6 тратят больше, чем люди с категорией 0.
2. Проведены анализ и предсказание по трём месяцам (сумма за каждый месяц): на линейной регрессии получили значение метрики > 0.18, что соответствует метрике 1.06 на тестовой выборке.
3. Применена более сложная модель — CatBoost: значение метрики на кросс-валидации = 0.24, в системе метрика улучшилась до 0.91.
4. CatBoost без обрезания данных дал качество лучше на 0.5%.

In [None]:
[CatBoostClassifier(random_state=RANDOM_STATE, iterations = 3000)] с ограничением по переводам - 0.30335804501238783

## 