In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Загрузка данных
brand_agg_statistic = pd.read_csv('../data/brand_agg_statistic.csv', delimiter=';')
brand_agg_statistic.head()

Unnamed: 0,brand_id,month,trxn_sum,trxn_count,clients_count
0,mim5r0,01.10.2023,3050501000.0,6818034,2418145
1,mim5r0,01.11.2023,10782900000.0,23058459,3723509
2,mim5r0,01.12.2023,12549280000.0,24084327,3899866
3,mim5r0,01.01.2024,11114060000.0,22466069,3869203
4,mim5r0,01.02.2024,11595810000.0,23629149,3964515


In [5]:
cb_accruals = pd.read_csv('../data/cb_accruals.csv').head(200000)
cb_accruals.head()

Unnamed: 0,id,offer_id,amount_trans,cb_percent,client_id,client_type,point_sale_name,month
0,67334953714,TrZM9PAS,1208.15,181.22,,1,,2024-02-01
1,67364101370,lnCWFYzKb,989.97,99.0,,2,"Казань, ул Гаврилова, зд 17",2024-02-01
2,67358688069,ArEWh,-66.04,-3.0,,1,Москва,2024-02-01
3,67340480432,kmAXIta,647.0,90.58,,1,,2024-02-01
4,67356565036,lnCWFYzKb,1334.92,199.0,,1,"Серпухов, ул Ворошилова, д 128",2024-02-01


In [6]:
transactions_1 = pd.read_csv('../data/transactions_1.csv').head(200000)
transactions_1.head()

Unnamed: 0,brand_id,date,client_id,trxn_count,trxn_sum
0,zii1gFsQg,2024-08-01,N7XeQca,1,294
1,lteX9M,2024-05-01,MYYew,1,2790
2,T0wR6s,2024-10-01,pK5iDrST,1,17080
3,ScP3gdA,2024-10-01,3huHE3fq,1,5800
4,USxhCR,2023-12-01,x240kg,1,17800


In [15]:
# Преобразование типов данных
transactions_1['trxn_sum'] = pd.to_numeric(transactions_1['trxn_sum'], errors='coerce')
transactions_1['trxn_count'] = pd.to_numeric(transactions_1['trxn_count'], errors='coerce')
cb_accruals['cb_percent'] = pd.to_numeric(cb_accruals['cb_percent'], errors='coerce')

In [16]:
# Объединение данных и создание признаков
data = transactions_1.merge(brand_agg_statistic, on='brand_id', how='left')

In [25]:
data = data.merge(cb_accruals, on='client_id', how='left')
data = data.fillna(0)

In [28]:
data.head()

Unnamed: 0,brand_id,date,client_id,trxn_count_x,trxn_sum_x,month_x,trxn_sum_y,trxn_count_y,clients_count,id_x,...,GMV,purchase_count,total_cb,id_y,offer_id_y,amount_trans_y,cb_percent_y,client_type_y,point_sale_name_y,month
0,zii1gFsQg,2024-08-01,N7XeQca,1,0.0,01.10.2023,1979305000.0,4411636,1773534,0.0,...,0.0,1,0.0,0.0,0,0.0,0.0,0.0,0,0
1,zii1gFsQg,2024-08-01,N7XeQca,1,0.0,01.11.2023,7077004000.0,15117075,2941240,0.0,...,0.0,1,0.0,0.0,0,0.0,0.0,0.0,0,0
2,zii1gFsQg,2024-08-01,N7XeQca,1,0.0,01.12.2023,8757913000.0,16391639,3122329,0.0,...,0.0,1,0.0,0.0,0,0.0,0.0,0.0,0,0
3,zii1gFsQg,2024-08-01,N7XeQca,1,0.0,01.01.2024,7770271000.0,15441359,3110779,0.0,...,0.0,1,0.0,0.0,0,0.0,0.0,0.0,0,0
4,zii1gFsQg,2024-08-01,N7XeQca,1,0.0,01.02.2024,8015332000.0,15969045,3154993,0.0,...,0.0,1,0.0,0.0,0,0.0,0.0,0.0,0,0


In [29]:
# Создание целевых переменных
data['GMV'] = data['trxn_sum_x']
data['purchase_count'] = data['trxn_count_x']
data['total_cb'] = data['cb_percent_y'] / 100 * data['trxn_sum_x']

In [30]:
# Разделение данных на обучающую и тестовую выборки
X = data[['brand_id', 'client_id', 'trxn_sum_x', 'trxn_count_x']]
y_GMV = data['GMV']
y_purchase_count = data['purchase_count']
y_total_cb = data['total_cb']

In [31]:
X_train, X_test, y_GMV_train, y_GMV_test = train_test_split(X, y_GMV, test_size=0.2, random_state=42)
_, _, y_purchase_count_train, y_purchase_count_test = train_test_split(X, y_purchase_count, test_size=0.2, random_state=42)
_, _, y_total_cb_train, y_total_cb_test = train_test_split(X, y_total_cb, test_size=0.2, random_state=42)

In [35]:
y_GMV_train

2439694    0.0
2498221    0.0
2153119    0.0
1294205    0.0
1647242    0.0
          ... 
110268     0.0
1692743    0.0
2356330    0.0
2229084    0.0
2219110    0.0
Name: GMV, Length: 2178233, dtype: float64

In [32]:
# Выбор модели
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [33]:
# Обучение модели
model.fit(X_train, y_GMV_train)

ValueError: could not convert string to float: 'jAQ8z'

In [None]:
# Предсказание
y_GMV_pred = model.predict(X_test)
y_purchase_count_pred = model.predict(X_test)
y_total_cb_pred = model.predict(X_test)

In [None]:
# Оценка модели
print('RMSE GMV:', np.sqrt(mean_squared_error(y_GMV_test, y_GMV_pred)))
print('RMSE Purchase Count:', np.sqrt(mean_squared_error(y_purchase_count_test, y_purchase_count_pred)))
print('RMSE Total Cashback:', np.sqrt(mean_squared_error(y_total_cb_test, y_total_cb_pred)))