In [1]:
!pip install catboost



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
import seaborn as sns


In [3]:
from sklearn.preprocessing import StandardScaler
import ast
import json

In [4]:
geo_info = pd.read_csv('geo_info.csv', delimiter = ';')
referer_vectors = pd.read_csv('referer_vectors.csv', delimiter = ';')
test_users = pd.read_csv('test_users.csv', delimiter = ';')
test = pd.read_csv('test.csv', delimiter = ';')
train = pd.read_csv('train.csv', delimiter = ';')
train_labels = pd.read_csv('train_labels.csv', delimiter = ';')

In [5]:
train.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,user_agent
0,1701011363,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799,"{'browser': 'Chrome Mobile', 'browser_version'..."
1,1700986581,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257,"{'browser': 'Chrome Mobile', 'browser_version'..."
2,1701011071,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150,"{'browser': 'Yandex Browser', 'browser_version..."
3,1700992803,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740,"{'browser': 'Chrome Mobile', 'browser_version'..."
4,1701021666,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863,"{'browser': 'Yandex Browser', 'browser_version..."


In [6]:
train.isnull().sum()

Unnamed: 0,0
request_ts,0
user_id,0
referer,0
geo_id,0
user_agent,1


In [7]:
#Удалим пропущенные строки, так как они влияют на обучение модели

In [8]:
train = train.dropna(subset = ['user_agent'])

In [9]:
train['user_agent'].nunique()

8432

In [10]:
#Преобразуем колонку user_agent

In [11]:
train['user_agent'] = train['user_agent'].apply(ast.literal_eval)
df_exp = pd.json_normalize(train['user_agent'])
train = pd.concat([train, df_exp], axis = 1).drop(columns = ['user_agent'], axis = 1)

In [12]:
train.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,browser,browser_version,os,os_version
0,1701011000.0,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799.0,Chrome Mobile,119.0.0,Android,10
1,1700987000.0,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257.0,Chrome Mobile,111.0.0,Android,10
2,1701011000.0,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150.0,Yandex Browser,20.12.5,Android,11
3,1700993000.0,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740.0,Chrome Mobile,119.0.0,Android,10
4,1701022000.0,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863.0,Yandex Browser,18.11.1,Android,4.4.4


In [13]:
train['request_ts'].nunique()

85501

In [14]:
#Удалим колонку request_ts из за большого количества различных данных

In [15]:
train = train.drop(['request_ts'], axis = 1)

In [16]:
train.head()

Unnamed: 0,user_id,referer,geo_id,browser,browser_version,os,os_version
0,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799.0,Chrome Mobile,119.0.0,Android,10
1,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257.0,Chrome Mobile,111.0.0,Android,10
2,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150.0,Yandex Browser,20.12.5,Android,11
3,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740.0,Chrome Mobile,119.0.0,Android,10
4,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863.0,Yandex Browser,18.11.1,Android,4.4.4


In [17]:
train.isnull().sum()

Unnamed: 0,0
user_id,1
referer,1
geo_id,1
browser,1
browser_version,1
os,1
os_version,1


In [18]:
#Удалим пропущенные значения

In [19]:
train = train.dropna(subset = ['user_id', 'referer', 'geo_id', 'os'])

In [20]:
train.isnull().sum()

Unnamed: 0,0
user_id,0
referer,0
geo_id,0
browser,0
browser_version,0
os,0
os_version,0


In [21]:
#Повторим все то же самое для test df

In [22]:
test.isnull().sum()

Unnamed: 0,0
request_ts,0
user_id,0
referer,0
geo_id,0
user_agent,0


In [23]:
test['user_agent'] = test['user_agent'].apply(ast.literal_eval)
df_exp_test = pd.json_normalize(test['user_agent'])
test = pd.concat([test, df_exp_test], axis = 1).drop(columns = ['user_agent'], axis = 1)

In [24]:
test.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,browser,browser_version,os,os_version
0,1700993094,c2802dadd33d8ae09bb366bdd41212ea,https://9b48ee5/,8816,Chrome Mobile,96.0.4664,Android,12
1,1701005579,e5b1988db74527ec092f28b0bbfdaac9,https://9b48ee5/,3663,Chrome,116.0.5845,Android,10
2,1700969752,6ef1eedbdb72554e53e69782066065c5,https://72879b4/12411b9e,2336,Chrome,114.0.0,Android,10
3,1700991608,7e057293ecae62985a327b7af51858ea,https://9b48ee5/,9652,Chrome Mobile,91.0.4472,Android,11
4,1701019815,a27bd7ce8828497823fa8d5d05e7bbf7,https://9b48ee5/,3871,Chrome Mobile,119.0.0,Android,10


In [25]:
test.isnull().sum()

Unnamed: 0,0
request_ts,0
user_id,0
referer,0
geo_id,0
browser,0
browser_version,0
os,0
os_version,0


In [26]:
print('region_id', geo_info['region_id'].nunique())
print('country_id', geo_info['country_id'].nunique())
print('timezone_id', geo_info['timezone_id'].nunique())
print('geo_id', geo_info['geo_id'].nunique())

region_id 277
country_id 203
timezone_id 314
geo_id 5533


In [27]:
geo_info.isnull().sum()

Unnamed: 0,0
geo_id,0
country_id,0
region_id,1707
timezone_id,0


In [28]:
geo_info['region_id'] = geo_info['region_id'].fillna('Unknown')

In [29]:
train_with_geo = pd.merge(train, geo_info, on = 'geo_id', how = 'inner')
train_with_geo.isnull().sum()

Unnamed: 0,0
user_id,0
referer,0
geo_id,0
browser,0
browser_version,0
os,0
os_version,0
country_id,0
region_id,0
timezone_id,0


In [30]:
test_with_geo = pd.merge(test, geo_info, on = 'geo_id', how = 'inner')
test_with_geo.isnull().sum()

Unnamed: 0,0
request_ts,0
user_id,0
referer,0
geo_id,0
browser,0
browser_version,0
os,0
os_version,0
country_id,0
region_id,0


In [31]:
train_with_geo.head()

Unnamed: 0,user_id,referer,geo_id,browser,browser_version,os,os_version,country_id,region_id,timezone_id
0,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799.0,Chrome Mobile,119.0.0,Android,10,c31b4e,470e75,f6155e
1,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257.0,Chrome Mobile,111.0.0,Android,10,c31b4e,44520b,e56e80
2,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150.0,Yandex Browser,20.12.5,Android,11,c31b4e,616bb9,af47f1
3,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740.0,Chrome Mobile,119.0.0,Android,10,c31b4e,3c9dca,e56e80
4,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863.0,Yandex Browser,18.11.1,Android,4.4.4,c31b4e,776e76,10b7947


In [32]:
train_geo_labels = pd.merge(train_with_geo, train_labels, on = 'user_id', how = 'inner')

In [33]:
train_geo_labels.isnull().sum()

Unnamed: 0,0
user_id,0
referer,0
geo_id,0
browser,0
browser_version,0
os,0
os_version,0
country_id,0
region_id,0
timezone_id,0


In [34]:
train_geo_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593442 entries, 0 to 593441
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   user_id          593442 non-null  object 
 1   referer          593442 non-null  object 
 2   geo_id           593442 non-null  float64
 3   browser          593442 non-null  object 
 4   browser_version  593442 non-null  object 
 5   os               593442 non-null  object 
 6   os_version       593442 non-null  object 
 7   country_id       593442 non-null  object 
 8   region_id        593442 non-null  object 
 9   timezone_id      593442 non-null  object 
 10  target           593442 non-null  int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 49.8+ MB


In [35]:
referer_vectors.head()

Unnamed: 0,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,referer
0,16708,-3741,11395,-1597,-3212,6269,5610,-15351,13779,14102,https://a6899a4/15652e67
1,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,https://9b48ee5/
2,10551,2947,12282,-470,16222,4472,-3316,9606,4197,18948,https://7a4c700/161af7e3
3,12816,20498,-10110,7731,-569,12035,3014,6398,11439,-271,https://9653126/159bc361
4,3710,11096,11333,14673,8030,1852,10554,11625,4306,13210,https://72879b4/125c29e6


In [36]:
referer_vectors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   component0  200000 non-null  int64 
 1   component1  200000 non-null  int64 
 2   component2  200000 non-null  int64 
 3   component3  200000 non-null  int64 
 4   component4  200000 non-null  int64 
 5   component5  200000 non-null  int64 
 6   component6  200000 non-null  int64 
 7   component7  200000 non-null  int64 
 8   component8  200000 non-null  int64 
 9   component9  200000 non-null  int64 
 10  referer     200000 non-null  object
dtypes: int64(10), object(1)
memory usage: 16.8+ MB


In [37]:
full_train = pd.merge(train_geo_labels, referer_vectors, on = 'referer', how = "inner")

In [38]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601289 entries, 0 to 601288
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   user_id          601289 non-null  object 
 1   referer          601289 non-null  object 
 2   geo_id           601289 non-null  float64
 3   browser          601289 non-null  object 
 4   browser_version  601289 non-null  object 
 5   os               601289 non-null  object 
 6   os_version       601289 non-null  object 
 7   country_id       601289 non-null  object 
 8   region_id        601289 non-null  object 
 9   timezone_id      601289 non-null  object 
 10  target           601289 non-null  int64  
 11  component0       601289 non-null  int64  
 12  component1       601289 non-null  int64  
 13  component2       601289 non-null  int64  
 14  component3       601289 non-null  int64  
 15  component4       601289 non-null  int64  
 16  component5       601289 non-null  int6

In [39]:
full_train.isnull().sum()

Unnamed: 0,0
user_id,0
referer,0
geo_id,0
browser,0
browser_version,0
os,0
os_version,0
country_id,0
region_id,0
timezone_id,0


In [40]:
full_train.head()

Unnamed: 0,user_id,referer,geo_id,browser,browser_version,os,os_version,country_id,region_id,timezone_id,...,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9
0,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799.0,Chrome Mobile,119.0.0,Android,10,c31b4e,470e75,f6155e,...,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
1,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257.0,Chrome Mobile,111.0.0,Android,10,c31b4e,44520b,e56e80,...,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
2,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150.0,Yandex Browser,20.12.5,Android,11,c31b4e,616bb9,af47f1,...,12498,2451,10304,-6380,11608,3106,-2188,10573,3347,21870
3,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740.0,Chrome Mobile,119.0.0,Android,10,c31b4e,3c9dca,e56e80,...,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
4,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863.0,Yandex Browser,18.11.1,Android,4.4.4,c31b4e,776e76,10b7947,...,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817


In [41]:
for i in  full_train.columns:
    print(i, full_train[i].nunique())

user_id 499999
referer 135012
geo_id 2828
browser 59
browser_version 1437
os 13
os_version 230
country_id 151
region_id 259
timezone_id 218
target 2
component0 27196
component1 31467
component2 26412
component3 30622
component4 30426
component5 26702
component6 27506
component7 28597
component8 28248
component9 36867


In [42]:
#Удалим те колонки, которые не получиться преобразовать в dummy переменные

In [43]:
full_train = full_train.drop(['referer', 'geo_id'], axis = 1)

In [44]:
full_train = full_train.drop(['user_id'], axis = 1)

In [45]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601289 entries, 0 to 601288
Data columns (total 18 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   browser          601289 non-null  object
 1   browser_version  601289 non-null  object
 2   os               601289 non-null  object
 3   os_version       601289 non-null  object
 4   country_id       601289 non-null  object
 5   region_id        601289 non-null  object
 6   timezone_id      601289 non-null  object
 7   target           601289 non-null  int64 
 8   component0       601289 non-null  int64 
 9   component1       601289 non-null  int64 
 10  component2       601289 non-null  int64 
 11  component3       601289 non-null  int64 
 12  component4       601289 non-null  int64 
 13  component5       601289 non-null  int64 
 14  component6       601289 non-null  int64 
 15  component7       601289 non-null  int64 
 16  component8       601289 non-null  int64 
 17  component9

In [46]:
full_train.head()

Unnamed: 0,browser,browser_version,os,os_version,country_id,region_id,timezone_id,target,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9
0,Chrome Mobile,119.0.0,Android,10,c31b4e,470e75,f6155e,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
1,Chrome Mobile,111.0.0,Android,10,c31b4e,44520b,e56e80,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
2,Yandex Browser,20.12.5,Android,11,c31b4e,616bb9,af47f1,0,12498,2451,10304,-6380,11608,3106,-2188,10573,3347,21870
3,Chrome Mobile,119.0.0,Android,10,c31b4e,3c9dca,e56e80,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
4,Yandex Browser,18.11.1,Android,4.4.4,c31b4e,776e76,10b7947,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817


In [47]:
#Попробуем также удалить колонки browser_version, os_version в связи с неспособностью нормально их представить в связи с большим количеством различных значений

In [48]:
full_train = full_train.drop(['browser_version', 'os_version'], axis = 1)

In [49]:
full_train.head()

Unnamed: 0,browser,os,country_id,region_id,timezone_id,target,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9
0,Chrome Mobile,Android,c31b4e,470e75,f6155e,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
1,Chrome Mobile,Android,c31b4e,44520b,e56e80,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
2,Yandex Browser,Android,c31b4e,616bb9,af47f1,0,12498,2451,10304,-6380,11608,3106,-2188,10573,3347,21870
3,Chrome Mobile,Android,c31b4e,3c9dca,e56e80,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
4,Yandex Browser,Android,c31b4e,776e76,10b7947,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817


In [50]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601289 entries, 0 to 601288
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   browser      601289 non-null  object
 1   os           601289 non-null  object
 2   country_id   601289 non-null  object
 3   region_id    601289 non-null  object
 4   timezone_id  601289 non-null  object
 5   target       601289 non-null  int64 
 6   component0   601289 non-null  int64 
 7   component1   601289 non-null  int64 
 8   component2   601289 non-null  int64 
 9   component3   601289 non-null  int64 
 10  component4   601289 non-null  int64 
 11  component5   601289 non-null  int64 
 12  component6   601289 non-null  int64 
 13  component7   601289 non-null  int64 
 14  component8   601289 non-null  int64 
 15  component9   601289 non-null  int64 
dtypes: int64(11), object(5)
memory usage: 73.4+ MB


In [51]:
full_train['target'] = full_train['target'].astype('category')

In [52]:
full_train.isnull().sum()

Unnamed: 0,0
browser,0
os,0
country_id,0
region_id,0
timezone_id,0
target,0
component0,0
component1,0
component2,0
component3,0


In [53]:
#Также попробуем создать новые датафреймы с различными данными

In [54]:
from sklearn.metrics import accuracy_score

In [55]:
#full_train_country = full_train.drop(['region_id', 'timezone_id'], axis = 1)
#full_train_region = full_train.drop(['country_id', 'timezone_id'], axis = 1)
#full_train_timezone = full_train.drop(['region_id', 'country_id'], axis = 1)

In [56]:
full_train.head()

Unnamed: 0,browser,os,country_id,region_id,timezone_id,target,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9
0,Chrome Mobile,Android,c31b4e,470e75,f6155e,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
1,Chrome Mobile,Android,c31b4e,44520b,e56e80,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
2,Yandex Browser,Android,c31b4e,616bb9,af47f1,0,12498,2451,10304,-6380,11608,3106,-2188,10573,3347,21870
3,Chrome Mobile,Android,c31b4e,3c9dca,e56e80,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
4,Yandex Browser,Android,c31b4e,776e76,10b7947,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817


In [57]:
from sklearn.preprocessing import LabelEncoder

# Определение колонок для кодирования
columns_to_encode = ['browser', 'os', 'country_id', 'region_id', 'timezone_id']

# Инициализация LabelEncoder
label_encoders = {}

# Кодирование каждой колонки
for column in columns_to_encode:
    le = LabelEncoder()
    full_train[column] = le.fit_transform(full_train[column])
    label_encoders[column] = le  # Сохраняем encoder для дальнейшего использования, если это необходимо

   browser  os  country_id  region_id  timezone_id
0        7   0         103        116          205
1        7   0         103        111          179
2       56   0         103        160           96
3        7   0         103         99          179
4       56   0         103        200           22


In [58]:
full_train.head()

Unnamed: 0,browser,os,country_id,region_id,timezone_id,target,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9
0,7,0,103,116,205,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
1,7,0,103,111,179,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
2,56,0,103,160,96,0,12498,2451,10304,-6380,11608,3106,-2188,10573,3347,21870
3,7,0,103,99,179,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
4,56,0,103,200,22,0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817


In [63]:
# Разделение данных на признаки и целевую переменную
X = full_train.drop(['target'], axis=1)
y = full_train['target']

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Инициализация модели CatBoostClassifier с фиксированными гиперпараметрами
model = CatBoostClassifier(
    learning_rate=0.05,  # Установите желаемое значение
    iterations=200,      # Установите желаемое значение
    depth=8,             # Установите желаемое значение
    l2_leaf_reg=3,       # Установите желаемое значение
    eval_metric='Accuracy',
    verbose=0,
    random_state=42
)

# Обучение модели на обучающей выборке
model.fit(scaled_X_train, y_train)

# Предсказание на тестовой выборке
y_preds = model.predict(scaled_X_test)

# Оценка точности модели
accuracy = accuracy_score(y_test, y_preds)
print("Точность модели:", accuracy)

# Вывод отчета о классификации
print(classification_report(y_test, y_preds))

Точность модели: 0.7648197547946755
              precision    recall  f1-score   support

           0       0.77      0.79      0.78     78305
           1       0.76      0.74      0.75     72018

    accuracy                           0.76    150323
   macro avg       0.76      0.76      0.76    150323
weighted avg       0.76      0.76      0.76    150323



In [65]:
#Попробуем удалить колонки region_id и timezone_id
full_train_country = full_train.drop(['region_id', 'timezone_id'], axis = 1)

In [64]:
# Разделение данных на признаки и целевую переменную
X = full_train_country.drop(['target'], axis=1)
y = full_train_country['target']

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Инициализация модели CatBoostClassifier с фиксированными гиперпараметрами
model = CatBoostClassifier(
    learning_rate=0.05,  # Установите желаемое значение
    iterations=200,      # Установите желаемое значение
    depth=8,             # Установите желаемое значение
    l2_leaf_reg=3,       # Установите желаемое значение
    eval_metric='Accuracy',
    verbose=0,
    random_state=42
)

# Обучение модели на обучающей выборке
model.fit(scaled_X_train, y_train)

# Предсказание на тестовой выборке
y_preds = model.predict(scaled_X_test)

# Оценка точности модели
accuracy = accuracy_score(y_test, y_preds)
print("Точность модели:", accuracy)

# Вывод отчета о классификации
print(classification_report(y_test, y_preds))

Точность модели: 0.7635757668487191
              precision    recall  f1-score   support

           0       0.77      0.79      0.78     78305
           1       0.76      0.74      0.75     72018

    accuracy                           0.76    150323
   macro avg       0.76      0.76      0.76    150323
weighted avg       0.76      0.76      0.76    150323



In [67]:
from sklearn.ensemble import RandomForestClassifier


# Разделение данных на признаки и целевую переменную
X = full_train_country.drop(['target'], axis=1)
y = full_train_country['target']

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Масштабирование данных (не обязательно для Random Forest, но можно оставить)
scaler = StandardScaler()
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Инициализация модели RandomForestClassifier с фиксированными гиперпараметрами
model = RandomForestClassifier(
    n_estimators=100,  # Количество деревьев в лесу
    max_depth=None,     # Максимальная глубина дерева (None означает, что узлы будут расширяться до тех пор, пока все листья не будут чистыми или пока все листья не будут содержать меньше минимального числа образцов)
    random_state=42,
    n_jobs=-1           # Использовать все доступные ядра процессора
)

# Обучение модели на обучающей выборке
model.fit(scaled_X_train, y_train)

# Предсказание на тестовой выборке
y_preds = model.predict(scaled_X_test)

# Оценка точности модели
accuracy = accuracy_score(y_test, y_preds)
print("Точность модели:", accuracy)

# Вывод отчета о классификации
print(classification_report(y_test, y_preds))

Точность модели: 0.7784504034645396
              precision    recall  f1-score   support

           0       0.79      0.79      0.79     78305
           1       0.77      0.77      0.77     72018

    accuracy                           0.78    150323
   macro avg       0.78      0.78      0.78    150323
weighted avg       0.78      0.78      0.78    150323



In [68]:
test.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,browser,browser_version,os,os_version
0,1700993094,c2802dadd33d8ae09bb366bdd41212ea,https://9b48ee5/,8816,Chrome Mobile,96.0.4664,Android,12
1,1701005579,e5b1988db74527ec092f28b0bbfdaac9,https://9b48ee5/,3663,Chrome,116.0.5845,Android,10
2,1700969752,6ef1eedbdb72554e53e69782066065c5,https://72879b4/12411b9e,2336,Chrome,114.0.0,Android,10
3,1700991608,7e057293ecae62985a327b7af51858ea,https://9b48ee5/,9652,Chrome Mobile,91.0.4472,Android,11
4,1701019815,a27bd7ce8828497823fa8d5d05e7bbf7,https://9b48ee5/,3871,Chrome Mobile,119.0.0,Android,10


In [69]:
test_with_geo = pd.merge(test, geo_info, on = 'geo_id', how = 'inner')

In [70]:
test_with_geo.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,browser,browser_version,os,os_version,country_id,region_id,timezone_id
0,1700993094,c2802dadd33d8ae09bb366bdd41212ea,https://9b48ee5/,8816,Chrome Mobile,96.0.4664,Android,12,c31b4e,36e3f3,f6155e
1,1701005579,e5b1988db74527ec092f28b0bbfdaac9,https://9b48ee5/,3663,Chrome,116.0.5845,Android,10,c31b4e,8ccc01,e56e80
2,1700969752,6ef1eedbdb72554e53e69782066065c5,https://72879b4/12411b9e,2336,Chrome,114.0.0,Android,10,c31b4e,1fbfa5,e56e80
3,1700991608,7e057293ecae62985a327b7af51858ea,https://9b48ee5/,9652,Chrome Mobile,91.0.4472,Android,11,c31b4e,f66ff,f6155e
4,1701019815,a27bd7ce8828497823fa8d5d05e7bbf7,https://9b48ee5/,3871,Chrome Mobile,119.0.0,Android,10,c31b4e,245864,e56e80


In [71]:
full_test = pd.merge(test_with_geo, referer_vectors, on = 'referer', how = "inner")

In [72]:
full_test.isnull().sum()

Unnamed: 0,0
request_ts,0
user_id,0
referer,0
geo_id,0
browser,0
browser_version,0
os,0
os_version,0
country_id,0
region_id,0


In [73]:
full_test.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,browser,browser_version,os,os_version,country_id,region_id,...,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9
0,1700993094,c2802dadd33d8ae09bb366bdd41212ea,https://9b48ee5/,8816,Chrome Mobile,96.0.4664,Android,12,c31b4e,36e3f3,...,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
1,1701005579,e5b1988db74527ec092f28b0bbfdaac9,https://9b48ee5/,3663,Chrome,116.0.5845,Android,10,c31b4e,8ccc01,...,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
2,1700969752,6ef1eedbdb72554e53e69782066065c5,https://72879b4/12411b9e,2336,Chrome,114.0.0,Android,10,c31b4e,1fbfa5,...,-7307,11682,9741,13564,13577,1200,10169,16461,-3932,3340
3,1700991608,7e057293ecae62985a327b7af51858ea,https://9b48ee5/,9652,Chrome Mobile,91.0.4472,Android,11,c31b4e,f66ff,...,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
4,1701019815,a27bd7ce8828497823fa8d5d05e7bbf7,https://9b48ee5/,3871,Chrome Mobile,119.0.0,Android,10,c31b4e,245864,...,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817


In [74]:
#Удалим ненужные колонки

In [75]:
users = full_test['user_id']

In [76]:
full_test = full_test.drop(['user_id', 'request_ts', 'referer', 'browser_version', 'os_version', 'region_id', 'timezone_id'], axis = 1)

In [77]:
full_test = full_test.drop(['geo_id'], axis = 1)

In [78]:
full_test.head()

Unnamed: 0,browser,os,country_id,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9
0,Chrome Mobile,Android,c31b4e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
1,Chrome,Android,c31b4e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
2,Chrome,Android,c31b4e,-7307,11682,9741,13564,13577,1200,10169,16461,-3932,3340
3,Chrome Mobile,Android,c31b4e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817
4,Chrome Mobile,Android,c31b4e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817


In [80]:
# Определение колонок для кодирования
columns_to_encode = ['browser', 'os', 'country_id',]

# Инициализация LabelEncoder
label_encoders = {}

# Кодирование каждой колонки
for column in columns_to_encode:
    le = LabelEncoder()
    full_test[column] = le.fit_transform(full_test[column])

In [81]:
X = full_test

In [82]:
scaled_X_full_test = scaler.fit_transform(X)

In [83]:
y_preds = model.predict(scaled_X_test)

In [84]:
finished_df = pd.concat([users, pd.DataFrame(y_preds)], axis = 1)

In [85]:
finished_df.head()

Unnamed: 0,user_id,0
0,c2802dadd33d8ae09bb366bdd41212ea,1.0
1,e5b1988db74527ec092f28b0bbfdaac9,0.0
2,6ef1eedbdb72554e53e69782066065c5,0.0
3,7e057293ecae62985a327b7af51858ea,0.0
4,a27bd7ce8828497823fa8d5d05e7bbf7,1.0


In [86]:
finished_df_unique = finished_df[['user_id', 0]].drop_duplicates(subset = ['user_id'])

In [87]:
answer = test_users.merge(finished_df_unique, on = 'user_id', how = "left")

In [88]:
answer.isnull().sum()

Unnamed: 0,0
user_id,0
0,0


In [89]:
test_users.shape

(85000, 1)

In [90]:
answer.head()

Unnamed: 0,user_id,0
0,c2802dadd33d8ae09bb366bdd41212ea,1.0
1,e5b1988db74527ec092f28b0bbfdaac9,0.0
2,6ef1eedbdb72554e53e69782066065c5,0.0
3,7e057293ecae62985a327b7af51858ea,0.0
4,a27bd7ce8828497823fa8d5d05e7bbf7,1.0


In [91]:
answer[0] = answer[0].astype(int)

In [92]:
answer.rename(columns = {0:'target'}, inplace = True)

In [93]:
answer.head()

Unnamed: 0,user_id,target
0,c2802dadd33d8ae09bb366bdd41212ea,1
1,e5b1988db74527ec092f28b0bbfdaac9,0
2,6ef1eedbdb72554e53e69782066065c5,0
3,7e057293ecae62985a327b7af51858ea,0
4,a27bd7ce8828497823fa8d5d05e7bbf7,1


In [94]:
answer.to_csv('test_users_with_target.csv', index = False)