In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# импортируем необходимые библиотеки
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing, model_selection, metrics
from sklearn.model_selection import train_test_split

import lightgbm as lgb

In [3]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train.csv", parse_dates=["activation_date"])
data.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


In [4]:
train, test = train_test_split(data, random_state=13, test_size=0.3)
val, test = train_test_split(test, random_state=13, test_size=0.3)

In [5]:
# Давайте обучим простенькую модель

# Отделите целевую переменную
y_train = train['deal_probability']
y_val = val['deal_probability']
y_test = test['deal_probability']

# Давайте добавим фактор: день недели активации
train["activation_weekday"] = train['activation_date'].dt.dayofweek
val["activation_weekday"] = val['activation_date'].dt.dayofweek
test["activation_weekday"] = test['activation_date'].dt.dayofweek

# Давайте заэнкодим категориальные переменные
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3"]
for col in cat_vars:
    train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
    val[col] = preprocessing.LabelEncoder().fit_transform(val[col])
    test[col] = preprocessing.LabelEncoder().fit_transform(test[col])

cols_to_drop = ["item_id", "user_id", "title", "description", "activation_date", "image", "deal_probability"]
X_train = train.drop(cols_to_drop, axis=1)
X_val = val.drop(cols_to_drop, axis=1)
X_test = test.drop(cols_to_drop, axis=1)

In [6]:
X_train.head()

Unnamed: 0,region,city,parent_category_name,category_name,param_1,param_2,param_3,price,item_seq_number,user_type,image_top_1,activation_weekday
575324,15,1688,2,38,222,264,1137,,7,1,,3
648443,9,709,0,45,216,264,1137,9000.0,22,1,2985.0,1
206934,12,1026,3,43,367,264,1137,150.0,49,1,816.0,0
161941,7,582,2,22,242,264,1137,30300.0,18,1,1379.0,4
447081,16,1041,4,29,124,134,1137,400.0,27,1,80.0,3


In [7]:
params = {
    "objective" : "regression",
    "metric" : "rmse",
    "num_leaves" : 30,
    "learning_rate" : 0.1,
    "bagging_fraction" : 0.7,
    "feature_fraction" : 0.7,
    "bagging_frequency" : 5,
    "bagging_seed" : 2018,
    "verbosity" : -1
}
    
lgtrain = lgb.Dataset(X_train, label=y_train) # Создать lgb Dataset с тренировочной выборкой
lgval =  lgb.Dataset(X_val, label=y_val) # Создать lgb Dataset с валидационной выборкой
model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20)

pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)

Training until validation scores don't improve for 100 rounds.
[20]	valid_0's rmse: 0.241899
[40]	valid_0's rmse: 0.239068
[60]	valid_0's rmse: 0.238293
[80]	valid_0's rmse: 0.237899
[100]	valid_0's rmse: 0.23769
[120]	valid_0's rmse: 0.237679
[140]	valid_0's rmse: 0.237365
[160]	valid_0's rmse: 0.237355
[180]	valid_0's rmse: 0.237919
[200]	valid_0's rmse: 0.238557
[220]	valid_0's rmse: 0.238391
[240]	valid_0's rmse: 0.238361
[260]	valid_0's rmse: 0.238374
Early stopping, best iteration is:
[167]	valid_0's rmse: 0.237311


In [8]:
print(metrics.mean_squared_error(y_test, pred_test_y))

0.057209751841091144
