In [1]:
import ast
import json
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

In [2]:
# Notebook-wide pandas display settings: wide text cells, up to 70 rows,
# floats with 10 decimals (no scientific notation) and no column limit.
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_rows', 70)
pd.set_option('display.float_format', '{:.10f}'.format)
pd.set_option('display.max_columns', None)

In [3]:
def types_change(data):
    """Coerce the columns of ``data`` to numeric / datetime dtypes in place.

    Columns are processed in three groups:

    * columns whose name contains ``'id'`` are cast to ``int64``, falling
      back to ``float64`` when the integer cast fails (e.g. NaN present);
    * object columns whose name contains neither ``'date'`` nor ``'id'``
      are cast to ``float64``;
    * object columns whose name contains ``'date'`` are cast to
      ``datetime64[ns]``.

    Object columns that cannot be converted are printed and collected.
    Finally a fixed set of label/category columns is cast to ``int`` where
    that cast succeeds; columns of that set missing from ``data`` are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    tuple
        ``(data, exceptions)``: the same frame object and the list of column
        names that could not be converted to ``float64`` / datetime.
    """
    # Only conversion failures are treated as "could not convert"; anything
    # else (KeyboardInterrupt, MemoryError, ...) must propagate.
    cast_errors = (ValueError, TypeError, OverflowError)

    # Columns that could not be converted to a numeric / datetime dtype.
    exceptions = []

    # Split the columns into ids, dates and everything else.
    # NOTE(review): 'id' is a substring match, so names such as 'width' or
    # 'video' would also be treated as id columns.
    id_columns = list(filter(lambda x: 'id' in x, data.columns))
    object_columns = sorted(data.select_dtypes(include='object').columns)
    date_columns = list(filter(lambda x: 'date' in x, object_columns))

    for col in sorted(set(id_columns)):
        try:
            data[col] = data[col].astype(np.int64)
        except cast_errors:
            # Integer cast is impossible (typically NaN); keep the ids as floats.
            data[col] = data[col].astype(np.float64)

    for col in sorted(set(object_columns) - set(date_columns) - set(id_columns)):
        try:
            data[col] = data[col].astype(np.float64)
        except cast_errors:
            print(col)
            exceptions.append(col)

    for col in sorted(set(date_columns)):
        try:
            # An explicit unit is required: pandas >= 2.0 rejects the
            # unit-less ``np.datetime64`` dtype, which previously made every
            # date column end up in ``exceptions``.
            data[col] = data[col].astype('datetime64[ns]')
        except cast_errors:
            print(col)
            exceptions.append(col)
    print('--------------------------------------------------------')
    print("Can't transfer to np.datetime or np.float64:")
    print(exceptions)
    print('--------------------------------------------------------')

    # Specific fields: cast to int where possible; errors='ignore' leaves a
    # column unchanged when the cast fails (e.g. it still contains NaN).
    int_columns = (
        'isclick',
        'categoryid',
        'price',
        'level',
        'parentcategoryid',
        'subcategoryid',
    )
    for col in int_columns:
        if col in data.columns:
            data[col] = data[col].astype(int, errors='ignore')

    return data, exceptions

In [4]:
def drop_high_null_cols(data, addon_features, trashhold = 0.8):
    """Drop, in place, the columns of ``data`` that are mostly null.

    A column is dropped when its number of null values is strictly greater
    than ``trashhold * data.shape[0]``, unless its name is listed in
    ``addon_features``. The sorted list of dropped columns is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.
    addon_features : iterable of str or None
        Column names that must be kept regardless of their null ratio.
        ``None`` is treated as "no protected columns".
    trashhold : float, default 0.8
        Null-ratio threshold (the parameter name's spelling is kept so that
        existing keyword callers keep working).

    Returns
    -------
    None
    """
    protected = () if addon_features is None else tuple(addon_features)

    # Count nulls once; on a multi-million-row frame this is the expensive part.
    null_counts = data.isnull().sum()
    too_sparse = null_counts[null_counts > trashhold * data.shape[0]].index

    # Build the list of sparse columns, leaving out the protected ones.
    cols_to_drop = [col for col in too_sparse if col not in protected]

    print('Колонки с высоким уровнем пропусков:')
    print(sorted(cols_to_drop))

    data.drop(cols_to_drop, axis=1, inplace=True, errors = 'ignore')

In [5]:
# Columns that must survive drop_high_null_cols even when they are mostly
# null; the same list is pickled at the end of the notebook.
addon_features = ['searchid', 'adid', 'locationid']

In [6]:
# Load the merged dataset and lower-case the column names, since the helper
# functions above address columns by their lower-case names (e.g. 'isclick').
data = pd.read_parquet("./data/merged_data.parquet")
data.columns = data.columns.str.lower()
data.head()

Unnamed: 0,searchid,adid,position,objecttype,histctr,isclick,locationid,categoryid,params,price,title,iscontext,level,parentcategoryid,subcategoryid,level_y,regionid,cityid
0,21729517,25814825,1,3,0.005468,0.0,,60.0,"{110:'Комбинезоны и боди', 178:'Для мальчиков'}",711.0,Комбинезон COCOON,1,3.0,3.0,43.0,,,
1,27658647,32454379,7,3,1e-05,0.0,,60.0,"{110:'Обувь', 178:'Для мальчиков'}",1699.0,Кеды,1,3.0,3.0,43.0,,,
2,112185387,8284605,1,3,0.00561,0.0,,50.0,{44:'Инструменты'},54503.0,Виброрейка MASALTA MCD-4,1,3.0,9.0,23.0,,,
3,95746917,5835167,1,3,0.001788,0.0,,22.0,"{83:'Верхняя одежда', 175:'Женская одежда'}",3998.0,Куртка утепленная Savage с доставкой,1,3.0,3.0,47.0,,,
4,113355233,27473310,7,3,0.001435,0.0,,41.0,{143:'iPhone'},399.0,Внешний аккумулятор для iPhone 2600 mAh,1,3.0,12.0,42.0,,,


In [7]:
# Dtype and memory overview of the frame before any type coercion.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4847835 entries, 0 to 4847834
Data columns (total 18 columns):
 #   Column            Dtype  
---  ------            -----  
 0   searchid          int64  
 1   adid              int64  
 2   position          int64  
 3   objecttype        int64  
 4   histctr           float64
 5   isclick           float64
 6   locationid        float64
 7   categoryid        float64
 8   params            object 
 9   price             float64
 10  title             object 
 11  iscontext         int64  
 12  level             float64
 13  parentcategoryid  float64
 14  subcategoryid     float64
 15  level_y           float64
 16  regionid          float64
 17  cityid            float64
dtypes: float64(11), int64(5), object(2)
memory usage: 665.7+ MB


In [8]:
# Coerce dtypes in place. The returned list of unconvertible columns is
# discarded here; types_change already prints it.
data, _ = types_change(data)

params
title
--------------------------------------------------------
Can't transfer to np.datetime or np.float64:
['params', 'title']
--------------------------------------------------------


In [9]:
# Drop columns that are more than 80% null (the default trashhold), always
# keeping the columns listed in addon_features.
drop_high_null_cols(data, addon_features)

Колонки с высоким уровнем пропусков:
['cityid', 'level_y', 'regionid']


In [10]:
def _first_param_value(raw):
    """Return the first value of a dict literal stored as text.

    ``raw`` is expected to be a string such as ``"{110:'Обувь', 178:'...'}"``.
    Nulls and empty dicts map to NaN.
    """
    if pd.isnull(raw):
        return np.nan
    # literal_eval only accepts Python literals, so text read from the data
    # file can never execute code (eval could), and the string is parsed once
    # instead of twice per row.
    parsed = ast.literal_eval(raw)
    return next(iter(parsed.values()), np.nan)


# Keep only the first parameter value of each ad.
# NOTE(review): assumes every non-null 'params' entry is a plain dict literal;
# a non-literal entry now raises ValueError instead of being evaluated.
data['params'] = data['params'].apply(_first_param_value)

In [11]:
# Rename the click label to 'target'. The second line lower-cases the column
# names again; they were already lower-cased right after loading.
data = data.rename(columns = {'isclick':'target'})
data.columns = data.columns.str.lower()

In [12]:
# Preview the frame after preprocessing.
data.head()

Unnamed: 0,searchid,adid,position,objecttype,histctr,target,locationid,categoryid,params,price,title,iscontext,level,parentcategoryid,subcategoryid
0,21729517,25814825,1,3,0.005468,0,,60.0,Комбинезоны и боди,711.0,Комбинезон COCOON,1,3.0,3.0,43.0
1,27658647,32454379,7,3,1e-05,0,,60.0,Обувь,1699.0,Кеды,1,3.0,3.0,43.0
2,112185387,8284605,1,3,0.00561,0,,50.0,Инструменты,54503.0,Виброрейка MASALTA MCD-4,1,3.0,9.0,23.0
3,95746917,5835167,1,3,0.001788,0,,22.0,Верхняя одежда,3998.0,Куртка утепленная Savage с доставкой,1,3.0,3.0,47.0
4,113355233,27473310,7,3,0.001435,0,,41.0,iPhone,399.0,Внешний аккумулятор для iPhone 2600 mAh,1,3.0,12.0,42.0


In [13]:
# Persist the preprocessed frame next to the input file.
data.to_parquet("./data/preprocessed_data.parquet")

In [14]:
# Persist the list of protected columns with pickle
# (presumably consumed by later notebooks; confirm).
with open("./inter_files/addon_features.pickle", 'wb') as f:
    pickle.dump(addon_features, f)