In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
df = pd.read_csv(r"data/sessions.csv", index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
def delete_without_answer(df: pd.DataFrame):
    return df[df.is_target_action != -1]

In [4]:
def delete_data(df: pd.DataFrame):
    df_new = df.drop(['client_id', 'session_id', 'utm_keyword', 'device_os', 'device_model', 'device_brand'], axis=1)
    return df_new.dropna()

In [5]:
def to_types(df: pd.DataFrame, inplace=False):
    df_new = df if inplace else df.copy()
    df_new['visit_datetime'] = pd.to_datetime(df_new.apply(lambda x: x["visit_date"] + ' ' + x["visit_time"], axis=1))
    df_new = df_new.drop(['visit_date', 'visit_time'], axis=1)
    return df_new

In [6]:
def screen_area(screen_resol: pd.Series):
    return screen_resol.apply(lambda screen: np.prod([int(size) for size in screen.split('x')]))

In [7]:
def visit_num_to_category(visit_num: pd.Series):
    def get_cat(vn):
        boundaries = (20, 100)
        for i, val in enumerate(boundaries):
            if val > vn:
                return i
        return len(boundaries)
    return visit_num.apply(get_cat)

In [8]:
def create_new_feat(df: pd.DataFrame, inplace=False):
    df_new = df if inplace else df.copy()
    df_new['screen_area'] = screen_area(df_new.device_screen_resolution)
    df_new['visit_number_cat'] = visit_num_to_category(df_new.visit_number)
    return df_new

In [9]:
def scale_numeric_feat(df: pd.DataFrame, inplace=False):
    df_new = df if inplace else df.copy()
    scaler = StandardScaler()
    df_new['screen_area_scl'] = scaler.fit_transform(df_new[['screen_area']])
    return df_new

In [24]:
def transform_datetime_feat(df: pd.DataFrame, inplace=False):
    def transform_trigonomy(num, diap_min, diap_max):
        arg = 2*np.pi * (diap_max - num) / (diap_max - diap_min)
        return (np.cos(arg), np.sin(arg))
    
    df_new = df if inplace else df.copy()
    df_new['weekday_c'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.weekday(), 0, 6)[0])
    df_new['weekday_s'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.weekday(), 0, 6)[1])
    df_new['day_of_year_c'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.dayofyear, 1, 366 if dt.is_leap_year else 365)[0])
    df_new['day_of_year_s'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.dayofyear, 1, 366 if dt.is_leap_year else 365)[1])
    df_new['time_c'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.hour * 60 + dt.minute, 0, 1439)[0])
    df_new['time_s'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.hour * 60 + dt.minute, 0, 1439)[1])
    return df_new

In [37]:
def short_utm_feat(df: pd.DataFrame, inplace=False):
    df_new = df if inplace else df.copy()
    
    df_new.utm_source = df_new.utm_source.apply(lambda s: s[:5])
    df_new.utm_campaign = df_new.utm_campaign.apply(lambda s: s[:5])
    df_new.utm_adcontent = df_new.utm_adcontent.apply(lambda s: s[:5])
    return df_new

In [43]:
def delete_columns(df: pd.DataFrame, inplace=False):
    df_new = df if inplace else df.copy()
    df_new.drop(['device_screen_resolution', 'visit_datetime', 'screen_area', 'visit_number'], axis=1, inplace=True)
    return df_new

In [76]:
def category_coder(df: pd.DataFrame, inplace=False):
    df_new = df if inplace else df.copy()
    ohe = OneHotEncoder(sparse=False)
    obj_cols = list(df.dtypes[df.dtypes == 'object'].index)
    ohe.fit(df[obj_cols])
    df_new[ohe.get_feature_names()] = ohe.transform(df[obj_cols])
    df_new.drop(obj_cols, axis=1, inplace=True)
    return df_new

In [55]:
def preprocess(df: pd.DataFrame):
    df_new = df.copy()
    df_new = delete_without_answer(df_new)
    df_new = delete_data(df_new)
    df_new = to_types(df_new)
    df_new = create_new_feat(df_new)
    df_new = scale_numeric_feat(df_new)
    df_new = transform_datetime_feat(df_new)
    df_new = short_utm_feat(df_new)
    df_new = delete_columns(df_new)
    df_new = category_coder(df_new)
    return df_new

Конверсия 2.9%

Идея: количество визитов преобразовать в категории разбив на интервалы

In [12]:
df_old = df.copy()

In [18]:
df = preprocess(df)

In [45]:
df

Unnamed: 0,utm_source,utm_medium,utm_campaign,utm_adcontent,device_category,device_browser,geo_country,geo_city,is_target_action,visit_number_cat,screen_area_scl,weekday_c,weekday_s,day_of_year_c,day_of_year_s,time_c,time_s
0,ZpYIo,banner,LEoPH,vCIpm,mobile,Chrome,Russia,Zlatoust,0,0,-0.473727,-0.5,-8.660254e-01,0.802886,0.596132,-0.775472,0.631382
1,MvfHs,cpm,FTjNL,xhoen,mobile,Samsung Internet,Russia,Moscow,0,0,-0.369922,1.0,0.000000e+00,0.688563,0.725176,-0.578385,-0.815764
2,ZpYIo,banner,LEoPH,vCIpm,mobile,Chrome,Russia,Krasnoyarsk,0,0,-0.473727,0.5,-8.660254e-01,0.998659,0.051761,0.760087,-0.649821
5,kjsLg,organic,LTuZk,JNHcP,mobile,Safari,Russia,Saint Petersburg,0,0,-0.406155,1.0,-2.449294e-16,-0.851284,-0.524704,-0.999998,0.002183
6,TxKUc,cpc,FTjNL,LcGIU,tablet,YaBrowser,Russia,Saint Petersburg,0,0,0.063657,-1.0,1.224647e-16,0.999851,0.017261,-0.695827,-0.718210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860034,ISrKo,blogger_stories,zfwIe,JNHcP,mobile,Safari,Russia,Zheleznodorozhny,0,0,-0.307043,1.0,0.000000e+00,-0.539320,0.842101,-0.864749,0.502204
1860038,fDLlA,(none),LTuZk,JNHcP,mobile,Safari,Russia,Stavropol,0,0,-0.369371,-0.5,-8.660254e-01,-0.802886,-0.596132,-0.501260,-0.865297
1860039,fDLlA,(none),LTuZk,JNHcP,mobile,Safari,Russia,Moscow,0,0,-0.487264,1.0,-2.449294e-16,0.700978,0.713183,-0.603044,0.797708
1860040,ZpYIo,banner,LEoPH,JNHcP,mobile,Chrome,Russia,Chelyabinsk,0,0,-0.361490,1.0,-2.449294e-16,0.851284,0.524704,-0.585487,0.810682


In [53]:
list(df.dtypes[df.dtypes == 'object'].index)

['utm_source',
 'utm_medium',
 'utm_campaign',
 'utm_adcontent',
 'device_category',
 'device_browser',
 'geo_country',
 'geo_city']

In [57]:
df.head()

Unnamed: 0,utm_source,utm_medium,utm_campaign,utm_adcontent,device_category,device_browser,geo_country,geo_city,is_target_action,visit_number_cat,screen_area_scl,weekday_c,weekday_s,day_of_year_c,day_of_year_s,time_c,time_s
0,ZpYIo,banner,LEoPH,vCIpm,mobile,Chrome,Russia,Zlatoust,0,0,-0.473727,-0.5,-0.8660254,0.802886,0.596132,-0.775472,0.631382
1,MvfHs,cpm,FTjNL,xhoen,mobile,Samsung Internet,Russia,Moscow,0,0,-0.369922,1.0,0.0,0.688563,0.725176,-0.578385,-0.815764
2,ZpYIo,banner,LEoPH,vCIpm,mobile,Chrome,Russia,Krasnoyarsk,0,0,-0.473727,0.5,-0.8660254,0.998659,0.051761,0.760087,-0.649821
5,kjsLg,organic,LTuZk,JNHcP,mobile,Safari,Russia,Saint Petersburg,0,0,-0.406155,1.0,-2.449294e-16,-0.851284,-0.524704,-0.999998,0.002183
6,TxKUc,cpc,FTjNL,LcGIU,tablet,YaBrowser,Russia,Saint Petersburg,0,0,0.063657,-1.0,1.224647e-16,0.999851,0.017261,-0.695827,-0.71821


In [80]:
df_ohe = category_coder(df)

MemoryError: Unable to allocate 34.4 GiB for an array with shape (1413265, 3263) and data type float64