In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
df = pd.read_csv(r"data/sessions.csv", index_col=0)

  df = pd.read_csv(r"data/sessions.csv", index_col=0)


In [3]:
def delete_without_answer(df: pd.DataFrame):
    return df[df.is_target_action != -1]

In [4]:
def delete_data(df: pd.DataFrame):
    df_new = df.drop(['client_id', 'session_id', 'utm_keyword', 'device_os', 'device_model', 'device_brand'], axis=1)
    return df_new.dropna()

In [5]:
def to_types(df: pd.DataFrame, inplace=False):
    df_new = df if inplace else df.copy()
    df_new['visit_datetime'] = pd.to_datetime(df_new.apply(lambda x: x["visit_date"] + ' ' + x["visit_time"], axis=1))
    df_new = df_new.drop(['visit_date', 'visit_time'], axis=1)
    return df_new

In [6]:
def screen_area(screen_resol: pd.Series):
    return screen_resol.apply(lambda screen: np.prod([int(size) for size in screen.split('x')]))

In [7]:
def visit_num_to_category(visit_num: pd.Series):
    def get_cat(vn):
        boundaries = (20, 100)
        for i, val in enumerate(boundaries):
            if val > vn:
                return i
        return len(boundaries)
    return visit_num.apply(get_cat)

In [8]:
def create_new_feat(df: pd.DataFrame, inplace=False):
    df_new = df if inplace else df.copy()
    df_new['screen_area'] = screen_area(df_new.device_screen_resolution)
    df_new['visit_number_cat'] = visit_num_to_category(df_new.visit_number)
    return df_new

In [9]:
def scale_numeric_feat(df: pd.DataFrame, inplace=False):
    df_new = df if inplace else df.copy()
    scaler = StandardScaler()
    df_new['screen_area_scl'] = scaler.fit_transform(df_new[['screen_area']])
    return df_new

In [10]:
def transform_datetime_feat(df: pd.DataFrame, inplace=False):
    def transform_trigonomy(num, diap_min, diap_max):
        arg = 2*np.pi * (diap_max - num) / (diap_max - diap_min)
        return (np.cos(arg), np.sin(arg))
    
    df_new = df if inplace else df.copy()
    df_new['year'] = df.visit_datetime.apply(lambda dt: dt.year)
    df_new['weekday_c'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.day_of_week, 0, 6)[0])
    df_new['weekday_s'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.day_of_week, 0, 6)[1])
    df_new['day_of_year_c'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.day_of_year, 1, 366 if dt.is_leap_year else 365)[0])
    df_new['day_of_year_s'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.day_of_year, 1, 366 if dt.is_leap_year else 365)[1])
    df_new['time_c'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.hour * 60 + dt.minute, 0, 1439)[0])
    df_new['time_s'] = df.visit_datetime.apply(lambda dt: transform_trigonomy(dt.hour * 60 + dt.minute, 0, 1439)[1])
    return df_new

In [11]:
def preprocess(df: pd.DataFrame):
    df_new = df.copy()
    df_new = delete_without_answer(df_new)
    df_new = delete_data(df_new)
    df_new = to_types(df_new)
    df_new = create_new_feat(df_new)
    df_new = scale_numeric_feat(df_new)
    df_new = transform_datetime_feat(df_new)
    return df_new

Конверсия 2.9%

Идея: количество визитов преобразовать в категории разбив на интервалы

In [12]:
df_old = df.copy()

In [13]:
df = preprocess(df)

In [14]:
df.head()

Unnamed: 0,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,device_category,device_screen_resolution,device_browser,geo_country,geo_city,...,screen_area,visit_number_cat,screen_area_scl,year,weekday_c,weekday_s,day_of_year_c,day_of_year_s,time_c,time_s
0,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,mobile,360x720,Chrome,Russia,Zlatoust,...,259200,0,-0.473727,2021,-0.5,-0.8660254,0.802886,0.596132,-0.775472,0.631382
1,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,mobile,385x854,Samsung Internet,Russia,Moscow,...,328790,0,-0.369922,2021,1.0,0.0,0.688563,0.725176,-0.578385,-0.815764
2,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,mobile,360x720,Chrome,Russia,Krasnoyarsk,...,259200,0,-0.473727,2021,0.5,-0.8660254,0.998659,0.051761,0.760087,-0.649821
5,1,kjsLglQLzykiRbcDiGcD,organic,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,mobile,375x812,Safari,Russia,Saint Petersburg,...,304500,0,-0.406155,2021,1.0,-2.449294e-16,-0.851284,-0.524704,-0.999998,0.002183
6,1,TxKUcPpthBDPieTGmVhx,cpc,FTjNLDyTrXaWYgZymFkV,LcGIUNPUAmXtQJaDfFBR,tablet,602x1029,YaBrowser,Russia,Saint Petersburg,...,619458,0,0.063657,2021,-1.0,1.224647e-16,0.999851,0.017261,-0.695827,-0.71821


In [15]:
df.year.describe()

count    1413265.0
mean        2021.0
std            0.0
min         2021.0
25%         2021.0
50%         2021.0
75%         2021.0
max         2021.0
Name: year, dtype: float64