In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [23]:
# Load the raw session table; the first CSV column is used as the index.
# NOTE(review): the stray output fragment left under this cell looks like the tail of a
# pandas warning (possibly a mixed-dtype DtypeWarning) — confirm and pin dtypes if so.
df = pd.read_csv(r"data/sessions.csv", index_col=0);

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
def delete_without_answer(df: pd.DataFrame):
    """Return only the rows that carry an answer in ``is_target_action``.

    Rows whose ``is_target_action`` equals ``-1`` (the "no answer" marker this
    function is named after) are filtered out; the input frame is not modified.
    """
    has_answer = df["is_target_action"].ne(-1)
    return df.loc[has_answer]

In [25]:
def delete_data(df: pd.DataFrame):
    """Drop the identifier/keyword/device columns, then every row with a missing value.

    Returns a new frame; ``df`` itself is left untouched.
    """
    unused_columns = [
        'client_id', 'session_id', 'utm_keyword',
        'device_os', 'device_model', 'device_brand',
    ]
    trimmed = df.drop(columns=unused_columns)
    complete_rows = trimmed.dropna()
    return complete_rows

In [26]:
def to_types(df: pd.DataFrame, inplace=False):
    """Merge ``visit_date`` and ``visit_time`` into one ``visit_datetime`` column.

    The two text columns are joined with a single space, parsed with
    ``pd.to_datetime`` and then removed from the frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing string columns ``visit_date`` and ``visit_time``.
    inplace : bool, default False
        When True, ``df`` itself is modified (new column added, the two source
        columns dropped) and returned. When False, a copy is processed and
        ``df`` is left untouched.

    Returns
    -------
    pd.DataFrame
        The frame with ``visit_datetime`` appended and the source columns gone.
    """
    df_new = df if inplace else df.copy()
    # Column-wise string concatenation replaces a row-by-row apply: same text
    # for string inputs, far fewer Python calls, and it also works on an empty frame.
    stamp_text = df_new['visit_date'] + ' ' + df_new['visit_time']
    df_new['visit_datetime'] = pd.to_datetime(stamp_text)
    # Drop on the object itself so that inplace=True really leaves the caller's
    # frame in its final shape instead of keeping the two source columns.
    df_new.drop(['visit_date', 'visit_time'], axis=1, inplace=True)
    return df_new

In [27]:
def screen_area(screen_resol: pd.Series):
    """Turn ``"<width>x<height>"`` resolution strings into their pixel count.

    Every value is split on ``'x'``, each piece is parsed as an integer and
    the pieces are multiplied together.
    """
    def pixel_count(resolution):
        sides = [int(part) for part in resolution.split('x')]
        return np.prod(sides)

    return screen_resol.apply(pixel_count)

In [28]:
def visit_num_to_category(visit_num: pd.Series, boundaries=(20, 100)):
    """Bucket visit counts into ordinal categories.

    A value gets the position of the first boundary that is strictly greater
    than it, or ``len(boundaries)`` when no boundary is greater. With the
    default boundaries: ``< 20 -> 0``, ``20..99 -> 1``, ``>= 100 -> 2``.

    Parameters
    ----------
    visit_num : pd.Series
        Numeric visit counts.
    boundaries : sequence of numbers, default (20, 100)
        Ascending upper bounds (exclusive) of all buckets except the last.

    Returns
    -------
    pd.Series
        Integer category codes with the index and name of ``visit_num``.
    """
    edges = np.asarray(boundaries)
    # side='right' counts the boundaries <= value, which for ascending edges is
    # exactly the index of the first boundary strictly greater than the value.
    codes = np.searchsorted(edges, visit_num.to_numpy(), side='right')
    return pd.Series(codes, index=visit_num.index, name=visit_num.name)

In [29]:
def create_new_feat(df: pd.DataFrame, inplace=False):
    """Append the derived ``screen_area`` and ``visit_number_cat`` columns.

    ``screen_area`` is computed from ``device_screen_resolution`` and
    ``visit_number_cat`` from ``visit_number``. With ``inplace=False`` the
    work is done on a copy; otherwise ``df`` itself is extended and returned.
    """
    frame = df.copy() if not inplace else df
    resolution = frame['device_screen_resolution']
    visits = frame['visit_number']
    frame['screen_area'] = screen_area(resolution)
    frame['visit_number_cat'] = visit_num_to_category(visits)
    return frame

In [30]:
def scale_numeric_feat(df: pd.DataFrame, inplace=False):
    """Add ``screen_area_scl``: ``screen_area`` run through a freshly fitted StandardScaler.

    The scaler is fitted on the very frame it transforms. With
    ``inplace=False`` a copy is extended; otherwise ``df`` itself is.
    """
    frame = df.copy() if not inplace else df
    # Double brackets keep the selection two-dimensional for the scaler.
    area_column = frame[['screen_area']]
    frame['screen_area_scl'] = StandardScaler().fit_transform(area_column)
    return frame

In [31]:
def transform_datetime_feat(df: pd.DataFrame, inplace=False):
    """Add cyclical (cos/sin) encodings of ``visit_datetime``.

    Six columns are appended, in this order: ``weekday_c``, ``weekday_s``,
    ``day_of_year_c``, ``day_of_year_s``, ``time_c``, ``time_s`` (time is the
    minute of the day).

    The angle is ``2*pi * (max - value) / period`` where ``period`` is the
    number of distinct values in the range (``max - min + 1``). Using the
    span ``max - min`` as the period would give the first and last value of a
    range the same angle, e.g. Monday and Sunday would be indistinguishable.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``visit_datetime`` column (datetime-like values).
    inplace : bool, default False
        When True the columns are added to ``df`` itself; otherwise to a copy.

    Returns
    -------
    pd.DataFrame
        The frame with the six encoded columns appended.
    """
    def cyclic_pair(num, diap_min, diap_max):
        # +1 because both ends of [diap_min, diap_max] are valid, distinct values.
        period = diap_max - diap_min + 1
        arg = 2 * np.pi * (diap_max - num) / period
        return np.cos(arg), np.sin(arg)

    df_new = df if inplace else df.copy()
    # The .dt accessor evaluates each component once for the whole column
    # instead of calling a Python lambda per row and per output column.
    stamp = pd.to_datetime(df_new['visit_datetime']).dt
    days_in_year = pd.Series(np.where(stamp.is_leap_year, 366, 365), index=df_new.index)
    minute_of_day = stamp.hour * 60 + stamp.minute

    df_new['weekday_c'], df_new['weekday_s'] = cyclic_pair(stamp.weekday, 0, 6)
    df_new['day_of_year_c'], df_new['day_of_year_s'] = cyclic_pair(stamp.dayofyear, 1, days_in_year)
    df_new['time_c'], df_new['time_s'] = cyclic_pair(minute_of_day, 0, 1439)
    return df_new

In [32]:
def short_utm_feat(df: pd.DataFrame, inplace=False, length=5):
    """Truncate the UTM tag columns to their first ``length`` characters.

    Affects ``utm_source``, ``utm_campaign`` and ``utm_adcontent``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the three UTM columns as text.
    inplace : bool, default False
        When True ``df`` itself is modified; otherwise a copy is.
    length : int, default 5
        Number of leading characters to keep.

    Returns
    -------
    pd.DataFrame
        The frame with the shortened columns.

    Notes
    -----
    Slicing goes through the ``.str`` accessor, so missing values stay missing
    instead of raising, and non-string entries become NaN.
    """
    df_new = df if inplace else df.copy()
    # Bracket assignment always targets the column; attribute assignment would
    # silently create a plain attribute if the column were absent.
    for column in ('utm_source', 'utm_campaign', 'utm_adcontent'):
        df_new[column] = df_new[column].str[:length]
    return df_new

In [33]:
def delete_columns(df: pd.DataFrame, inplace=False):
    """Remove the source columns that the engineered features have replaced.

    Drops ``device_screen_resolution``, ``visit_datetime``, ``screen_area``
    and ``visit_number``. With ``inplace=True`` the columns are removed from
    ``df`` itself, which is then returned; otherwise a new frame is returned.
    """
    obsolete = ['device_screen_resolution', 'visit_datetime', 'screen_area', 'visit_number']
    if inplace:
        df.drop(columns=obsolete, inplace=True)
        return df
    return df.drop(columns=obsolete)

In [34]:
def category_coder(df: pd.DataFrame, inplace=False):
    """One-hot encode every ``object``-dtype column of the frame.

    The encoder is fitted on the frame itself. The encoded columns are
    appended under names of the form ``<column>_<category>`` and the original
    object columns are dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to encode.
    inplace : bool, default False
        When True ``df`` itself is modified; otherwise a copy is.

    Returns
    -------
    pd.DataFrame
        The encoded frame. If there are no object columns the frame is
        returned as is (nothing to encode).
    """
    df_new = df if inplace else df.copy()
    obj_cols = list(df_new.dtypes[df_new.dtypes == 'object'].index)
    if not obj_cols:
        # Fitting an encoder on zero columns would fail; nothing to do.
        return df_new

    try:
        ohe = OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        ohe = OneHotEncoder(sparse=False)
    encoded = ohe.fit_transform(df_new[obj_cols])

    # `get_feature_names` was superseded by `get_feature_names_out`; passing the
    # column names explicitly yields the same readable names on either API.
    if hasattr(ohe, 'get_feature_names_out'):
        feat_names = ohe.get_feature_names_out(obj_cols)
    else:
        feat_names = ohe.get_feature_names(obj_cols)

    df_new[list(feat_names)] = encoded
    df_new.drop(obj_cols, axis=1, inplace=True)
    return df_new

In [35]:
def preprocess(df: pd.DataFrame):
    """Run the full preprocessing pipeline on a copy of ``df``.

    The stages are applied in order: unanswered rows are removed, unused
    columns and incomplete rows are dropped, the visit timestamp is built,
    new features are derived, the screen area is scaled, the timestamp is
    cyclically encoded, UTM tags are shortened and the replaced source
    columns are deleted.
    """
    stages = (
        delete_without_answer,
        delete_data,
        to_types,
        create_new_feat,
        scale_numeric_feat,
        transform_datetime_feat,
        short_utm_feat,
        delete_columns,
        # category_coder is currently left out of the pipeline.
    )
    frame = df.copy()
    for stage in stages:
        frame = stage(frame)
    return frame

Конверсия 2.9%

Идея: количество визитов преобразовать в категории, разбив на интервалы.

In [12]:
# Keep an independent copy of the loaded frame before `df` is rebound to the
# preprocessed result in the next cell.
# NOTE(review): this cell's execution count (12) is lower than that of the cells around it
# (23-36), so it may stem from an earlier run — confirm with Restart & Run All.
df_old = df.copy()

In [36]:
# Replace the raw frame with its preprocessed version.
# NOTE(review): rebinding `df` makes this cell non-idempotent — presumably a second run
# fails because the dropped columns are already gone; re-run the load cell first.
df = preprocess(df)

## Загрузка обработанного датасета

In [2]:
# Load the prepared dataset from disk; the first CSV column is used as the index.
# NOTE(review): no visible cell writes session_prepared_0.csv — presumably it was saved
# from the preprocessed frame in a cell that is no longer present; confirm.
df = pd.read_csv(r"data/session_prepared_0.csv", index_col=0)

In [3]:
# Fit a dense one-hot encoder on all object-dtype (categorical) columns.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the `sparse` keyword.
    ohe = OneHotEncoder(sparse=False)
obj_cols = list(df.dtypes[df.dtypes == 'object'].index)
ohe.fit(df[obj_cols])

OneHotEncoder(sparse=False)

In [4]:
# Names of the one-hot columns, in the same order as the columns of ohe.transform(...).
# `get_feature_names` was superseded by `get_feature_names_out`; passing the column
# names explicitly yields the same "<column>_<category>" names on either API.
if hasattr(ohe, 'get_feature_names_out'):
    feat_names = ohe.get_feature_names_out(obj_cols)
else:
    feat_names = ohe.get_feature_names(obj_cols)
# CSV header: every existing column followed by the one-hot columns.
fieldnames = list(df.columns) + list(feat_names)

In [5]:
# Stream the frame plus its one-hot columns to CSV in small batches so the dense
# encoded matrix never has to exist for the whole dataset at once.
CHUNK_SIZE = 1_000  # rows encoded per batch; bounds the size of the dense one-hot array
total_rows = df.shape[0]
with open(r"data/session_prepared_1.csv", 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for start in range(0, total_rows, CHUNK_SIZE):
        chunk = df.iloc[start:start + CHUNK_SIZE]
        # One transform call per batch instead of one per row.
        encoded = ohe.transform(chunk[obj_cols])
        for row, encoded_row in zip(chunk.to_dict('records'), encoded):
            # Pair each feature name with its value in THIS row. Zipping the names
            # with the 2-D result directly would pair only the first name with the
            # entire row array and leave every other one-hot column empty.
            new_row = dict(zip(feat_names, encoded_row))
            new_row.update(row)
            writer.writerow(new_row)
        print(f'\r{start + len(chunk)} / {total_rows}', end='')
    

1413265 / 1413265