In [1]:
import pandas as pd

# TODO: add a processing step that analyses the `_mapped` data and outputs a
# boolean "filled incorrectly" column.
# Load the corrected raw dataset; later cells work on copies of this frame.
raw_df = pd.read_parquet("data/raw/corrected_raw_data.parquet")

In [2]:
import numpy as np
from sklearn.discriminant_analysis import StandardScaler
from exploration.utils import (
    cast_to_string,
    gender_map,
    marital_status_map,
    optimize_dtypes,
    scale_with_max_value,
    day_name_to_num,
    cast_ints_and_floats,
    exclude_user_hourly_cum_perc_revenue_columns,
    scale_and_log_numerical_df,
    encode_categorical_features,
    convert_to_absolute_values,
)
from exploration.column_config import (
    cum_perc_revenue_map,
    drop_columns,
    max_scalers,
    one_hot_columns,
    high_cardinality_columns,
    log_scalers,
    targets,
)

In [3]:
# ---------------------------------------------------------------------------
# Build the model-ready feature matrix from raw_df and persist it.
# Steps: column selection -> per-column clean-up -> max scaling -> log scaling
# of numerical columns -> one-hot / frequency encoding of categoricals ->
# column-wise concat. Statement order matters: later steps read the columns
# produced by earlier ones.
# ---------------------------------------------------------------------------

# Work on a deep copy so raw_df itself is never modified by this cell.
df = raw_df.copy(True)
# Rename columns using the cum_perc_revenue_map mapping from column_config.
df = df.rename(columns=cum_perc_revenue_map)
# Keep every column that is not listed in drop_columns, in alphabetical order.
cols = sorted([col for col in df.columns if col not in drop_columns])
df = df[cols]
# Project helper; presumably removes the per-user hourly cumulative-percentage
# revenue columns - TODO confirm against exploration.utils.
df = exclude_user_hourly_cum_perc_revenue_columns(df)

# Per-column clean-up.
df["payout"] = df["payout"].astype(float).round(2)
# Project helper; which columns it converts is not visible from this cell.
df = cast_to_string(df)
df["user_gender"] = df["user_gender"].apply(gender_map)
df["user_marital_status"] = df["user_marital_status"].apply(marital_status_map)
# Clamp ages into the closed range [16, 100].
df["user_age"] = df["user_age"].clip(lower=16, upper=100)
df["user_registration_day_of_week"] = df["user_registration_day_of_week"].apply(
    day_name_to_num
)
# Express the registration year as an offset from the earliest year in the data.
df["user_registration_year"] = (
    df["user_registration_year"] - df["user_registration_year"].min()
)

# Creating absolute values
df = convert_to_absolute_values(df)
# Replace every remaining missing value with 0 (applies to all columns).
df = df.fillna(0)

# Apply scale_with_max_value once per entry of max_scalers.
for scaler in max_scalers:
    df = scale_with_max_value(df, scaler)

# Snapshot of the frame before log scaling and categorical encoding.
df.to_parquet("data/raw/_pre_scaled_data.parquet")

# numerical_df: the columns listed in log_scalers, copied so edits below do not
# touch df.
log_df = df[log_scalers].copy(True)

# Process the asymmetrical log column: the signed difference is split into two
# non-negative columns (its positive part and the magnitude of its negative
# part), then the signed original is dropped.
col = "diff_avg_user_order_revenue_usd_24h_48h_vs_0h_24h"
log_df[f"positive_{col}"] = np.where(log_df[col].values >= 0, log_df[col].values, 0)
log_df[f"negative_{col}"] = np.where(
    log_df[col].values < 0, np.abs(log_df[col].values), 0
)
log_df = log_df.drop(columns=[col])

# user_registration_confirmation_minutes_diff: clamp to [0, 99th percentile]
# so negative values and the extreme upper tail are capped.
log_df["user_registration_confirmation_minutes_diff"] = log_df[
    "user_registration_confirmation_minutes_diff"
].clip(
    lower=0, upper=log_df["user_registration_confirmation_minutes_diff"].quantile(0.99)
)

# Project helpers; presumably a dtype cast followed by scaling and a log
# transform of every column - TODO confirm against exploration.utils.
log_df = cast_ints_and_floats(log_df)
log_df = scale_and_log_numerical_df(log_df)

# categorical_df: one-hot encode the columns listed in one_hot_columns.
# drop_first=True omits the first level of each column; dummy_na=False adds no
# indicator column for missing values.
one_hot_df = df[one_hot_columns].copy()
one_hot_df = pd.get_dummies(
    one_hot_df,
    columns=one_hot_columns,
    prefix=one_hot_columns,
    prefix_sep="_",
    dummy_na=False,
    drop_first=True,
)

# high cardinality columns: encoded by the project helper with
# encoding_type="frequency" (presumably the source of the *_freq_encoded
# columns - TODO confirm).
high_df = encode_categorical_features(
    df=df[high_cardinality_columns],
    encoding_type="frequency",
)

# NOTE: from here on `df` is rebound to the final feature matrix, not the
# cleaned frame above. The three feature groups are joined column-wise, the
# columns are sorted alphabetically, and rows with any missing value are dropped.
df = pd.concat([log_df, one_hot_df, high_df], axis=1)
df = df[sorted(df.columns)]
df = df.dropna()

df.to_parquet("data/scaled_data.parquet")

  result = func(self.values, **kwargs)


NA rows number: 10


In [4]:
def _split_by_revenue(frame, revenue_col):
    """Split ``frame`` into converted and non-converted users for one horizon.

    Parameters
    ----------
    frame : pd.DataFrame
        Feature matrix containing ``revenue_col``.
    revenue_col : str
        Name of the revenue column; a row counts as converted when its value
        in this column is strictly greater than 0.

    Returns
    -------
    tuple
        ``(mask, converted, non_converted)`` where ``mask`` is the boolean
        Series ``frame[revenue_col] > 0``, ``converted`` holds the matching
        rows re-indexed from 0, and ``non_converted`` holds the remaining rows
        with their original index.
    """
    has_revenue = frame[revenue_col] > 0
    return has_revenue, frame[has_revenue].reset_index(drop=True), frame[~has_revenue]


def _balance_with_non_converted(converted, non_converted, random_state=42):
    """Append as many randomly sampled non-converted rows as there are converted rows.

    Parameters
    ----------
    converted, non_converted : pd.DataFrame
        The two classes to combine.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the draw is reproducible.

    Returns
    -------
    tuple
        ``(sample, balanced)``: the sampled non-converted rows, and the
        converted rows followed by that sample, re-indexed from 0.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when ``non_converted`` has fewer rows
        than ``converted`` (sampling is done without replacement).
    """
    sample = non_converted.sample(n=converted.shape[0], random_state=random_state)
    balanced = pd.concat([converted, sample]).reset_index(drop=True)
    return sample, balanced


# Converted / non-converted split for each revenue horizon.
filter_users_30d, df_users_30d, non_converted_users_30d = _split_by_revenue(
    df, "user_revenue_usd_30d"
)
filter_users_60d, df_users_60d, non_converted_users_60d = _split_by_revenue(
    df, "user_revenue_usd_60d"
)
filter_users_90d, df_users_90d, non_converted_users_90d = _split_by_revenue(
    df, "user_revenue_usd_90d"
)

# 1:1 class-balanced datasets (every draw uses the helper's default seed, 42).
sample_non_converted_users_30d, df_users_30d_balanced = _balance_with_non_converted(
    df_users_30d, non_converted_users_30d
)
sample_non_converted_users_60d, df_users_60d_balanced = _balance_with_non_converted(
    df_users_60d, non_converted_users_60d
)
sample_non_converted_users_90d, df_users_90d_balanced = _balance_with_non_converted(
    df_users_90d, non_converted_users_90d
)

# Converted-only datasets.
df_users_30d.to_parquet("data/scaled_revenue_30d.parquet")
df_users_60d.to_parquet("data/scaled_revenue_60d.parquet")
df_users_90d.to_parquet("data/scaled_revenue_90d.parquet")

# Balanced datasets.
df_users_30d_balanced.to_parquet("data/scaled_revenue_30d_balanced.parquet")
df_users_60d_balanced.to_parquet("data/scaled_revenue_60d_balanced.parquet")
df_users_90d_balanced.to_parquet("data/scaled_revenue_90d_balanced.parquet")

In [7]:
# Number of non-converted rows added to each "small" dataset: one tenth
# (rounded down) of the 30-day converted user count.
# NOTE(review): the 60d and 90d samples reuse this 30d-derived size rather
# than a tenth of their own converted counts - confirm this is intended.
small_sample_size = len(df_users_30d) // 10

# Draw the reduced non-converted samples (fixed seed for reproducibility).
sample_non_converted_users_30d_small = non_converted_users_30d.sample(
    n=small_sample_size, random_state=42
)
sample_non_converted_users_60d_small = non_converted_users_60d.sample(
    n=small_sample_size, random_state=42
)
sample_non_converted_users_90d_small = non_converted_users_90d.sample(
    n=small_sample_size, random_state=42
)

# All converted rows followed by the reduced sample, re-indexed from 0.
df_users_30d_balanced_small = pd.concat(
    [df_users_30d, sample_non_converted_users_30d_small], ignore_index=True
)
df_users_60d_balanced_small = pd.concat(
    [df_users_60d, sample_non_converted_users_60d_small], ignore_index=True
)
df_users_90d_balanced_small = pd.concat(
    [df_users_90d, sample_non_converted_users_90d_small], ignore_index=True
)

# Persist the three datasets.
for _frame, _target in (
    (df_users_30d_balanced_small, "data/scaled_revenue_30d_balanced_small.parquet"),
    (df_users_60d_balanced_small, "data/scaled_revenue_60d_balanced_small.parquet"),
    (df_users_90d_balanced_small, "data/scaled_revenue_90d_balanced_small.parquet"),
):
    _frame.to_parquet(_target)

In [8]:
# Preview the small 30-day dataset built in the previous cell (rich display of
# the frame as the cell's last expression).
df_users_30d_balanced_small

Unnamed: 0,aff_id_freq_encoded,avg_user_order_revenue_usd_0h_24h,avg_user_order_revenue_usd_0h_48h,avg_user_order_revenue_usd_24h_48h,country_AU,country_CA,country_DE,country_IT,country_UK,country_US,...,user_registration_device_smartphone,user_registration_device_tablet,user_registration_device_unknown,user_registration_device_web,user_revenue_usd_30d,user_revenue_usd_60d,user_revenue_usd_90d,user_seek_gender_female,user_seek_gender_male,user_site_alias_freq_encoded
0,0.080556,2.668616,2.668616,0.000000,False,False,False,False,False,True,...,False,False,False,False,3.979682,3.979682,3.979682,True,False,0.078534
1,0.017888,4.563410,4.649187,4.801312,False,False,False,False,True,False,...,False,False,False,False,5.741367,5.741367,5.741367,True,False,0.050662
2,0.029309,0.000000,0.000000,0.000000,False,True,False,False,False,False,...,False,False,False,False,4.419684,6.909454,6.922023,True,False,0.040929
3,0.057554,0.000000,0.000000,0.000000,False,True,False,False,False,False,...,False,False,False,False,2.165619,2.165619,2.165619,True,False,0.024259
4,0.015056,2.078191,2.078191,0.000000,False,False,False,False,False,True,...,False,False,True,False,2.078191,2.078191,2.078191,True,False,0.226680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282514,0.015706,0.000000,0.000000,0.000000,False,False,False,False,False,True,...,False,False,False,False,0.000000,0.000000,0.000000,True,False,0.226680
282515,0.087582,0.000000,0.000000,0.000000,False,False,False,False,False,True,...,False,False,False,False,0.000000,0.000000,0.000000,True,False,0.226680
282516,0.015706,0.000000,0.000000,0.000000,False,False,False,False,False,True,...,False,False,False,False,0.000000,0.000000,0.000000,True,False,0.226680
282517,0.287380,0.000000,0.000000,0.000000,False,False,False,False,False,True,...,False,False,True,False,0.000000,0.000000,0.000000,True,False,0.193971
