In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import sklearn
import sklearn.metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
import category_encoders as ce
import xfeat
from xfeat.cat_encoder import TargetEncoder
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
import lightgbm as lgb
import gensim.downloader
from gensim.models import KeyedVectors

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
def concat_train_test(train, test):
    all_df = pd.merge(train, test, how="outer")
    all_df["data_type"] = ""
    for n in range(len(all_df)):
        all_df["data_type"][n] = "test" if np.isnan(all_df["state"][n]) else "train"
    return all_df

In [4]:
all_df = concat_train_test(train, test)
all_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_df["data_type"][n] = "test" if np.isnan(all_df["state"][n]) else "train"


Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,data_type
0,0,4001-5000,CH,29,publishing,young adult,"<div class=""contents""><div><span class=""bold"">...",0.0,train
1,1,3001-4000,NL,34,fashion,ready-to-wear,"<div class=""contents""><div><h1 class=""page-anc...",0.0,train
2,2,19001-20000,US,30,food,spaces,"<div class=""contents""><div><p> As our society ...",0.0,train
3,3,2001-3000,US,41,technology,3d printing,"<div class=""contents""><div><p>My name is Donal...",0.0,train
4,4,2001-3000,GB,29,technology,diy electronics,"<div class=""contents""><div><div class=""templat...",1.0,train
...,...,...,...,...,...,...,...,...,...
21084,21084,9001-10000,US,30,food,drinks,"<div class=""contents""><div><p>Its time to get ...",,test
21085,21085,1-1000,US,29,food,small batch,"<div class=""contents""><div><p>I have been roas...",,test
21086,21086,1001-2000,US,27,crafts,pottery,"<div class=""contents""><div><p> I have ...",,test
21087,21087,2001-3000,US,30,design,graphic design,"<div class=""contents""><div><h1 class=""page-anc...",,test


In [5]:
def goal2feature(input_df):
    tmp = input_df["goal"]
    tmp = tmp.replace("100000+", "100000-100999")
    tmp = np.array([g.split("-") for g in tmp], dtype="int")
    output_df = pd.DataFrame(tmp, columns=["goal_min", "goal_max"])
    output_df["goal_upper_flag"] = output_df["goal_min"] == 100000
    output_df["goal_lower_flag"] = output_df["goal_min"] == 1
    output_df["goal_mean"] = output_df[["goal_min", "goal_max"]].mean(axis=1)
    output_df["goal_q25"] = output_df[["goal_min", "goal_max"]].quantile(q=0.25, axis=1)
    output_df["goal_q75"] = output_df[["goal_min", "goal_max"]].quantile(q=0.75, axis=1)
    return output_df

def get_numerical_feature(input_df):
    cols = ["duration"]
    return input_df[cols].copy()

def get_bins(input_df):
    _input_df = pd.concat([
        input_df[["duration"]],
        goal2feature(input_df),
    ], axis=1)
    output_df = pd.DataFrame()
    output_df["bins_duration"] = pd.cut(_input_df["duration"],
                                        bins=[-1, 30, 45, 60, 100],
                                        labels=[1, 2, 3, 4])
    output_df["bins_goal"] = pd.cut(_input_df["goal_max"],
                                    bins=[-1, 19999, 49999, 79999, 99999, np.inf],
                                    labels=[1, 2, 3, 4, 5])
    return output_df.astype(int)

In [6]:
def get_cross_features(df_origin, cols_operator):
    df = df_origin.copy()
    output_df = pd.DataFrame()
    for col1, col2, operator in cols_operator:
        if operator == "+":
            output_df[f"{col1}{operator}{col2}"] = df[col1] + df[col2]
        elif operator == "*":
            output_df[f"{col1}{operator}{col2}"] = df[col1] * df[col2]
        elif operator == "/":
            output_df[f"{col1}{operator}{col2}"] = df[col1] / (df[col2] +0.0000000001)
    return output_df


def labelencord(df_origin, encode_cols):
    df = df_origin.copy()
    le = LabelEncoder()
    for f in encode_cols:
        try:
            df[f] = le.fit_transform(list(df[f].values))
        except:
            print(f"LE didn't work for column ''{f}''")
    return df


def get_coe_features(df_origin, categorical_cols):
    df = df_origin.copy()
    encoder = ce.CountEncoder()
    output_df = encoder.fit_transform(df[categorical_cols]).add_prefix("coe_")
    return output_df


def get_te_features(df_origin, categorical_cols, target_col, fold):
    df = df_origin.copy()
    encoder = TargetEncoder(
        input_cols=categorical_cols, 
        target_col=target_col,
        fold=fold,
        output_prefix="te_",
        output_suffix=""
        )
    output_df = encoder.fit_transform(df)
    return output_df

def get_agg_features(df_origin, group_keys, group_values):
    df = df_origin.copy()
    #agg_methods = ["sum", "min", "max", "mean", "std", "count"]
    #agg_methods = ["sum", "mean", "std", "count"]
    agg_methods = ["std"]
    cols_keys = []
    df_keys = df.copy()
    for group_key in group_keys:
        df_key, cols_key = xfeat.aggregation(df, group_key, group_values, agg_methods)
        cols_keys.extend(cols_key)
        df_keys = pd.concat([df_keys, df_key], axis=1)
    return df_keys[cols_keys]

In [7]:
def apply_preprofuncs(df_origin, cols_operator, group_keys, group_values, categorical_cols):
    df = df_origin.copy()
    preprofuncs = [
        (goal2feature, None),
        (get_bins, None),
        (get_cross_features, [cols_operator]),
        (get_coe_features, [categorical_cols]),
        (get_agg_features, [group_keys, group_values]),
    ]
    for func, args in preprofuncs:
        if args:
            df = pd.concat([df, func(df, *args)], axis=1)
        else:
            df = pd.concat([df, func(df)], axis=1)
    
    df = labelencord(df, categorical_cols)
    df = df.drop("goal", axis=1)    
    return df

In [None]:
cols_operator = [
    ("category1", "category2", "+"),
    ("country", "category1", "+"),
    ("country", "category2", "+"),
    ("bins_duration", "bins_goal", "+"),
    ("goal_max", "duration", "/"),
    ("goal_min", "duration", "/"),
    ("goal_mean", "duration", "/"),
    ("goal_max", "duration", "*"),
    ("goal_min", "duration", "*"),
    ("goal_mean", "duration", "*"),
    ]
group_keys = (
    "country",
    "category1",
    "category2",
    "bins_duration",
    "bins_goal",
    "country+category1",
    "country+category2",
    "category1+category2",
    "bins_duration+bins_goal",
    "coe_country",
    "coe_category1",
    "coe_category2",
    "coe_bins_duration",
    "coe_bins_goal",
    "coe_country+category1",
    "coe_country+category2",
    "coe_category1+category2",
    "coe_bins_duration+bins_goal",
    )
group_values = (
    "goal_min",
    "goal_max",
    "goal_mean",
    "duration",
    "goal_max/duration",
    "goal_min/duration",
    "goal_mean/duration",
    )
categorical_cols = [
    "country",
    "category1",
    "category2",
    'bins_duration',
    'bins_goal',
    'category1+category2',
    'country+category1',
    'country+category2',
    'bins_duration+bins_goal',  
    ]


in_df = all_df.copy()
#base_line_df = apply_preprofuncs(in_df, cols_operator, group_keys, group_values, categorical_cols)
#base_line_df.head()

In [10]:
#base_line_df.to_csv('data/prepared_df.csv', index=False)

In [8]:
nlp_prepared_df = pd.read_csv('data/nlp_prepared_df.csv')
nlp_prepared_df.head()

Unnamed: 0,id,goal,country,duration,category1,category2,state,data_type,number_of_chars,number_of_words,...,number_of_</figure>,number_of_<polygon>,number_of_<button>,number_of_<ul>,number_of_<track>,number_of_<!>,number_of_</span>,number_of_</i>,number_of_</iframe>,number_of_<svg>
0,0,4001-5000,CH,29,publishing,young adult,0.0,train,5289,961,...,6,0,0,0,0,0,18,0,0,0
1,1,3001-4000,NL,34,fashion,ready-to-wear,0.0,train,1144,202,...,15,0,0,0,0,0,0,0,0,0
2,2,19001-20000,US,30,food,spaces,0.0,train,3316,549,...,5,5,4,0,0,6,5,0,0,3
3,3,2001-3000,US,41,technology,3d printing,0.0,train,1670,293,...,0,0,0,0,0,0,0,0,0,0
4,4,2001-3000,GB,29,technology,diy electronics,1.0,train,7560,1211,...,67,45,36,1,0,54,66,0,0,27


In [9]:
def make_cols_operator(nlp_df):
    df = nlp_df.copy()
    cols_operator = []
    #columns = list(df.drop(["id", "goal", "country", "duration", "category1", "category2", "state", "data_type",], axis=1).columns)
    columns = [
        "duration",
        "goal_min",
        "bins_duration",
        "bins_goal",
        "number_of_chars",
        "number_of_words",
        "number_of_sentences",
        "number_of_excmark",
        "number_of_questmark",
        "number_of_punctuation",
        "number_of_symbols",
        "number_of_unique_words",
        "number_of_<span>",
        "number_of_<p>",
        "number_of_<div>",
        "number_of_<video>",
        "number_of_<iframe>",
        "number_of_<img>",
        "number_of_<a>",
        "number_of_<source>",
        "number_of_<figure>",
        "number_of_<li>",
        "number_of_<figcaption>",
    ]
    for c1, col1 in enumerate(columns):
        for c2 in range(c1+1, len(columns)):
            col2 = columns[c2]
            for operator in "+*/":
                cols_operator.append((f"{col1}", f"{col2}", operator))
    return cols_operator

numeric_cols_operator = make_cols_operator(nlp_prepared_df)
len(numeric_cols_operator)

759

In [4]:
def colname_by_colsoperator(cols_operators):
    colnames = []
    for col_operator in cols_operators:
        colname = f"{col_operator[0]}{col_operator[2]}{col_operator[1]}"
        colnames.append(colname)
    return colnames

In [12]:
cols_operator = [
    ("category1", "category2", "+"),
    ("country", "category1", "+"),
    ("country", "category2", "+"),
    #("bins_duration", "bins_goal", "+"),
    ]
cols_operator.extend(numeric_cols_operator)
group_keys = [
    "country",
    "category1",
    "category2",
    "country+category1",
    "country+category2",
    "category1+category2",
]
group_values = [
    "goal_min",
    "duration",
    "bins_duration",
    "bins_goal",
    "number_of_chars",
    "number_of_words",
    "number_of_sentences",
    "number_of_excmark",
    "number_of_questmark",
    "number_of_punctuation",
    "number_of_symbols",
    "number_of_unique_words",
    "number_of_<span>",
    "number_of_<p>",
    "number_of_<div>",
    "number_of_<video>",
    "number_of_<iframe>",
    "number_of_<img>",
    "number_of_<a>",
    "number_of_<source>",
    "number_of_<figure>",
    "number_of_<li>",
    "number_of_<figcaption>",
]
colnames = colname_by_colsoperator(numeric_cols_operator)
group_values.extend(colnames)

categorical_cols = [
    "country",
    "category1",
    "category2",
    'category1+category2',
    'country+category1',
    'country+category2',  
    ]


in_df = nlp_prepared_df.copy()
base_line_df = apply_preprofuncs(in_df, cols_operator, group_keys, group_values, categorical_cols)
base_line_df.head()

Unnamed: 0,id,country,duration,category1,category2,state,data_type,number_of_chars,number_of_words,number_of_sentences,...,agg_std_number_of_<source>/number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<figure>+number_of_<li>_grpby_category1+category2,agg_std_number_of_<figure>*number_of_<li>_grpby_category1+category2,agg_std_number_of_<figure>/number_of_<li>_grpby_category1+category2,agg_std_number_of_<figure>+number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<figure>*number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<figure>/number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<li>+number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<li>*number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<li>/number_of_<figcaption>_grpby_category1+category2
0,0,4,29,12,143,0.0,train,5289,961,40,...,0.05200425,6.800589,21.392277,44077850000.0,7.742554,65.3645,33284710000.0,4.209213,8.61413,26092880000.0
1,1,16,34,5,108,0.0,train,1144,202,9,...,6860526000.0,7.366999,23.450289,66419080000.0,9.688144,71.741247,55278440000.0,4.88929,16.79887,11156860000.0
2,2,21,30,7,122,0.0,train,3316,549,25,...,4850713000.0,8.706562,62.175788,56172160000.0,8.142969,42.251006,38230160000.0,6.069596,4.587076,54173290000.0
3,3,21,41,13,0,0.0,train,1670,293,11,...,7481980000.0,20.866561,277.649709,95167640000.0,18.110084,146.543704,92301750000.0,10.090725,52.483326,46190860000.0
4,4,9,29,13,33,1.0,train,7560,1211,67,...,32253560000.0,20.850752,188.135382,111137300000.0,16.169746,62.979656,139015100000.0,14.568249,102.185141,37724980000.0


In [13]:
#base_line_df.to_csv('data/feature_df.csv', index=False)

In [2]:
#base_line_df = pd.read_csv('data/feature_df.csv')

In [13]:
base_line_df.head()

Unnamed: 0,id,country,duration,category1,category2,state,data_type,number_of_chars,number_of_words,number_of_sentences,...,agg_std_number_of_<source>/number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<figure>+number_of_<li>_grpby_category1+category2,agg_std_number_of_<figure>*number_of_<li>_grpby_category1+category2,agg_std_number_of_<figure>/number_of_<li>_grpby_category1+category2,agg_std_number_of_<figure>+number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<figure>*number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<figure>/number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<li>+number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<li>*number_of_<figcaption>_grpby_category1+category2,agg_std_number_of_<li>/number_of_<figcaption>_grpby_category1+category2
0,0,4,29,12,143,0.0,train,5289,961,40,...,0.05200425,6.800589,21.392277,44077850000.0,7.742554,65.3645,33284710000.0,4.209213,8.61413,26092880000.0
1,1,16,34,5,108,0.0,train,1144,202,9,...,6860526000.0,7.366999,23.450289,66419080000.0,9.688144,71.741247,55278440000.0,4.88929,16.79887,11156860000.0
2,2,21,30,7,122,0.0,train,3316,549,25,...,4850713000.0,8.706562,62.175788,56172160000.0,8.142969,42.251006,38230160000.0,6.069596,4.587076,54173290000.0
3,3,21,41,13,0,0.0,train,1670,293,11,...,7481980000.0,20.866561,277.649709,95167640000.0,18.110084,146.543704,92301750000.0,10.090725,52.483326,46190860000.0
4,4,9,29,13,33,1.0,train,7560,1211,67,...,32253560000.0,20.850752,188.135382,111137300000.0,16.169746,62.979656,139015100000.0,14.568249,102.185141,37724980000.0


In [24]:
base_line_df.columns[846]

'agg_std_goal_min_grpby_country'

In [20]:
base_idx = base_line_df.columns[:593]
base_df = base_line_df[base_idx]
base_df
#base_df.to_csv("data/base_df.csv", index=False)

Unnamed: 0,id,country,duration,category1,category2,state,data_type,number_of_chars,number_of_words,number_of_sentences,...,number_of_punctuation*number_of_<video>,number_of_punctuation/number_of_<video>,number_of_punctuation+number_of_<iframe>,number_of_punctuation*number_of_<iframe>,number_of_punctuation/number_of_<iframe>,number_of_punctuation+number_of_<img>,number_of_punctuation*number_of_<img>,number_of_punctuation/number_of_<img>,number_of_punctuation+number_of_<a>,number_of_punctuation*number_of_<a>
0,0,4,29,12,143,0.0,train,5289,961,40,...,0,1.820000e+12,182,0,1.820000e+12,188,1092,3.033333e+01,183,182
1,1,16,34,5,108,0.0,train,1144,202,9,...,0,1.800000e+11,18,0,1.800000e+11,33,270,1.200000e+00,19,18
2,2,21,30,7,122,0.0,train,3316,549,25,...,34,3.400000e+01,34,0,3.400000e+11,39,170,6.800000e+00,39,170
3,3,21,41,13,0,0.0,train,1670,293,11,...,0,2.100000e+11,21,0,2.100000e+11,21,0,2.100000e+11,21,0
4,4,9,29,13,33,1.0,train,7560,1211,67,...,1584,1.955556e+01,177,176,1.760000e+02,243,11792,2.626866e+00,224,8448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21084,21084,21,30,7,36,,test,370,67,5,...,0,6.000000e+10,6,0,6.000000e+10,6,0,6.000000e+10,6,0
21085,21085,21,29,7,117,,test,984,184,11,...,0,1.500000e+11,15,0,1.500000e+11,15,0,1.500000e+11,15,0
21086,21086,21,27,2,98,,test,1288,238,4,...,0,9.000000e+10,12,27,3.000000e+00,17,72,1.125000e+00,9,0
21087,21087,21,30,4,56,,test,2255,382,19,...,0,4.700000e+11,47,0,4.700000e+11,58,517,4.272727e+00,54,329


In [25]:
def apply_svd(feature_df, num_features_svd):
    base_idx = feature_df.columns[:846]
    base_df = feature_df[base_idx]
    feature_idx = feature_df.columns[846:]
    
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_feature = imp.fit(feature_df[feature_idx])
    feature_df = pd.DataFrame(imp_feature.transform(feature_df[feature_idx]))
    
    transformer = TruncatedSVD(n_components=num_features_svd)
    matrix = transformer.fit_transform(feature_df)
    
    columns = [f"agg_svd_{dim}" for dim in range(num_features_svd)]
    feature_df = pd.DataFrame(matrix, columns=columns)
    ids = [n for n in range(len(feature_df))]
    feature_df["id"] = ids
    feature_df = pd.merge(base_df, feature_df, on="id", how="outer")
    #feature_df.to_csv(f"data/feature_svd{num_features_svd}.csv", index=False)
    return feature_df

In [None]:
feature_df = apply_svd(base_line_df, 16)
feature_df

In [None]:
feature_df.to_csv("data/feature_df_agg_svd16.csv", index=False)