In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import GroupShuffleSplit
from catboost import CatBoostClassifier

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

Length = 32

In [2]:
dial_train = pd.read_parquet("./Hackathon/dial_train.parquet")
dial_test = pd.read_parquet("./Hackathon/dial_test.parquet")
train_target = pd.read_parquet("./Hackathon/train_target.parquet")
test_target = pd.read_parquet("./Hackathon/test_target_b.parquet")

In [3]:
dial_train.embedding = dial_train.embedding.apply(lambda x: x[:Length])
dial_test.embedding = dial_test.embedding.apply(lambda x: x[:Length])

In [4]:
dial_train['month_year'] = dial_train['event_time'].dt.to_period('M')
dial_train_ = dial_train.groupby(['client_id',
                                  'month_year'])['embedding'].apply(lambda x: np.mean(x.tolist(),
                                                                                      axis=0)).reset_index()
dial_train_['month_year']= dial_train_['month_year'].astype('str')
train_target['month_year'] = train_target['mon'].apply(lambda x: x[:-3])
train_data = pd.merge(train_target, dial_train_, on=['client_id',
                                                     'month_year'], how='left')
train_data.drop(columns=['month_year'], inplace=True)

In [5]:
dial_test['month_year'] = dial_test['event_time'].dt.to_period('M')
dial_test_ = dial_test.groupby(['client_id',
                                  'month_year'])['embedding'].apply(lambda x: np.mean(x.tolist(),
                                                                                      axis=0)).reset_index()
dial_test_['month_year']= dial_test_['month_year'].astype('str')
test_target['month_year'] = test_target['mon'].apply(lambda x: x[:-3])
test_data = pd.merge(test_target, dial_test_, on=['client_id',
                                                     'month_year'], how='left')
test_data.drop(columns=['month_year'], inplace=True)

In [6]:
test_data = test_data.fillna('nan')
test_data.embedding = test_data.embedding.apply(lambda x: x 
                                                if (type(x) != str) 
                                                else np.zeros((Length,)))
test_data.embedding = test_data.embedding.apply(lambda x: x.astype(float))

In [7]:
splitter = GroupShuffleSplit(test_size=.3, n_splits=2, random_state = 7)
split = splitter.split(train_data, groups=train_data['client_id'])
train_inds, test_inds = next(split)

train = train_data.iloc[train_inds]
test = train_data.iloc[test_inds]

In [8]:
train['embedding'] = train['embedding'].apply(lambda x: 'nan'
                                              if (type(x)==float and random.random() > 0.9)
                                              else x)
train = train.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['embedding'] = train['embedding'].apply(lambda x: 'nan' if (type(x)==float and random.random() > 0.9) else x)


In [9]:
X_train = train.drop(columns=['target_1', 'target_2', 'target_3', 'target_4',
                              'mon', 'client_id'])
y_train = train[['target_1', 'target_2', 'target_3', 'target_4']]

X_test = test.drop(columns=['target_1', 'target_2', 'target_3', 'target_4',
                            'mon', 'client_id'])
y_test = test[['target_1', 'target_2', 'target_3', 'target_4']]

In [10]:
clf = CatBoostClassifier(verbose=100,
                     loss_function='MultiLogloss',
                    task_type='GPU',
                    embedding_features=['embedding']
                    )

In [11]:
X_train.embedding = X_train.embedding.apply(lambda x: x 
                                            if (type(x) != str) 
                                            else np.zeros((Length,)))
X_train.embedding = X_train.embedding.apply(lambda x: x.astype(float))

In [12]:
clf.fit(X_train, y_train)

Learning rate set to 0.022641


KeyboardInterrupt: 

In [None]:
X_test = X_test.fillna('nan')
X_test.embedding = X_test.embedding.apply(lambda x: x 
                                          if (type(x) != str) 
                                          else np.zeros((Length,)))
X_test.embedding = X_test.embedding.apply(lambda x: x.astype(float))

In [None]:
preds = clf.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, preds)
roc_auc

In [35]:
test_data.embedding = test_data.embedding.apply(lambda x: x.astype(float))
test_preds = clf.predict_proba(pd.DataFrame(test_data.embedding))
test_data[['target_1', 'target_2', 'target_3', 'target_4']] = test_preds
test_data = test_data.drop(columns=['embedding'])
test_data = test_data.groupby('client_id')[['target_1', 'target_2', 'target_3', 'target_4']].mean().reset_index()

In [52]:
submit_df = pd.DataFrame({"client_id": test_target["client_id"].unique()}).merge(test_data, how="left").fillna(0)
submit_df.to_csv('df_dial.csv')