In [5]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
import pickle

In [104]:
features_df = pd.DataFrame(pd.read_csv('../Features/features.csv', sep="\t"))

In [169]:
data_train_df = pd.read_csv("data_train.csv")

In [106]:
class DataPreprocessor():
    def __init__(self, data, features):
        self.data = data
        self.features = features

    def fit(self):
        return self


    def transform(self):
        merged = pd.merge(self.data, self.features, how='left', on=['id',])
        merged['time_delta'] = abs(merged['buy_time_x'] - merged['buy_time_y'])
        merged.sort_values(['Unnamed: 0_x', 'time_delta'], ascending=True, inplace=True)
        merged.drop_duplicates(subset=['Unnamed: 0_x'], inplace=True)

        return merged

    

In [133]:
class FeatureGenerator():
    def __init__(self) -> None:
        pass

    
    def fit(self, df):
        self.median_time = df.loc[df['target']==1].groupby(['vas_id'])['buy_time_x'].agg('median').to_dict()
        self.vas_month = df.loc[df['target']==1].groupby(['vas_id'])['buy_time_x'].agg(lambda x:x.value_counts().index[0])
        self.vas_month = pd.to_datetime(self.vas_month, unit='s').dt.month.to_dict()
        return self
    
    def transform(self, df):
        df['vas_time'] = df['vas_id'].map(self.median_time)
        df.drop(['Unnamed: 0_x', 'Unnamed: 0_y', 'id',], axis=1, inplace=True)
        df['buy_month'] = pd.to_datetime(df['buy_time_x'], unit='s').dt.month
        df = pd.get_dummies(df, columns=['vas_id'])
        df['buy_month_delta'] = df['buy_month'] - pd.to_datetime(df['vas_time']).dt.month
        df.drop(['buy_month',], axis=1, inplace=True)

        return df



In [134]:
data_preprocessor = DataPreprocessor(data_train_df, features_df)
feature_generator = FeatureGenerator()


In [135]:
merged_df = data_preprocessor.transform()
feature_generator.fit(merged_df)
merged_df = feature_generator.transform(merged_df)

In [137]:
X_train, X_test, y_train, y_test = train_test_split(merged_df.drop(['target'], axis=1), merged_df['target'], random_state=50, stratify=merged_df['target'], test_size=0.33)

In [138]:
tree = DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_split=2800, random_state=5, class_weight='balanced',)
tree.fit(X_train, y_train)
preds = tree.predict(X_test)

In [139]:
f1_score(y_test, preds, average='macro')

0.7113839745141134

In [140]:
roc_auc_score(y_test, preds)

0.8738254609149038

In [159]:
pickle.dump(tree, open('model.pkl', 'wb'))
pickle.dump(feature_generator, open('feature_gen.pkl', 'wb'))