In [13]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
import pickle
from data_processors import DataPreprocessor, FeatureGenerator


In [14]:
features_df = pd.DataFrame(pd.read_csv('../Features/features.csv', sep="\t"))

In [15]:
data_train_df = pd.read_csv("data_train.csv")

In [16]:
data_preprocessor = DataPreprocessor(data_train_df, features_df)
feature_generator = FeatureGenerator()


In [17]:
merged_df = data_preprocessor.transform()
feature_generator.fit(merged_df)
merged_df = feature_generator.transform(merged_df)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(merged_df.drop(['target'], axis=1), merged_df['target'], random_state=50, stratify=merged_df['target'], test_size=0.33)

In [19]:
tree = DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_split=2800, random_state=5, class_weight='balanced',)
tree.fit(X_train, y_train)
preds = tree.predict(X_test)

In [20]:
f1_score(y_test, preds, average='macro')

0.7113839745141134

In [21]:
roc_auc_score(y_test, preds)

0.8738254609149038

In [26]:
pickle.dump(tree, open('model.pkl', 'wb'))
pickle.dump(feature_generator.__dict__, open('feature_gen.pkl', 'wb'))