## Binary structure classification used in tree building: Step 2. Feature-rich approach

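Train feature-based classifiers (logistic regression and a linear SVM), evaluate them, and save the linear SVM together with the fitted scaler and the list of dropped columns.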

Output:
 - ``models/structure_predictor_baseline/*``

In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
import pandas as pd
import pickle
from utils.file_reading import read_edus, read_gold, read_negative, read_annotation
from utils.prepare_sequence import _prepare_sequence
from tqdm import tqdm_notebook as tqdm

# Seed passed to every estimator trained in this notebook so that runs are repeatable.
random_state = 45

### Make a directory

In [None]:
import os

# Directory that receives the pickled model, scaler and dropped-column list.
model_path = 'models/structure_predictor_baseline'

# `exist_ok=True` keeps this cell re-runnable, and `makedirs` also creates the
# missing parent directory; a bare shell `mkdir` fails in both situations and
# depends on the platform shell.
os.makedirs(model_path, exist_ok=True)

### Prepare train/dev/test sets

In [None]:
IN_PATH = 'data_structure'

# Read the three pickled sample sets (train / dev / test) from IN_PATH in one pass.
train_samples, dev_samples, test_samples = (
    pd.read_pickle(os.path.join(IN_PATH, pickle_name))
    for pickle_name in ('train_samples.pkl', 'dev_samples.pkl', 'test_samples.pkl')
)

In [None]:
# Columns removed from every sample frame before training.
# This list is also pickled at the end of the notebook.
drop_columns = ['snippet_x', 'snippet_y', 'category_id',
                'snippet_x_tmp', 'snippet_y_tmp',
                'filename', 'order', 'postags_x', 'postags_y',
                'is_broken', 'tokens_x', 'tokens_y', 'level_0']


def _split_features_target(samples, target='relation', exclude=()):
    """Split a sample frame into ``(y, X)``.

    Args:
        samples: DataFrame holding the ``target`` column and the feature columns.
        target: name of the label column.
        exclude: additional column names to remove from the features.

    Returns:
        ``(y, X)`` where ``y`` is a one-column DataFrame with the target and
        ``X`` is ``samples`` without the target and the excluded columns.

    Raises:
        KeyError: if ``target`` or any name in ``exclude`` is not a column.
    """
    y = samples[target].to_frame()
    X = samples.drop(columns=[target, *exclude])
    return y, X


# 'category_id' is already part of `drop_columns`, so it is not appended again.
y_train, X_train = _split_features_target(train_samples, exclude=drop_columns)
y_dev, X_dev = _split_features_target(dev_samples, exclude=drop_columns)
y_test, X_test = _split_features_target(test_samples, exclude=drop_columns)

In [None]:
# Columns holding a single distinct value in the training set carry no signal.
# `nunique` is vectorised, and `dropna=False` counts missing values as one
# value; a set-based count can over-count NaNs because NaN never compares
# equal to itself.
_distinct_counts = X_train.nunique(dropna=False)
constants = _distinct_counts[_distinct_counts == 1].index.tolist()

In [None]:
# Remove the constant (training-set) columns from all three feature frames.
X_train, X_dev, X_test = (
    features.drop(columns=constants)
    for features in (X_train, X_dev, X_test)
)

### Classifier training

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# NOTE: despite its name, `std_scaler` is a MinMaxScaler. It is fitted on the
# training features only and then applied to every split.
std_scaler = MinMaxScaler().fit(X_train.values)


def _rescale(frame):
    """Return ``frame`` transformed by ``std_scaler``, keeping its index and column labels."""
    scaled_values = std_scaler.transform(frame.values)
    return pd.DataFrame(scaled_values, index=frame.index, columns=frame.columns)


X_train, X_dev, X_test = map(_rescale, (X_train, X_dev, X_test))

In [None]:
# Preview the first rows of the scaled training features.
X_train.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# Class-weighted logistic regression.
model = LogisticRegression(solver='lbfgs', C=0.0005, n_jobs=4, class_weight='balanced', random_state=random_state)

# `y_train` is a one-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for column vectors, so flatten it first.
model.fit(X_train, y_train.values.ravel())

In [None]:
from sklearn import metrics

# Score the current `model` on the test split: precision, recall, F1, then the full report.
predicted = model.predict(X_test)
for tag, scorer in (('pr:', metrics.precision_score),
                    ('re:', metrics.recall_score),
                    ('f1:', metrics.f1_score)):
    print(tag, scorer(y_test, predicted))
print()
print(metrics.classification_report(y_test, predicted))

In [None]:
from sklearn.svm import LinearSVC

# Class-weighted linear SVM; this is the estimator that gets pickled at the end of the notebook.
svc = LinearSVC(random_state=random_state, C=0.01, class_weight='balanced')

# `y_train` is a one-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for column vectors, so flatten it first.
svc.fit(X_train, y_train.values.ravel())

In [None]:
from sklearn import metrics

# From here on `model` refers to the linear SVM rather than the logistic regression.
model = svc

# Dev-set scores as percentages (F1, precision, recall), followed by the full report.
predicted = model.predict(X_dev)
for template, scorer in (('f1: %.2f', metrics.f1_score),
                         ('pr: %.2f', metrics.precision_score),
                         ('re: %.2f', metrics.recall_score)):
    print(template % (scorer(y_dev, predicted) * 100.))
print()
print(metrics.classification_report(y_dev, predicted, digits=4))

In [None]:
# Test-set scores for the current `model` as percentages, followed by the full report.
predicted = model.predict(X_test)
for template, scorer in (('f1: %.2f', metrics.f1_score),
                         ('pr: %.2f', metrics.precision_score),
                         ('re: %.2f', metrics.recall_score)):
    print(template % (scorer(y_test, predicted) * 100.))
print()
print(metrics.classification_report(y_test, predicted, digits=4))

In [None]:
# Attach string class labels to the estimator; as an instance attribute it is
# pickled together with the model.
svc.labels = ["0", "1"]

# Make sure the target directory exists even if the directory cell was skipped.
os.makedirs(model_path, exist_ok=True)

# File name -> object to persist: the classifier, the fitted scaler, and the
# full list of columns removed from the raw samples (constant + dropped).
artifacts = {
    'model.pkl': svc,
    'scaler.pkl': std_scaler,
    'drop_columns.pkl': constants + drop_columns,
}

for artifact_name, artifact in artifacts.items():
    # `with` closes (and therefore flushes) each file deterministically instead
    # of leaving the handle to the garbage collector.
    with open(os.path.join(model_path, artifact_name), 'wb') as out_file:
        pickle.dump(artifact, out_file)