In [1]:
import sys, os, pickle
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

import pandas as pd
from d3mds import D3MDataset, D3MProblem, D3MDS
from mlutils.preprocessing import Drift_thresholder
from mlutils.encoding import NA_encoder, Categorical_encoder
from mlutils.model.classification import Clf_feature_selector, Classifier
from mlutils.prediction import Predictor
from mlutils.preprocessing import Reader as Munger
from mlutils.optimisation import Optimiser
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

try:
    os.environ["HDF5_USE_FILE_LOCKING"]
except KeyError:
    os.environ["HDF5_USE_FILE_LOCKING"]='FALSE'

Using TensorFlow backend.


In [2]:
here = os.getcwd()

In [3]:
dspath = os.path.join(here, '..', '..', 'LL1_MITLL_synthetic_vora_E_dataset')
prpath = os.path.join(here, '..', '..', 'LL1_MITLL_synthetic_vora_E_problem')
solpath = os.path.join(here, '..')
assert os.path.exists(dspath)
assert os.path.exists(prpath)

d3mds = D3MDS(dspath, prpath)

target_name = d3mds.problem.get_targets()[0]['colName']

In [4]:
# make train and test data
X_train = d3mds.get_train_data() # read train data
X_train[target_name] = d3mds.get_train_targets().ravel() # read train targets and merge it with the train data dataFrame
X_train.to_csv('X_train.csv', index=None) # save the train dataFrame

X_test = d3mds.get_test_data() # read test data
X_test.to_csv('X_test.csv', index=None) # save the test dataFrame

In [5]:
# clean the train and test data
mgr = Munger(sep = ',')
df = mgr.train_test_split(['X_train.csv', 'X_test.csv'], target_name)


reading csv : X_train.csv ...
cleaning data ...
CPU time: 20.355665683746338 seconds

reading csv : X_test.csv ...
cleaning data ...
CPU time: 0.7608325481414795 seconds

> Number of common features : 44

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 35
> Number of numerical features: 9
> Number of training samples : 160340
> Number of test samples : 40085

> Top sparse features (% missing values on train set):
x23    0.2
x15    0.2
x39    0.2
x22    0.2
x5     0.2
dtype: float64

> Task : classification
0.0    150063
1.0     10277
Name: y, dtype: int64

encoding target ...


In [6]:
# calculate the drift and threshold data (used both train and test data)
dft = Drift_thresholder()
df = dft.fit_transform(df)


computing drifts ...
CPU time: 7.827710866928101 seconds

> Top 10 drifts

('x3', 0.01006988709718315)
('x34', 0.00743509361873973)
('x15', 0.005878717834974179)
('x35', 0.005671431762328094)
('x23', 0.005572845868457188)
('x1', 0.005432281641961634)
('x5', 0.005072421476614908)
('x13', 0.004419132868634934)
('x0', 0.0035563743469744225)
('x26', 0.0033716634978615545)

> Deleted variables : []
> Drift coefficients dumped into directory : save


In [7]:
space = {
        'ne__numerical_strategy':{"search":"choice","space":[0]},
        'ce__strategy':{"search":"choice","space":["random_projection"]}, 
        'fs__strategy':{"search":"choice","space":['rf_feature_importance']}
        }

In [8]:
opt = Optimiser(scoring = 'accuracy', n_folds = 3)
opt.evaluate(space, df)


##################################################### testing hyper-parameters... #####################################################

>>> NA ENCODER :{'numerical_strategy': {'search': 'choice', 'space': [0]}, 'categorical_strategy': '<NULL>'}

>>> CA ENCODER :{'strategy': {'search': 'choice', 'space': ['random_projection']}}

>>> FEATURE SELECTOR :{'strategy': {'search': 'choice', 'space': ['rf_feature_importance']}, 'threshold': 0.3}

>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 0.9, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'nthread': -1, 'seed': 0}


MEAN SCORE : accuracy = nan
VARIANCE : nan (fold 1 = nan, 

nan

In [9]:
best = opt.optimise(space, df, 5)

##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}
>>> CA ENCODER :{'strategy': 'random_projection'}  
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.3}
>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 0.9, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'nthread': -1, 'seed': 0}
MEAN SCORE : accuracy = 0.9646875399493524         
VARIANCE : 0.0004047736233894222 (fold 1 = 0.9650869085262035, fold 2 = 0.9641326921997493, fold 3 = 0.9648430

In [None]:
prd = Predictor()
prd.fit_predict(best, df)

In [11]:
data = df

In [12]:
# process numerical columns in train data
na_encoder = NA_encoder(numerical_strategy=0)
data['train'] = na_encoder.fit_transform(data['train'])

In [13]:
# process/encode categorical columns in trian data
ce_encoder = Categorical_encoder(strategy='random_projection')
data['train'] = ce_encoder.fit_transform(data['train'], data['target'])

In [14]:
# do feature selection in train data
fs_selector = Clf_feature_selector(strategy='rf_feature_importance', threshold=0.25)
data['train'] = fs_selector.fit_transform(data['train'], data['target'])

In [15]:
# fit an estimator
estimator = Classifier(strategy= "LightGBM", max_depth=10)
estimator.fit(data['train'], data['target'])

<mlutils.model.classification.classifier.Classifier at 0x14c5a78755f8>

In [16]:
# perform above processing steps on test data
data['test'] = na_encoder.transform(data['test'])
data['test'] = ce_encoder.transform(data['test'])
data['test'] = fs_selector.transform(data['test'])

In [17]:
y_pred = estimator.predict(data['test']) # make predictions

In [18]:
# decode the predictions
fhand = open(os.path.join('save', "target_encoder.obj"), 'rb')
enc = pickle.load(fhand)
fhand.close()
y_pred = enc.inverse_transform(y_pred)
y_pred_df = pd.DataFrame(index=X_test.index, data=y_pred, columns=[target_name])
y_pred_df.to_csv(os.path.join('.','predictions.csv'))

In [21]:
# read the truth target values in test data
y_test = d3mds.get_test_targets().ravel()

In [22]:
print(classification_report(y_test, y_pred)) # classification report

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     37461
           1       0.93      0.55      0.69      2624

    accuracy                           0.97     40085
   macro avg       0.95      0.78      0.84     40085
weighted avg       0.97      0.97      0.96     40085



In [23]:
# compute the f1 score
f_score = f1_score(y_test, y_pred, pos_label=1)
scoresdf = pd.DataFrame(columns=['metric','value','randomState'])
scoresdf.loc[len(scoresdf)]=['f1', f_score, 'N/A']
scoresdf.to_csv(os.path.join('.','scores.csv'))