# Two Parts

1. Run each trained model on the public test data to get its test score (a rough indication of its accuracy).
2. Combine the models' predictions with an ensemble (e.g. a logistic regression, LR), using the public test data as the ensemble's own train/validation set.

In [1]:
import configparser
import os
import sys
from os import path

# Defaults used when no credentials file is found, so that every name below is
# always bound (previously WANDB_enable and ENV only existed if a file was read).
PATH_ROOT = ""
PATH_DATA = ""
WANDB_enable = False
ENV = ""

# Candidate locations of credentials.ini, relative to the current working directory.
# Every existing file is read in order, so a later entry overrides an earlier one.
creds_path_ar = ["../../credentials.ini", "credentials.ini"]

for creds_path in creds_path_ar:
    if path.exists(creds_path):
        config_parser = configparser.ConfigParser()
        config_parser.read(creds_path)
        main_section = config_parser['MAIN']
        PATH_ROOT = main_section["PATH_ROOT"]
        PATH_DATA = main_section["PATH_DATA"]
        # The flag is enabled only by the exact string 'TRUE'.
        WANDB_enable = main_section["WANDB_ENABLE"] == 'TRUE'
        ENV = main_section["ENV"]

# Put PATH_ROOT first on sys.path to avoid "No module named src.*" errors.
# With the default PATH_ROOT of "" this inserts "", i.e. the current directory.
sys.path.insert(0, os.path.join(PATH_ROOT))

In [2]:
import argparse
import random
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
from sklearn.metrics import roc_curve, auc
from src.data import get_data
from src.data.iterable_dataset import Config, DataDict, StreamingDataset, iq_to_spectogram, \
    normalize
from src.models import arch_setup, tcn_model3
from src.data import get_data
from src.visualization import metrics
from src.features import specto_feat
import wandb
import logging


In [3]:
%cd {PATH_ROOT}

/home/ubuntu/sota-mafat-radar


In [4]:
# wandb.init()

In [5]:
# model_paths = ['sota-mafat/sota-mafat-base/1epmi6lf','sota-mafat/sota-mafat-base/3s0bv1dr']
# Dataset names handed to load_testset(), which forwards them to
# get_data.load_data(name, PATH_DATA).
# test_path is the labelled set used for scoring; final_test_path is the set the
# submission is generated for.
test_path = 'MAFAT RADAR Challenge - FULL Public Test Set V1'
final_test_path = 'MAFAT RADAR Challenge - Private Test Set V1'


### PART 1

In [11]:
def load_model(model_path: str):
    '''
    Download a model checkpoint from Weights & Biases and load it with torch.

    Args:
        model_path: wandb run path, passed to ``wandb.restore`` as ``run_path``
            (e.g. 'sota-mafat/sota-mafat-base/2v6bs8kw').

    Returns:
        The object deserialised by ``torch.load`` from the run's
        'data/models/model.pth' file.

    Raises:
        FileNotFoundError: if wandb cannot find the checkpoint in the given run.
    '''
    wandb.init()
    restored = wandb.restore('data/models/model.pth', run_path=model_path)
    if restored is None:
        raise FileNotFoundError(
            f"'data/models/model.pth' could not be restored from wandb run '{model_path}'"
        )
    # Use the location wandb actually wrote to instead of assuming
    # '<PATH_ROOT>/wandb/latest-run/...', which depends on the cwd and on which run is latest.
    checkpoint_path = restored.name
    restored.close()
    # Fall back to CPU when no GPU is present so GPU-saved checkpoints still load;
    # on GPU machines the default placement is unchanged.
    map_location = None if torch.cuda.is_available() else 'cpu'
    return torch.load(checkpoint_path, map_location=map_location)


def load_testset(test_path: str):
    '''
    Read a test set and return it as a DataFrame.

    The mapping returned by ``get_data.load_data(test_path, PATH_DATA)`` is turned
    into a frame with its keys as rows and then transposed, so each key ends up
    as a column.
    '''
    raw = get_data.load_data(test_path, PATH_DATA)
    keys_as_rows = pd.DataFrame.from_dict(raw, orient='index')
    return keys_as_rows.transpose()


def run_predictions(model, test_df, final_submission = False):
    '''
    Run ``model`` on every segment of ``test_df`` and collect the predictions.

    Each 'iq_sweep_burst' entry is converted with ``iq_to_spectogram``, passed
    through ``specto_feat.max_value_on_doppler`` together with the row's
    'doppler_burst', and then through ``normalize``. The results are stacked into a
    float32 tensor with a singleton dimension inserted at position 1.

    Args:
        model: callable model (as returned by ``torch.load``).
        test_df: DataFrame with 'iq_sweep_burst', 'doppler_burst' and 'segment_id'
            columns, plus 'target_type' when ``final_submission`` is False.
        final_submission: when True, no 'label' column is added to the result.

    Returns:
        DataFrame with 'segment_id', 'prediction' and (optionally) 'label'.
    '''
    # Work on a copy so the caller's frame does not gain a scratch 'output_array' column.
    features_df = test_df.copy()
    # NOTE(review): progress_apply needs tqdm.pandas() to have been registered;
    # presumably one of the src imports does this - confirm.
    features_df['output_array'] = features_df['iq_sweep_burst'].progress_apply(iq_to_spectogram)
    features_df['output_array'] = features_df.progress_apply(lambda row: specto_feat.max_value_on_doppler(row['output_array'], row['doppler_burst']), axis=1)
    features_df['output_array'] = features_df['output_array'].progress_apply(normalize)
    test_x = torch.from_numpy(np.stack(features_df['output_array'].tolist(), axis=0).astype(np.float32)).unsqueeze(1)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu:0')

    if isinstance(model, torch.nn.Module):
        # The input is sent to `device`, so the model must live there too.
        model = model.to(device)
        # Inference mode: dropout off, batch-norm uses running statistics rather
        # than statistics of the test batch.
        model.eval()

    # No gradients are needed for prediction; this also avoids building the autograd graph.
    with torch.no_grad():
        model_output = model(test_x.to(device)).detach().cpu().numpy()

    # Creating DataFrame with the probability prediction for each segment
    submission = pd.DataFrame()
    submission['segment_id'] = test_df['segment_id']
    submission['prediction'] = model_output
    if not final_submission:
        submission['label'] = test_df['target_type']
    return submission

def check_model_auc(local_model_path: str, test_path: str):
    '''
    Score one locally stored model on a labelled test set.

    1. Load the model (using load_local_model())
    2. Load the test data (using load_testset()) and encode 'animal'/'human' as 0/1
    3. Run the predictions and return metrics.model_scores(labels, predictions)

    Args:
        local_model_path: run-directory name under ``<PATH_ROOT>/wandb``.
        test_path: dataset name understood by load_testset().

    Returns:
        Whatever ``metrics.model_scores`` returns for the labels and predictions
        (presumably the ROC AUC, given this function's name - confirm).
    '''
    model = load_local_model(local_model_path)
    test_df = load_testset(test_path)
    # Assign the result back instead of calling replace(inplace=True) on the selected
    # column: under pandas copy-on-write the chained in-place form leaves test_df untouched.
    # astype(int) keeps the labels numeric even when replace() does not downcast.
    test_df['target_type'] = test_df['target_type'].replace({'animal': 0, 'human': 1}).astype(int)
    predictions = run_predictions(model, test_df)
    return metrics.model_scores(predictions['label'], predictions['prediction'])

def load_local_model(local_model_path: str):
    '''
    Load a model checkpoint from a local wandb run directory.

    Args:
        local_model_path: run-directory name under ``<PATH_ROOT>/wandb``
            (e.g. 'run-20201015_062929-j2ac4ecg').

    Returns:
        The object deserialised by ``torch.load`` from
        ``<PATH_ROOT>/wandb/<local_model_path>/files/data/models/model.pth``.
    '''
    checkpoint_path = path.join(PATH_ROOT, 'wandb', local_model_path, 'files/data/models/model.pth')
    # Fall back to CPU when no GPU is present so GPU-saved checkpoints still load;
    # on GPU machines the default placement is unchanged.
    map_location = None if torch.cuda.is_available() else 'cpu'
    return torch.load(checkpoint_path, map_location=map_location)


In [7]:
# wandb run paths, in the form load_model() passes to wandb.restore as run_path.
model_path = 'sota-mafat/sota-mafat-base/2v6bs8kw'
model_path2 = 'sota-mafat/sota-mafat-base/3s0bv1dr'
model_path3 = 'sota-mafat/sota-mafat-tcn/2j8o99e4'
# NOTE(review): model_path4 is disabled; any cell referencing it raises NameError on a fresh kernel.
# model_path4 = 'sota-mafat/sota-mafat-base/1wolsedh'

# Run-directory names under <PATH_ROOT>/wandb, as consumed by load_local_model().
local_model_paths = ['run-20201015_062929-j2ac4ecg','run-20201015_063004-3mp7hqo7','run-20201015_063022-3fscuvx1']
# model = load_model(model_path)
# test_path = 'MAFAT RADAR Challenge - FULL Public Test Set V1'
# test_dict = load_testset(test_path)
# predics = run_predicions(model, test_path)

In [13]:
# model_path4 is not defined (its assignment is commented out), and check_model_auc()
# expects a local run-directory name rather than a wandb run path, so score a locally
# stored run instead. The score recorded below equals the one recorded for local_model_paths[1].
check_model_auc(local_model_paths[1], test_path)



VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

100%|██████████| 284/284 [00:00<00:00, 2095.43it/s]
100%|██████████| 284/284 [00:00<00:00, 3294.82it/s]
100%|██████████| 284/284 [00:00<00:00, 13395.06it/s]


0.7667262259281319

In [10]:
# Score the first locally stored run on the public test set.
check_model_auc(local_model_paths[0], test_path)

run-20201015_062929-j2ac4ecg
100%|██████████| 284/284 [00:00<00:00, 2111.74it/s]
100%|██████████| 284/284 [00:00<00:00, 3088.48it/s]
100%|██████████| 284/284 [00:00<00:00, 12550.92it/s]


0.779084772682152

In [12]:
# Score the second locally stored run on the public test set.
check_model_auc(local_model_paths[1], test_path)

100%|██████████| 284/284 [00:00<00:00, 2092.41it/s]
100%|██████████| 284/284 [00:00<00:00, 3209.40it/s]
100%|██████████| 284/284 [00:00<00:00, 12739.51it/s]


0.7667262259281319

In [13]:
# Score the third locally stored run on the public test set.
check_model_auc(local_model_paths[2], test_path)

100%|██████████| 284/284 [00:00<00:00, 2117.02it/s]
100%|██████████| 284/284 [00:00<00:00, 2914.48it/s]
100%|██████████| 284/284 [00:00<00:00, 12318.33it/s]


0.7787373436569386

### PART 2

In [27]:
from sklearn.linear_model import LogisticRegression as LogR
from sklearn.model_selection import train_test_split
import math

def sigmoid(x):
    '''
    Logistic function 1 / (1 + e^-x) for a scalar ``x``.

    Evaluated in a numerically stable way: the exponent passed to ``math.exp`` is
    never positive, so large-magnitude negative inputs return 0.0 instead of
    raising OverflowError.
    '''
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For x < 0 use the algebraically equivalent form e^x / (1 + e^x).
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [15]:
def basic_mean(model_paths: list, test_path, final_submission = False):
    '''
    Score the unweighted average of several models' predictions.

    Args:
        model_paths: local run-directory names accepted by load_local_model().
        test_path: name of a labelled dataset accepted by load_testset().
        final_submission: forwarded to run_predictions() and echoed back in the result.

    Returns:
        Tuple ``(score, final_submission)`` where ``score`` is
        ``metrics.model_scores(labels, mean_prediction)``.
    '''
    test_df = load_testset(test_path)
    # Assign the result back instead of calling replace(inplace=True) on the selected
    # column: under pandas copy-on-write the chained in-place form leaves test_df untouched.
    # astype(int) keeps the labels numeric even when replace() does not downcast.
    test_df['target_type'] = test_df['target_type'].replace({'animal': 0, 'human': 1}).astype(int)
    labels = test_df['target_type']

    preds = []
    for model_path in model_paths:
        model = load_local_model(model_path)
        pred = run_predictions(model, test_df, final_submission)
        # Flag models whose outputs go below zero.
        if pred['prediction'].min() < 0:
            print(model_path)
        preds.append(pred['prediction'])

    mean_prediction = pd.concat(preds, axis=1).mean(axis=1)
    return metrics.model_scores(labels, mean_prediction), final_submission
    

In [16]:
def weighted_mean(model_paths: list, test_path, final_submission = False):
    '''
    Score a score-weighted average of several models' predictions.

    Each model is scored on its own with ``metrics.model_scores``; the scores are
    normalised to sum to 1 and used as weights for the per-model predictions.

    Args:
        model_paths: local run-directory names accepted by load_local_model().
        test_path: name of a labelled dataset accepted by load_testset().
        final_submission: forwarded to run_predictions().

    Returns:
        Tuple ``(score, weights)``: the score of the weighted prediction and the
        normalised per-model weights as a numpy array, in ``model_paths`` order.
    '''
    test_df = load_testset(test_path)
    # Encode the labels before reading them, and assign the result back instead of
    # calling replace(inplace=True) on the selected column: under pandas copy-on-write
    # the chained in-place form leaves test_df (and the labels) as strings.
    # astype(int) keeps the labels numeric even when replace() does not downcast.
    test_df['target_type'] = test_df['target_type'].replace({'animal': 0, 'human': 1}).astype(int)
    labels = test_df['target_type']

    preds = []
    scores = []
    for model_path in model_paths:
        model = load_local_model(model_path)
        pred = run_predictions(model, test_df, final_submission)
        preds.append(pred['prediction'])
        # Flag models whose outputs go below zero.
        if pred['prediction'].min() < 0:
            print(model_path)
        scores.append(metrics.model_scores(labels, pred['prediction']))

    df = pd.concat(preds, axis=1)
    weights = np.array(scores)
    weights = weights / np.sum(weights)
    # The weights already sum to 1, so the weighted average is the row-wise sum;
    # taking the mean here would shrink every prediction by 1/len(model_paths).
    weighted_prediction = (weights*df).sum(axis=1)
    return metrics.model_scores(labels, weighted_prediction), weights
    

In [64]:
def lr_model(model_paths: list, test_path, final_submission=False):
    '''
    Stack several models with a logistic regression and score it on a hold-out split.

    The per-model predictions on ``test_path`` become the features (one column per
    model, named 0..n-1). 80% of the rows train the logistic regression and the
    remaining 20% are used for scoring (fixed ``random_state=43``).

    Args:
        model_paths: local run-directory names accepted by load_local_model().
        test_path: name of a labelled dataset accepted by load_testset().
        final_submission: forwarded to run_predictions().

    Returns:
        Tuple ``(score, clf)``: ``metrics.model_scores`` on the hold-out rows and
        the fitted LogisticRegression.
    '''
    test_df = load_testset(test_path)
    # Assign the result back instead of calling replace(inplace=True) on the selected
    # column: under pandas copy-on-write the chained in-place form leaves test_df untouched.
    # astype(int) keeps the labels numeric even when replace() does not downcast.
    test_df['target_type'] = test_df['target_type'].replace({'animal': 0, 'human': 1}).astype(int)
    labels = test_df['target_type']

    preds = []
    for model_path in model_paths:
        model = load_local_model(model_path)
        pred = run_predictions(model, test_df, final_submission)
        # Flag models whose outputs go below zero.
        if pred['prediction'].min() < 0:
            print(model_path)
        preds.append(pred['prediction'])

    df = pd.concat(preds, axis=1)
    # Every concatenated column is called 'prediction'; give them distinct names.
    df.columns = range(len(model_paths))

    X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.2, random_state=43)
    clf = LogR().fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the second entry of clf.classes_.
    y_pred = clf.predict_proba(X_test)[:,1]
    print(clf.classes_)
    return metrics.model_scores(y_test, y_pred), clf


In [65]:
# Fit and score the logistic-regression stack on the public test set.
lr_model(local_model_paths, test_path)

100%|██████████| 284/284 [00:00<00:00, 2108.75it/s]
100%|██████████| 284/284 [00:00<00:00, 3085.88it/s]
100%|██████████| 284/284 [00:00<00:00, 13027.21it/s]
100%|██████████| 284/284 [00:00<00:00, 2127.70it/s]
100%|██████████| 284/284 [00:00<00:00, 3100.02it/s]
100%|██████████| 284/284 [00:00<00:00, 12943.84it/s]
100%|██████████| 284/284 [00:00<00:00, 2120.62it/s]
100%|██████████| 284/284 [00:00<00:00, 3272.77it/s]
100%|██████████| 284/284 [00:00<00:00, 12854.30it/s][0 1]



(0.8337438423645321, LogisticRegression())

In [69]:
# Score the score-weighted ensemble; the second element of the result is the per-model weights.
weighted_mean(local_model_paths, test_path)

100%|██████████| 284/284 [00:00<00:00, 2182.93it/s]
100%|██████████| 284/284 [00:00<00:00, 2963.80it/s]
100%|██████████| 284/284 [00:00<00:00, 14366.48it/s]
100%|██████████| 284/284 [00:00<00:00, 2181.96it/s]
100%|██████████| 284/284 [00:00<00:00, 2986.30it/s]
100%|██████████| 284/284 [00:00<00:00, 14526.26it/s]
100%|██████████| 284/284 [00:00<00:00, 2170.04it/s]
100%|██████████| 284/284 [00:00<00:00, 3290.76it/s]
100%|██████████| 284/284 [00:00<00:00, 14084.00it/s]


(0.8070776255707762, array([0.33515533, 0.3298388 , 0.33500587]))

In [70]:
# NOTE(review): the output recorded below this cell is a 3-tuple (score, coefficients,
# intercept), which does not match the (score, clf) returned by the lr_model defined
# in this notebook - it looks like it came from an earlier version; re-run to refresh.
lr_model(local_model_paths, test_path)

100%|██████████| 284/284 [00:00<00:00, 2149.91it/s]
100%|██████████| 284/284 [00:00<00:00, 3222.70it/s]
100%|██████████| 284/284 [00:00<00:00, 13827.48it/s]
100%|██████████| 284/284 [00:00<00:00, 2139.61it/s]
100%|██████████| 284/284 [00:00<00:00, 3185.79it/s]
100%|██████████| 284/284 [00:00<00:00, 13700.88it/s]
100%|██████████| 284/284 [00:00<00:00, 2141.51it/s]
100%|██████████| 284/284 [00:00<00:00, 3099.82it/s]
100%|██████████| 284/284 [00:00<00:00, 13586.18it/s]


(0.7881773399014779,
 array([[1.71572887, 0.55828838, 0.45198532]]),
 array([-1.13894214]))

In [66]:
def run_ensemble(model_paths, old_test_path, final_test_path, ensemble_method, final_submission= True):
    '''
    Build a submission for ``final_test_path`` from an ensemble of models.

    Args:
        model_paths: local run-directory names accepted by load_local_model().
        old_test_path: labelled dataset used to fit the ensemble (weights or
            logistic regression).
        final_test_path: dataset to predict on.
        ensemble_method: "weighted_mean" or "lr_model"; any other value
            (including "basic_mean") uses the plain mean of the model predictions.
        final_submission: forwarded to run_predictions() and weighted_mean().

    Returns:
        DataFrame with 'segment_id' and 'prediction' columns.
    '''
    test_df = load_testset(final_test_path)

    preds = []
    for model_path in model_paths:
        model = load_local_model(model_path)
        pred = run_predictions(model, test_df, final_submission)
        preds.append(pred['prediction'])

    df = pd.concat(preds, axis=1)

    if ensemble_method == "weighted_mean":
        _, scores = weighted_mean(model_paths, old_test_path, final_submission)
        prediction = (scores*df).sum(axis=1)
    elif ensemble_method == "lr_model":
        _, clf = lr_model(model_paths, old_test_path, True)
        # Pass plain values: the columns here are all named 'prediction', whereas the
        # classifier was fitted on columns 0..n-1, so only the column order matters.
        prediction = clf.predict_proba(df.to_numpy())[:,1]
    else:
        prediction = df.mean(axis=1)

    # test_df already holds the segment ids; there is no need to read the dataset again.
    submission = pd.DataFrame()
    submission['segment_id'] = test_df['segment_id']
    submission['prediction'] = prediction
    return submission

In [26]:
# Explicit name -> function table instead of eval() on the option string.
ensemble_fns = {
    'basic_mean': basic_mean,
    'weighted_mean': weighted_mean,
    'lr_model': lr_model,
}
all_options = list(ensemble_fns)

# Each function returns a tuple whose first element is its score.
# NOTE(review): lr_model scores only its 20% hold-out split while the other two score
# the full set, so the comparison is not like-for-like.
best_score = 0
best_option = None
for option in all_options:
    score = ensemble_fns[option](local_model_paths, test_path)
    if score[0] > best_score:
        best_score = score[0]
        best_option = option

print(best_option)

# Use the winning method; the loop variable `option` is always the last entry of all_options.
submission = run_ensemble(local_model_paths, test_path, final_test_path, best_option)

100%|██████████| 284/284 [00:00<00:00, 2105.00it/s]
100%|██████████| 284/284 [00:00<00:00, 3206.27it/s]
100%|██████████| 284/284 [00:00<00:00, 12577.15it/s]
100%|██████████| 284/284 [00:00<00:00, 2120.50it/s]
100%|██████████| 284/284 [00:00<00:00, 3260.24it/s]
100%|██████████| 284/284 [00:00<00:00, 12722.23it/s]
100%|██████████| 284/284 [00:00<00:00, 2127.91it/s]
100%|██████████| 284/284 [00:00<00:00, 3152.62it/s]
100%|██████████| 284/284 [00:00<00:00, 12851.39it/s]
100%|██████████| 284/284 [00:00<00:00, 2113.72it/s]
100%|██████████| 284/284 [00:00<00:00, 3215.10it/s]
100%|██████████| 284/284 [00:00<00:00, 12880.29it/s]
100%|██████████| 284/284 [00:00<00:00, 2129.44it/s]
100%|██████████| 284/284 [00:00<00:00, 3255.37it/s]
100%|██████████| 284/284 [00:00<00:00, 12588.05it/s]
100%|██████████| 284/284 [00:00<00:00, 2113.07it/s]
100%|██████████| 284/284 [00:00<00:00, 3225.56it/s]
100%|██████████| 284/284 [00:00<00:00, 12451.34it/s]
100%|██████████| 284/284 [00:00<00:00, 2129.84it/s]
100%|█

NameError: name 'math' is not defined

In [67]:
# Build the submission with the logistic-regression ensemble, fitted on the public
# test set (test_path) and applied to final_test_path.
submission = run_ensemble(local_model_paths, test_path, final_test_path, 'lr_model')

100%|██████████| 248/248 [00:00<00:00, 2102.26it/s]
100%|██████████| 248/248 [00:00<00:00, 3210.83it/s]
100%|██████████| 248/248 [00:00<00:00, 12324.64it/s]
100%|██████████| 248/248 [00:00<00:00, 2103.23it/s]
100%|██████████| 248/248 [00:00<00:00, 3198.82it/s]
100%|██████████| 248/248 [00:00<00:00, 12672.39it/s]
100%|██████████| 248/248 [00:00<00:00, 2117.41it/s]
100%|██████████| 248/248 [00:00<00:00, 3067.19it/s]
100%|██████████| 248/248 [00:00<00:00, 13095.81it/s]
100%|██████████| 284/284 [00:00<00:00, 2117.99it/s]
100%|██████████| 284/284 [00:00<00:00, 3162.26it/s]
100%|██████████| 284/284 [00:00<00:00, 12950.31it/s]
100%|██████████| 284/284 [00:00<00:00, 2119.85it/s]
100%|██████████| 284/284 [00:00<00:00, 3083.58it/s]
100%|██████████| 284/284 [00:00<00:00, 13013.55it/s]
100%|██████████| 284/284 [00:00<00:00, 2122.66it/s]
100%|██████████| 284/284 [00:00<00:00, 3207.53it/s]
100%|██████████| 284/284 [00:00<00:00, 12922.49it/s]
[0 1]


In [68]:
# Inspect the ensemble predictions before writing them out.
submission['prediction']

0      0.829345
1      0.292620
2      0.242568
3      0.821355
4      0.482799
         ...   
243    0.717627
244    0.781365
245    0.252067
246    0.254729
247    0.242621
Name: prediction, Length: 248, dtype: float64

In [69]:
# Write the submission to the current working directory; index=False omits the row index.
submission.to_csv('SOTA-MAFAT-Final2_lrmodel.csv', index=False)