In [2]:
from fastai.tabular import *  # Quick accesss to tabular functionality

In [3]:
import sqlite3
import pandas as pd
import numpy as np
import sys

In [4]:
RT_LIMIT_LOWER = 4340
RT_LIMIT_UPPER = 4580

In [5]:
BASE_NAME = "/home/ubuntu/HeLa_20KInt-rt-{}-{}".format(RT_LIMIT_LOWER,RT_LIMIT_UPPER)
CONVERTED_DATABASE_NAME = '{}/HeLa_20KInt.sqlite'.format(BASE_NAME)

In [6]:
MQ_FEATURE_ID = 487

In [7]:
import pickle
pkl_file_name = "{}/mq-feature-{}-summed-frames-df.pkl".format(BASE_NAME,MQ_FEATURE_ID)
df = pd.read_pickle(pkl_file_name)

In [8]:
df.reset_index(inplace=True)

In [9]:
df.head()

Unnamed: 0,index,frame_id,point_id,mz,scan,intensity,retention_time_secs,peak_id,hover,frame_point,part_of_feature
0,220,2,39899,816.393256,405,358,4344.591351,0,"816.3933 m/z, 405 scan, RT 4344.6 secs",2|39899,True
1,696,3,40268,816.392134,409,96,4346.940378,0,"816.3921 m/z, 409 scan, RT 4346.9 secs",3|40268,True
2,745,3,42059,816.395738,415,120,4346.605542,0,"816.3957 m/z, 415 scan, RT 4346.6 secs",3|42059,True
3,1153,4,38028,816.389161,406,183,4349.734388,3481,"816.3892 m/z, 406 scan, RT 4349.7 secs",4|38028,True
4,2831,8,35920,816.39069,378,156,4358.695199,0,"816.3907 m/z, 378 scan, RT 4358.7 secs",8|35920,True


In [10]:
dep_var = 'part_of_feature'
cat_names = None
cont_names = ['mz', 'scan', 'intensity']
procs = [FillMissing, Categorify, Normalize]

In [11]:
len(df)

8101

In [12]:
feature_points_df = df[df.part_of_feature == True]

In [13]:
non_feature_points_df = df[df.part_of_feature == False]

In [14]:
def train_validate_test_split(df, train_percent=.9, validate_percent=.05, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.loc[perm[:train_end]]
    validate = df.loc[perm[train_end:validate_end]]
    test = df.loc[perm[validate_end:]]
    return train, validate, test

In [15]:
feature_train_df,feature_validate_df,feature_test_df = train_validate_test_split(feature_points_df)
non_feature_train_df,non_feature_validate_df,non_feature_test_df = train_validate_test_split(non_feature_points_df)

In [16]:
train_df = pd.concat([feature_train_df,non_feature_train_df])
valid_df = pd.concat([feature_validate_df,non_feature_validate_df])
test_df = pd.concat([feature_test_df,non_feature_test_df])

In [17]:
len(train_df)+len(valid_df)+len(test_df)

8101

In [19]:
# separate out the test data
test = TabularList.from_df(test_df, path=BASE_NAME, cat_names=cat_names, cont_names=cont_names)

In [21]:
data = (TabularList.from_df(pd.concat([train_df,valid_df]), path=BASE_NAME, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .random_split_by_pct(0.1)
                           .label_from_df(cols=dep_var)
                           .add_test(test)
                           .databunch())

In [22]:
learn = tabular_learner(data, layers=[200,200,200,100], metrics=accuracy)
learn.fit(3, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
1,0.27867,0.291544,0.884265,00:01
2,0.26653,0.238551,0.903771,00:01
3,0.260185,0.257725,0.888166,00:01


## Inference

In [23]:
row = df[df.frame_point=='3|42059'].iloc[0]

In [24]:
row

index                                                     745
frame_id                                                    3
point_id                                                42059
mz                                                    816.396
scan                                                      415
intensity                                                 120
retention_time_secs                                   4346.61
peak_id                                                     0
hover                  816.3957 m/z, 415 scan, RT 4346.6 secs
frame_point                                           3|42059
part_of_feature                                          True
Name: 2, dtype: object

In [29]:
pred = learn.predict(row)
pred

(<fastai.core.Category at 0x7f4712238dd8>, tensor(0), tensor([0.9924, 0.0076]))

In [26]:
learn.data.classes

[False, True]

In [27]:
probs = pred[2]

In [28]:
probs[0]

tensor(0.9924)