Issues with MADE dataset:
FTIR
* Dodgy sample numbers
* 2 Hit confidence columns
* 2 substance detected columns

In [243]:
# Module imports
import copy
import datetime
import os
import numpy as np
import pandas as pd

def fix_sample_number(x):
    """Make sure all samples numbers are of form: AXXX (where A is one of A, F, W and X is a digit)"""
    if isinstance(x, float) and np.isnan(x):
        return x # leave NaN's alone
    if (isinstance(x, str) or isinstance(x, unicode)) and len(x) == 0:
        return np.nan
    try:
        sn = 'F{:04d}'.format(int(x))
    except ValueError:
        # Assume string so make sure it's of the right format
        sn = str(x).strip().capitalize()
    if sn[0] not in ['A', 'F', 'W'] or len(sn) != 5:
        print("!!! Bad ID \'%s\'" % sn)
    return sn

def now():
    return datetime.datetime.now().strftime("%d/%m/%y %H:%M:%S")

def enumerate_duplicates(row):
    """Append a counter to duplicate labels"""
    SEPARATOR = '.'
    duplicates = {}
    updated_row = []
    for r in row:
        count = duplicates.get(r, 0)
        if count > 0:
            label = "{}{}{}".format(r, SEPARATOR, count)
        else:
            label = r
        updated_row.append(label)
        duplicates[r] = count + 1
    return updated_row


ftir_csv = 'MADE/FTIR Analysis Data Recording Form.csv'
catalog_csv = 'MADE/Sample Cataloguing Form.csv'
reagent_csv = 'MADE/Reagent Outcomes.csv'
hr_csv = 'MADE/MADE MAST Intervention Questionnaire.csv'

date_cols = ['Timestamp']
df_ftir = pd.read_csv(ftir_csv, engine="python", parse_dates=date_cols)
df_catalog = pd.read_csv(catalog_csv, engine="python", parse_dates=date_cols)
df_reagent = pd.read_csv(reagent_csv, engine="python", parse_dates=date_cols)
df_hr = pd.read_csv(hr_csv, engine="python", parse_dates=date_cols)

mla_excel = 'MADE/MADE - Loop 2018 event results sheet_.xlsx'
df_mla = pd.read_excel(mla_excel, sheetname='MLA', header=1)

# Sort out column names
df_reagent.rename(columns={'Sample Code':'Sample Number', 'Substance(s) detected' : 'Reagent Result'}, inplace=True)
df_hr.rename(columns={'Sample Number:':'Sample Number'}, inplace=True)
df_mla.rename(columns={'Sample Num':'Sample Number'}, inplace=True)

# Make all sample numbers a 4-digit code starting with F
df_ftir['Sample Number'] = df_ftir['Sample Number'].apply(fix_sample_number)
df_catalog['Sample Number'] = df_catalog['Sample Number'].apply(fix_sample_number)
df_reagent['Sample Number'] = df_reagent['Sample Number'].apply(fix_sample_number)
df_hr['Sample Number'] = df_hr['Sample Number'].apply(fix_sample_number)
df_mla['Sample Number'] = df_mla['Sample Number'].apply(fix_sample_number)

# Prune down MLA to valid sample numbers
df_mla = df_mla[df_mla['Sample Number'].notnull()]

DataFrames = namedtuple('DataFrames', ['catalog', 'ftir', 'reagent','mla', 'hr'])
dfs = DataFrames(
    catalog=df_catalog,
    ftir=df_ftir,
    reagent=df_reagent,
    mla=df_mla,
    hr=df_hr)

In [244]:
pd.options.mode.chained_assignment = 'raise'

# Need to define in main or we can't pickle the data objects
class DataFrames(object):
    def __init__(self):
        catalog = None
        ftir = None
        reagent = None
        mla = None
        hr = None

def gsheets_service():
    from googleapiclient.discovery import build
    from httplib2 import Http
    from oauth2client import file, client, tools
    # If modifying these scopes, delete the file token.json.
    CREDS_FILE = '/opt/random/MADE/JensDataExportJupyter_client_secret.json'
    SCOPES = 'https://www.googleapis.com/auth/spreadsheets.readonly'
    store = file.Storage('token.json')
    creds = store.get()
    if not creds or creds.invalid:
        import argparse
        flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args([])
        flow = client.flow_from_clientsecrets(CREDS_FILE, SCOPES)
        creds = tools.run_flow(flow, store, flags)
    service = build('sheets', 'v4', http=creds.authorize(Http()))
    return service

def get_df(service, SPREADSHEET_ID, SS_RANGE, mla=False):
    # Call the Sheets API
    result = service.spreadsheets().values().get(spreadsheetId=SPREADSHEET_ID,
                                                range=SS_RANGE).execute()
    values = result.get('values', [])
    if not values:
        print('*** No data found ***')
        return None

    # mla has irrelevant stuff in columns 1 and 3 and sample numbers in first column
    if mla:
        values.pop(0)
        values.pop(1)
        def not_blank(row):
            return len(row[0]) > 0       
    else:
        def not_blank(row):
            return sum(map(len, row[:6])) > 0

    rows = filter(not_blank, values)
    if not rows:
        print('*** No data found after pruning rows! ***')
        return None
    
    columns = enumerate_duplicates(rows[0])
    ncols = len(rows[0])
    row_max = max(map(len, rows[1:]))
    width = min(ncols, row_max)
    return pd.DataFrame(rows[1:], columns=columns[:width])

def canonicalise_df(df, source=None):
    """Initial cleaning of all dataframes"""
    #from pandas._libs.tslib import OutOfBoundsDatetime
    if source:
        print("Canonicalising %s" % source)
    # Standardise names
    d = {
        'Sample Code':'SampleNumber',
        'Sample Number:':'SampleNumber',
        'Sample Number':'SampleNumber',
        'Sample number':'SampleNumber',
        'Sample Num':'SampleNumber',
        'Sample Number i.e F0XXX' : 'SampleNumber',
        
        'Sample Advertised/Acquired/Sold As' : 'SoldAs',
        'Sample Sold As' : 'SoldAs',
        
        'Sample Source' :'SampleSource',

        'User Suspicion' :'UserSuspicion',

        'Sample Form' :'SampleForm',

        'Has the Service User or a close friend tried this batch?' : 'AlreadyTried',

        'Your initials' : 'Tester',
        'Your name and first initial' : 'Tester',
        'Your name and surname initial' : 'Tester'
    }
    df.rename(columns=d, inplace=True)
    
    def fix_timestamp(x):
        """Horror to fix mixed date formats"""
        return pd.to_datetime(str(x), format='%d/%m/%Y %H:%M:%S')
#         try:
#             x = pd.to_datetime(x)
#         except:
#             try:
#                 # '12/08/2018 12:26:15'
#                 x = pd.to_datetime(x, format='%d/%m/%Y %H:%M:%S')
#             except:
#                 #'Thu 9/08 - 12:19'
#                 x = pd.to_datetime(x, format='%a %m/%y - %H:%M')
#         return x
    if 'Timestamp' in df.columns:
        df.loc[:, 'Timestamp'] = df['Timestamp'].map(fix_timestamp)
    df.loc[:, 'SampleNumber'] = df['SampleNumber'].apply(fix_sample_number)
    df.dropna(subset=['SampleNumber'])
    #df.sort_values(['Sample Number'], ascending=True, inplace=True)
    return df

def get_data(service, SPREADSHEET_ID):

    CATALOG_RANGE = 'Catalog!A:R'
    FTIR_RANGE = 'FTIR!A:X'
    REAGENT_RANGE = 'Reagent!A:W'
    MLA_RANGE = 'MLA!A:R'
    HR_RANGE = 'Interventions!A:BJ'

    #service = gsheets_service()

    df_catalog = get_df(service, SPREADSHEET_ID, CATALOG_RANGE)
    df_catalog = canonicalise_df(df_catalog, source='catalog')
    df_ftir = get_df(service, SPREADSHEET_ID, FTIR_RANGE)
    df_ftir = canonicalise_df(df_ftir, source='ftir')
    df_reagent = get_df(service, SPREADSHEET_ID, REAGENT_RANGE)
    df_reagent = canonicalise_df(df_reagent, source='reagent')
    df_mla = get_df(service, SPREADSHEET_ID, MLA_RANGE, mla=True)
    df_mla = canonicalise_df(df_mla, source='mla')
    try:
        df_hr = get_df(service, SPREADSHEET_ID, HR_RANGE)
    except ValueError:
        df_hr = None
    if df_hr is not None:
        df_hr = canonicalise_df(df_hr, source='hr')

    df = DataFrames()
    df.catalog=df_catalog
    df.ftir=df_ftir
    df.reagent=df_reagent
    df.mla=df_mla
    df.hr=df_hr
    
    return df


# The ID and range of a sample spreadsheet.
BOOMTOWN2018_SPREADSHEET_ID = '1RiA-FwG_954Ger2VPsOSA3JLh-7sEoTYr40eVS0mp24'
MADE2018_SPREADSHEET_ID = '1daXdyL6uL8qnMsEsP0RLZE9nDzt6J7Zr1ygQdguvi-E'
BOARDMASTERS2018_SPREADSHEET_ID = '1U1lhUWLazDBN-wb2eZM8YV674f46npVfQK3XUVZjPow'
SW42018_SPREADSHEET_ID = '1agpMmJ9XukeWXS5_mwrDSKeshUaFtYwOzsPiR1DKsPU'
LOSTVILLAGE2018_SPREADSHEET_ID = '1OL0gyXrpZnJ8e7yR7eF6S2OaBYBiPDoVp5xGpdK4wlA'
BESTIVAL2018_SPREADSHEET_ID = '184qudGcw4PB0SMtOo0ZBDtckeGaH0RCLUXbA-u3BiHE'
YNOT2018_SPREADSHEET_ID = '1D01cj-Mra06TuoG_MsKuLq9OdtvKzrvRdiE255po_ag'
TRUCKFEST2018_SPREADSHEET_ID = '1sGG9WJxKyD2CGUjzJAXul3g9hVnRz6HbTiqKV5cUAyA'
LSTD2018_SPREADSHEET_ID = '1R8YqDnrhvuVMwPFShwaaAUIyCXQMeozA230OXsFsDQM'
KENDALCALLING2018_SPREADSHEET_ID = '16-PfwBOaUxwod3X75LGk1VAjBblkNsTJpCsX825aghI'
PARKLIFE2018_SPREADSHEET_ID = '1oO5sHcUhUn_7M1Hap73sOZHNEfWFMcDkQuWDRFf4d-w'


data = {}
service = gsheets_service()
# print "PROCESSING BOOMTOWN"
# data['boomtown'] = get_data(service, BOOMTOWN2018_SPREADSHEET_ID)
# print "PROCESSING BOARDMASTERS"
# data['boardmasters'] = get_data(service, BOARDMASTERS2018_SPREADSHEET_ID)
# print "PROCESSING MADE"
# data['made'] = get_data(service, MADE2018_SPREADSHEET_ID)
# print "PROCESSING SW4"
# data['sw4'] = get_data(service, SW42018_SPREADSHEET_ID)
# print "PROCESSING LOST VILLAGE"
# data['lostvillage'] = get_data(service, LOSTVILLAGE2018_SPREADSHEET_ID)
# print "PROCESSING BESTIVAL"
# data['bestival'] = get_data(service, BESTIVAL2018_SPREADSHEET_ID)
print "PROCESSING YNOT"
data['ynot'] = get_data(service, YNOT2018_SPREADSHEET_ID)
# print "PROCESSING TRUCKFEST"
# data['truckfest'] = get_data(service, TRUCKFEST2018_SPREADSHEET_ID)
# print "PROCESSING LSTD"
# data['lstd'] = get_data(service, LSTD2018_SPREADSHEET_ID)
# print "PROCESSING KENDAL CALLING"
# data['kc'] = get_data(service, KENDALCALLING2018_SPREADSHEET_ID)
# print "PROCESSING PARKLIFE"
# data['parklife'] = get_data(service, PARKLIFE2018_SPREADSHEET_ID)

import pickle
with open('foo_multi.pkl','w') as w:
    pickle.dump(data, w)
    
dfs = data['ynot']

PROCESSING YNOT
Canonicalising catalog
!!! Bad ID 'M0584'
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
Canonicalising hr
!!! Bad ID 'Z9999'


In [251]:
# with open('foo_multi.pkl') as f:
#     data = pickle.load(f)
# dfs = data['boomtown']

# Check for duplicates
catalog_duplicates = dfs.catalog['SampleNumber'].duplicated()
if catalog_duplicates.any():
    catalog_duplicates = list(dfs.catalog.loc[catalog_duplicates, 'SampleNumber'].values)
    print("### %d duplicated catalog SampleNumbers %s ###" % (len(catalog_duplicates), catalog_duplicates))
    dfs.catalog[dfs.catalog['SampleNumber'].duplicated(keep=False)].to_csv('catalog_duplicates.csv')
else:
    catalog_duplicates = None
    
ftir_duplicates = dfs.ftir['SampleNumber'].duplicated()
if ftir_duplicates.any():
    ftir_duplicates = list(dfs.ftir.loc[dfs.ftir['SampleNumber'].duplicated(), 'SampleNumber'].values)
    print("### %d duplicated FTIR SampleNumbers %s ###" % (len(ftir_duplicates), ftir_duplicates))
    dfs.ftir[dfs.ftir['SampleNumber'].duplicated(keep=False)].to_csv('ftir_duplicates.csv')
else:
    ftir_duplicates = None

reagent_duplicates = dfs.reagent['SampleNumber'].duplicated()
if reagent_duplicates.any():
    reagent_duplicates = list(dfs.reagent.loc[dfs.reagent['SampleNumber'].duplicated(), 'SampleNumber'].values)
    print("### %d duplicated reagent SampleNumbers %s ###" % (len(reagent_duplicates), reagent_duplicates))    
    dfs.reagent[dfs.reagent['SampleNumber'].duplicated(keep=False)].to_csv('reagent_duplicates.csv', encoding = 'utf-8')
else:
    reagent_duplicates = None

hr_duplicates = None
if dfs.hr is not None:
    hr_duplicates = dfs.hr['SampleNumber'].duplicated()
    if hr_duplicates.any():
        hr_duplicates = list(dfs.hr.loc[dfs.hr['SampleNumber'].duplicated(), 'SampleNumber'].values)
        print("### %d duplicated HR SampleNumbers %s ###" % (len(hr_duplicates), hr_duplicates))
        dfs.hr[dfs.hr['SampleNumber'].duplicated(keep=False)].to_csv('hr_duplicates.csv', encoding = 'utf-8')
    else:
        hr_duplicates = None

mla_duplicates = dfs.mla['SampleNumber'].duplicated()
if mla_duplicates.any():
    mla_duplicates = list(dfs.mla.loc[dfs.mla['SampleNumber'].duplicated(), 'SampleNumber'].values)
    print("### %d duplicated MLA SampleNumbers %s ###" % (len(mla_duplicates), mla_duplicates))
    dfs.mla[dfs.mla['SampleNumber'].duplicated(keep=False)].to_csv('mla_duplicates.csv')
else:
    mla_duplicates = None
    
if catalog_duplicates or \
    ftir_duplicates or \
    reagent_duplicates or \
    hr_duplicates or \
    mla_duplicates:
    outs = 'Please fix duplicated values'
    print(outs)
#     raise RuntimeError(outs)

### 6 duplicated catalog SampleNumbers ['A0431', 'A0442', 'A0443', 'F0505', 'A0477', 'A0573'] ###
### 11 duplicated FTIR SampleNumbers ['A0433', 'A0444', 'A0488', 'A0468', 'F0496', 'A0553', 'A0553', 'A0533', 'F0548', 'A0589', 'A0585'] ###
### 2 duplicated HR SampleNumbers ['F0395', 'F0400'] ###
### 4 duplicated MLA SampleNumbers ['A0371', 'A0521', 'A0521', 'A0433'] ###
Please fix duplicated values


In [252]:
# Check there are no SampleNumbers in any of the other spreadsheets that aren't in the cataolog sheet
catalog_unique = set(dfs.catalog['SampleNumber'].unique())

ftir_unique = set(dfs.ftir['SampleNumber'].unique())
ftir_orphan = ftir_unique.difference(catalog_unique)
if ftir_orphan:
    print("Orphaned FTIR SampleNumbers: %s" % sorted(ftir_orphan))

reagent_unique = set(dfs.reagent['SampleNumber'].unique())
reagent_orphan = reagent_unique.difference(catalog_unique)
if reagent_orphan:
    print("Orphaned Reagent Test SampleNumbers: %s" % sorted(reagent_orphan))

hr_unique = set(dfs.hr['SampleNumber'].unique())
hr_orphan = hr_unique.difference(catalog_unique)
if hr_orphan:
    print("Orphaned HR SampleNumbers: %s" % sorted(hr_orphan))
    
mla_unique = set(dfs.mla['SampleNumber'].unique()).difference(catalog_unique)
mla_orphan = mla_unique.difference(catalog_unique)
if mla_orphan:
    print("Orphaned MLA SampleNumbers: %s" % sorted(mla_orphan))
    
# Check for any that are only in the catalog
outside_catalog = set.union(ftir_unique, reagent_unique, hr_unique, mla_unique)
catalog_orphan = catalog_unique.difference(outside_catalog)
if catalog_orphan:
    print("Orphaned catalog SampleNumbers: %s" % sorted(catalog_orphan))
    
# Check for any that aren't in FTIR and don't have anything in reagent test
ftir_missing = catalog_unique.difference(ftir_unique).difference(reagent_unique).difference(catalog_orphan)
if len(ftir_missing):
    print("Samples not in FTIR or Reagent: %s" % sorted(ftir_missing))

all_unique = copy.copy(ftir_unique)
all_unique.update(reagent_unique, hr_unique, mla_unique)
if (all_unique or catalog_only):
    outs = "### Please fix orphaned/catalog only samples ###"
    print(outs)
    #raise RuntimeError(outs)

Orphaned FTIR SampleNumbers: ['A0411', 'A0444', 'A0445', 'A0446', 'A0447', 'A0448', 'A0449', 'A0450', 'A0451', 'A0453', 'A0454', 'A0455', 'A0457', 'A0471', 'A0539', 'A0556', 'A0558', 'A0578', 'A0583']
Orphaned Reagent Test SampleNumbers: ['A0558']
Orphaned HR SampleNumbers: ['A0382', 'F0166', 'Z9999']
Orphaned catalog SampleNumbers: ['F0583', 'M0584']
Samples not in FTIR or Reagent: ['F0556']
### Please fix orphaned/catalog only samples ###


In [258]:
def clean_df(df):
    sample_form_d = { 'pill' : ['Ecstasy Tablet',
                                'ecstasy pill',
                                'ecstacy pill',
                                'Non-pharmaceutical tablet (ecstasy etc)',
                                'other recreational pill',
                                 'Whole pill',
                                'Other pill',
                                'Pharmaceutical'],
                  'partial pill' : ['Partial ecstasy pill',
                                    'Partial 2C-B pill',
                                    'Crushed tablet'],
                  'powder' : ['Powder/capsule/bomb/crystal',
                              'Powder or crushed pill',
                              'Crystal, Capsule or Powder'],
                  'liquid' : ['*Cannabinoid liquid',
                               '*Viscous liquid',
                              'Dissolved in Propylene Glycol',
                              'Oil'],
                   'tab' : ['blotter', 'LSD Tab']
                      }


    # Firstly convert all columns to lower case and remove any spaces
    def lower(value):
        if type(value) in [str, unicode]:
            value = value.strip().lower()
        return value

    for column in ['SampleForm']:
        df[column] = df[column].map(lower, na_action='ignore')
    
    replace_d = {}
    for column in ['SampleForm']:
        replace_d[column] = {}
        for drug, names in sample_form_d.items():
            for name in names:
                replace_d[column][name.lower()] = drug
    
    # Replace values
    df.replace(replace_d, inplace=True)
    return df
    
dfs.catalog = clean_df(dfs.catalog)
dfs.ftir = clean_df(dfs.ftir)
dfs.reagent = clean_df(dfs.reagent)

In [265]:
catalog_duplicate_matches = {}
min_stage_delay = 60 * 1
max_stage_delay = 60 * 30
for sample_number in catalog_duplicates:
    catalog_duplicate_matches[sample_number] = {}
    for i, cat_row in enumerate(dfs.catalog.loc[dfs.catalog['SampleNumber'] == sample_number].itertuples()):
        for j, ftir_row in enumerate(dfs.ftir.loc[dfs.ftir['SampleNumber'] == sample_number].itertuples()):
            i += 1
            j += 1
            catalog_data = [cat_row.SampleForm, cat_row.SoldAs, cat_row.AlreadyTried]
            catalog_time = cat_row.Timestamp
            catalog_idx = cat_row.Index + 1
            ftir_data = [ftir_row.SampleForm, ftir_row.SoldAs, ftir_row.AlreadyTried]
            ftir_time = ftir_row.Timestamp
            ftir_idx = ftir_row.Index + 1            
            delta_t = (ftir_time - catalog_time).seconds
            if catalog_data == ftir_data and min_stage_delay < delta_t <= max_stage_delay:
                print("%s Catalog duplicate sample %d (%d) matches ftir sample %d (%d)" % (sample_number, i,
                                                                                           catalog_idx, j, ftir_idx))
                catalog_duplicate_matches[sample_number][cat_row.Index] = True
            else:
                print("%s Catalog duplicate sample %d (%d) DIFFERENT ftir sample %d (%d)\n%s %s\n%s %s" % (sample_number, i,
                                                                                                           catalog_idx, j, ftir_idx,
                                                                                                  catalog_data, catalog_time,
                                                                                                  ftir_data, ftir_time))
                catalog_duplicate_matches[sample_number][cat_row.Index] = False


A0431 Catalog duplicate sample 1 (27) DIFFERENT ftir sample 1 (43)
['pill', u'Found or otherwise not known', u'No'] 2018-07-26 17:00:02
['pill', u'Found or otherwise not known', u'No'] 2018-07-26 19:24:47
A0431 Catalog duplicate sample 2 (28) DIFFERENT ftir sample 1 (43)
['pill', u'Found or otherwise not known', u'No'] 2018-07-26 17:02:24
['pill', u'Found or otherwise not known', u'No'] 2018-07-26 19:24:47
A0442 Catalog duplicate sample 1 (51) DIFFERENT ftir sample 1 (104)
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 12:11:59
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 16:57:54
A0442 Catalog duplicate sample 2 (54) DIFFERENT ftir sample 1 (104)
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 12:21:25
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 16:57:54
A0443 Catalog duplicate sample 1 (52) DIFFERENT ftir sample 1 (138)
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 12:13:57
['powder', u'Found or otherwise

In [312]:
for fo in ftir_orphan:
    ftir_data = dfs.ftir.loc[dfs.ftir['SampleNumber'] == fo, ['SampleForm', 'SoldAs', 'AlreadyTried', 'Timestamp']]
    ftir_data = ftir_data.values.tolist()[0]
    ftir_time = ftir_data.pop()
    for sample_number, indexd in catalog_duplicate_matches.items():
        for k, v in indexd.items():
            if not v:
                catalog_data = dfs.catalog.iloc[k][['SampleForm', 'SoldAs', 'AlreadyTried', 'Timestamp']].values.tolist()
                catalog_time = catalog_data.pop()
                delta_t = (ftir_time - catalog_time).seconds
                if catalog_data == ftir_data and min_stage_delay < delta_t <= max_stage_delay:
                    print "Orphan {} could be match for duplicate {} {}\n{} {}\n{} {}".format(fo,
                                                                                               sample_number, k+1,
                                                                                               catalog_data, catalog_time,
                                                                                               ftir_data, ftir_time)


Orphan A0451 could be match for duplicate A0573 176
['powder', u'Found or otherwise not known', u'No'] 2018-07-28 15:05:48
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 15:34:23
Orphan A0449 could be match for duplicate A0442 54
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 12:21:25
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 12:42:44
Orphan A0449 could be match for duplicate A0443 52
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 12:13:57
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 12:42:44
Orphan A0449 could be match for duplicate A0443 55
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 12:21:28
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 12:42:44
Orphan A0455 could be match for duplicate A0442 51
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 12:11:59
['powder', u'Found or otherwise not known', u'No'] 2018-07-27 12:39:30
Orphan A0455 could be match for du

In [6]:
# Clean up catalog
# Drop all unwanted columns

#  or 'Your initials'
l = set(['Your initials',
         'Your name and first initial',
         'Which device was a photo taken with? Who does it belong to?',
         'Is a breakline present?',
         'Unusual appearance'
        ])

to_drop = set(dfs.catalog.columns).intersection(l)
dfs.catalog.drop(to_drop, axis=1, inplace=True)

d = {
    'Timestamp' : 'Catalog timestamp',
    'Sample Advertised/Acquired/Sold As': 'Catalog_SoldAs',
    'Sample Form' : 'Catalog_Form',
    'Has the Service User or a close friend tried this batch?': 'Catalog_Tried',
    'What is the mass? (mg)': 'FullPillMass',
    'What is the shape of the pill?': 'PillShape',
    'What is the logo?': 'PillLogo',
    'What colour is the pill?': 'PillColour'
}
dfs.catalog.rename(columns=d, inplace=True)

In [7]:
# For FTIR columns need to merge the data from the 'Compound detected', 'Hit Confidence.1' columns into the
# 'Substance detected', 'Hit Confidence' column where the substance detected was 'other'
print("COLS ",dfs.ftir.columns)
print("SS ",dfs.ftir['Substance detected'][:5])
mask = dfs.ftir['Substance detected'] != 'Other'
dfs.ftir['Substance detected'].where(mask, dfs.ftir['Compound detected'], inplace=True) # Copy values from 'Compound detected'
dfs.ftir['Hit Confidence'].where(mask, dfs.ftir['Hit Confidence.1'], inplace=True)
dfs.ftir.drop(['Compound detected', 'Hit Confidence.1', 'Brief Note'], axis=1, inplace=True)

mask = dfs.ftir['Compound detected (Subtraction)'] != 'Other'
dfs.ftir['Compound detected (Subtraction)'].where(mask, df_ftir['Substance detected.1'], inplace=True) # Copy values from 'Compound detected'
dfs.ftir['Hit Confidence.2'].where(mask, dfs.ftir['Hit Confidence.3'], inplace=True)
dfs.ftir.drop(['Substance detected.1', 'Hit Confidence.3', 'Brief Note.1'], axis=1, inplace=True)

# Drop all unwanted columns
l = ['Your name and surname initial',
     'User Suspicion',
     'Is anything detected after subtraction analysis?',
     'Analysis required', 
     'Next action(s)',
     'Send to HR team'
    ]
#'Note for harm reduction worker'
to_drop = set(dfs.ftir.columns).intersection(l)
dfs.ftir.drop(to_drop, axis=1, inplace=True)

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
d = {
    'Timestamp' : 'FTIR timestamp',
    'Sample Sold As': 'FTIR Sold As',
    'Sample Form' : 'FTIR form',
    'Has the Service User or a close friend tried this batch?': 'FTIR tried',
    'Substance(s) detected' : 'FTIR final result',
    'Substance detected' : 'FTIR result1',
    'Hit Confidence' :  'FTIR hit1',
    'Is anything detected after subtraction analysis?' : 'FTIR subtraction positive',
    'Compound detected (Subtraction)' :  'FTIR result2',
    'Hit Confidence.2' :  'FTIR hit2',
    '"Strength" of powdered substance' : 'FTIR Powder Strength',
    'Does the substance detected match the substance that was advertised?' : 'FTIR Matches Sold As',
}
dfs.ftir.rename(columns=d, inplace=True)

('COLS ', Index([                                                           u'Timestamp',
                                                              u'Sample Number',
                                                                     u'Tester',
                                                                    u'Sold As',
                                                                u'Sample Form',
                                                              u'Already Tried',
                                                             u'User Suspicion',
                                                         u'Substance detected',
                                                             u'Hit Confidence',
                                                          u'Compound detected',
                                                           u'Hit Confidence.1',
                                                                 u'Brief Note',
                           u'I

NameError: name 'df_ftir' is not defined

In [None]:
# Clean up HR form

# Drop all unwanted columns
l = ['HR worker name:']
dfs.hr.drop(l, axis=1, inplace=True)

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
d = {
    'Timestamp' : 'HR timestamp',
    'You submitted a substance for analysis. What were you told it was when you got it?': 'HR Sold as',
    'Had you already tried this substance before getting it tested?': 'HR tried',
    'What was your first sample number at this event? Did you take a photo or keep the ticket?': 'Previous Sample Number'
}
dfs.hr.rename(columns=d, inplace=True)

In [None]:
# Catalog and FTIR data frames
df_all = pd.merge(dfs.catalog, dfs.ftir, how='left', on=['Sample Number'])

In [None]:
# Merge in any reagent test data
df_all = pd.merge(df_all, dfs.reagent[['Sample Number', 'Reagent Result']], how='left', on=['Sample Number'])

In [None]:
# Merge in any pill strength data
df_all = pd.merge(df_all, dfs.mla[['Sample Number', 'MDMA / tablet (mg)', '% MDMA content']], how='left', on=['Sample Number'])

In [None]:
# Merge in HR data
df_all = pd.merge(df_all, dfs.hr, how='left', on=['Sample Number'])

In [None]:
# Fix column orders
prefix = ['Sample Number',
          'Catalog timestamp', 'FTIR timestamp', 'HR timestamp',
          'Catalog Sold As', 'FTIR Sold As','HR Sold as', 
          'Catalog form', 'FTIR form',
          'Catalog tried', 'FTIR tried', 'HR tried']
columns = [c for c in df_all.columns if c not in prefix]
columns = prefix + columns
df_all = df_all[columns]
df_all.to_csv('foo.csv')