Issues with MADE dataset:
FTIR
* Dodgy sample numbers
* 2 Hit confidence columns
* 2 substance detected columns

In [1]:
# Module imports
import copy
import datetime
import os
import numpy as np
import pandas as pd
from collections import namedtuple

def fix_sample_number(x):
    """Make sure all samples numbers are of form: AXXX (where A is one of A, F, W and X is a digit)"""
    if isinstance(x, float) and np.isnan(x):
        return x # leave NaN's alone
    if (isinstance(x, str) or isinstance(x, unicode)) and len(x) == 0:
        return np.nan
    try:
        sn = int(x)
        sn = 'F{:04d}'.format(int(x))
    except ValueError:
        # Assume string so make sure it's of the right format
        sn = str(x).capitalize()
    if sn[0] not in ['A', 'F', 'W'] or len(x) != 5:
        print("!!! Bad ID %s" % sn)
    return sn

def now():
    return datetime.datetime.now().strftime("%d/%m/%y %H:%M:%S")

def enumerate_duplicates(row):
    """Append a counter to duplicate labels"""
    SEPARATOR = '.'
    duplicates = {}
    updated_row = []
    for r in row:
        count = duplicates.get(r, 0)
        if count > 0:
            label = "{}{}{}".format(r, SEPARATOR, count)
        else:
            label = r
        updated_row.append(label)
        duplicates[r] = count + 1
    return updated_row


ftir_csv = 'MADE/FTIR Analysis Data Recording Form.csv'
catalog_csv = 'MADE/Sample Cataloguing Form.csv'
reagent_csv = 'MADE/Reagent Outcomes.csv'
hr_csv = 'MADE/MADE MAST Intervention Questionnaire.csv'

date_cols = ['Timestamp']
df_ftir = pd.read_csv(ftir_csv, engine="python", parse_dates=date_cols)
df_catalog = pd.read_csv(catalog_csv, engine="python", parse_dates=date_cols)
df_reagent = pd.read_csv(reagent_csv, engine="python", parse_dates=date_cols)
df_hr = pd.read_csv(hr_csv, engine="python", parse_dates=date_cols)

mla_excel = 'MADE/MADE - Loop 2018 event results sheet_.xlsx'
df_mla = pd.read_excel(mla_excel, sheetname='MLA', header=1)

# Sort out column names
df_reagent.rename(columns={'Sample Code':'Sample Number', 'Substance(s) detected' : 'Reagent Result'}, inplace=True)
df_hr.rename(columns={'Sample Number:':'Sample Number'}, inplace=True)
df_mla.rename(columns={'Sample Num':'Sample Number'}, inplace=True)

# Make all sample numbers a 4-digit code starting with F
df_ftir['Sample Number'] = df_ftir['Sample Number'].apply(fix_sample_number)
df_catalog['Sample Number'] = df_catalog['Sample Number'].apply(fix_sample_number)
df_reagent['Sample Number'] = df_reagent['Sample Number'].apply(fix_sample_number)
df_hr['Sample Number'] = df_hr['Sample Number'].apply(fix_sample_number)
df_mla['Sample Number'] = df_mla['Sample Number'].apply(fix_sample_number)

# Prune down MLA to valid sample numbers
df_mla = df_mla[df_mla['Sample Number'].notnull()]

DataFrames = namedtuple('DataFrames', ['catalog', 'ftir', 'reagent','mla', 'hr'])
dfs = DataFrames(
    catalog=df_catalog,
    ftir=df_ftir,
    reagent=df_reagent,
    mla=df_mla,
    hr=df_hr)

In [23]:
pd.options.mode.chained_assignment = 'raise'

# Need to define in main or we can't pickle the data objects
DataFrames = namedtuple('DataFrames', ['catalog', 'ftir', 'reagent','mla', 'hr'])

def gsheets_service():
    from googleapiclient.discovery import build
    from httplib2 import Http
    from oauth2client import file, client, tools
    # If modifying these scopes, delete the file token.json.
    CREDS_FILE = '/opt/random/MADE/JensDataExportJupyter_client_secret.json'
    SCOPES = 'https://www.googleapis.com/auth/spreadsheets.readonly'
    store = file.Storage('token.json')
    creds = store.get()
    if not creds or creds.invalid:
        import argparse
        flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args([])
        flow = client.flow_from_clientsecrets(CREDS_FILE, SCOPES)
        creds = tools.run_flow(flow, store, flags)
    service = build('sheets', 'v4', http=creds.authorize(Http()))
    return service

def get_df(service, SPREADSHEET_ID, SS_RANGE, mla=False):
    # Call the Sheets API
    result = service.spreadsheets().values().get(spreadsheetId=SPREADSHEET_ID,
                                                range=SS_RANGE).execute()
    values = result.get('values', [])
    if not values:
        print('*** No data found ***')
        return None

    # mla has irrelevant stuff in columns 1 and 3 and sample numbers in first column
    if mla:
        values.pop(0)
        values.pop(1)
        def not_blank(row):
            return len(row[0]) > 0       
    else:
        def not_blank(row):
            return sum(map(len, row[:6])) > 0

    rows = filter(not_blank, values)
    if not rows:
        print('*** No data found after pruning rows! ***')
        return None
    
    columns = enumerate_duplicates(rows[0])
    ncols = len(rows[0])
    row_max = max(map(len, rows[1:]))
    width = min(ncols, row_max)
    return pd.DataFrame(rows[1:], columns=columns[:width])

def canonicalise_df(df, source=None):
    """Initial cleaning of all dataframes"""
    #from pandas._libs.tslib import OutOfBoundsDatetime
    if source:
        print("Canonicalising %s" % source)
    df.rename(columns={'Sample Code':'Sample Number',
                       'Sample Number:':'Sample Number',
                       'Sample Num':'Sample Number',
                       'Sample Number i.e F0XXX' : 'Sample Number'
                      }, inplace=True)
    def fix_timestamp(x):
        """Horror to fix mixed date formats"""
        return pd.to_datetime(str(x), format='%d/%m/%Y %H:%M:%S')
#         try:
#             x = pd.to_datetime(x)
#         except:
#             try:
#                 # '12/08/2018 12:26:15'
#                 x = pd.to_datetime(x, format='%d/%m/%Y %H:%M:%S')
#             except:
#                 #'Thu 9/08 - 12:19'
#                 x = pd.to_datetime(x, format='%a %m/%y - %H:%M')
#         return x
    if 'Timestamp' in df.columns:
        df.loc[:, 'Timestamp'] = df['Timestamp'].map(fix_timestamp)
    df.loc[:, 'Sample Number'] = df['Sample Number'].apply(fix_sample_number)
    df.dropna(subset=['Sample Number'])
    #df.sort_values(['Sample Number'], ascending=True, inplace=True)
    return df

def get_data(service, SPREADSHEET_ID):

    CATALOG_RANGE = 'Catalog!A:R'
    FTIR_RANGE = 'FTIR!A:X'
    REAGENT_RANGE = 'Reagent!A:W'
    MLA_RANGE = 'MLA!A:R'
    HR_RANGE = 'Interventions!A:BJ'

    #service = gsheets_service()

    df_catalog = get_df(service, SPREADSHEET_ID, CATALOG_RANGE)
    df_catalog = canonicalise_df(df_catalog, source='catalog')
    df_ftir = get_df(service, SPREADSHEET_ID, FTIR_RANGE)
    df_ftir = canonicalise_df(df_ftir, source='ftir')
    df_reagent = get_df(service, SPREADSHEET_ID, REAGENT_RANGE)
    df_reagent = canonicalise_df(df_reagent, source='reagent')
    df_mla = get_df(service, SPREADSHEET_ID, MLA_RANGE, mla=True)
    df_mla = canonicalise_df(df_mla, source='mla')
    df_hr = get_df(service, SPREADSHEET_ID, HR_RANGE)
    df_hr = canonicalise_df(df_hr, source='hr')

    #DataFrames = namedtuple('DataFrames', ['catalog', 'ftir', 'reagent','mla', 'hr'])
    return DataFrames(
        catalog=df_catalog,
        ftir=df_ftir,
        reagent=df_reagent,
        mla=df_mla,
        hr=df_hr)
    return


# The ID and range of a sample spreadsheet.
BOOMTOWN2018_SPREADSHEET_ID = '1RiA-FwG_954Ger2VPsOSA3JLh-7sEoTYr40eVS0mp24'
MADE2018_SPREADSHEET_ID = '1daXdyL6uL8qnMsEsP0RLZE9nDzt6J7Zr1ygQdguvi-E'
BOARDMASTERS2018_SPREADSHEET_ID = '1U1lhUWLazDBN-wb2eZM8YV674f46npVfQK3XUVZjPow'

data = {}
service = gsheets_service()
print "PROCESSING BOOMTOWN"
data['boomtown'] = get_data(service, BOOMTOWN2018_SPREADSHEET_ID)
print "PROCESSING BOARDMASTERS"
data['boardmasters'] = get_data(service, BOARDMASTERS2018_SPREADSHEET_ID)
print "PROCESSING MADE"
data['made'] = get_data(service, MADE2018_SPREADSHEET_ID)

import pickle
with open('foo_multi.pkl','w') as w:
    pickle.dump(data, w)
    
dfs = data['boomtown']

PROCESSING BOOMTOWN
Canonicalising catalog
!!! Bad ID R0876
Canonicalising ftir
!!! Bad ID F00129
!!! Bad ID F0005
!!! Bad ID Z1000
!!! Bad ID B0076
Canonicalising reagent
!!! Bad ID Z1000
!!! Bad ID Yellow > green
!!! Bad ID F1819 (or 1827?)
Canonicalising mla
Canonicalising hr
!!! Bad ID G0037
!!! Bad ID G0242
!!! Bad ID G0153
!!! Bad ID G0024
!!! Bad ID G0652
!!! Bad ID G9999
!!! Bad ID G0877
!!! Bad ID G0878
!!! Bad ID G0811
!!! Bad ID G1441
!!! Bad ID G1216
!!! Bad ID G1228
!!! Bad ID G1229
!!! Bad ID G1398
!!! Bad ID G1284
!!! Bad ID G1572
!!! Bad ID G0875
!!! Bad ID G1833
!!! Bad ID G1703
!!! Bad ID G1860
!!! Bad ID G1859
!!! Bad ID G1699
!!! Bad ID G1686
!!! Bad ID G1312
!!! Bad ID G0420
!!! Bad ID G0983
!!! Bad ID G0981
PROCESSING BOARDMASTERS
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
Canonicalising hr
PROCESSING MADE
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
Canonicalising hr


In [17]:
with open('foo_multi.pkl') as f:
    data = pickle.load(f)
dfs = data['made']

# Check for duplicates
catalog_duplicates = dfs.catalog.loc[dfs.catalog['Sample Number'].duplicated(), 'Sample Number'].values
if catalog_duplicates.size > 0:
    print("### %d duplicated catalog sample numbers ###" % len(catalog_duplicates))
    dfs.catalog[dfs.catalog['Sample Number'].duplicated(keep=False)].to_csv('catalog_duplicates.csv')
    
ftir_duplicates = dfs.ftir.loc[dfs.ftir['Sample Number'].duplicated(), 'Sample Number'].values
if ftir_duplicates.size > 0:
    print("### %d duplicated FTIR sample numbers ###" % len(ftir_duplicates))
    dfs.ftir[dfs.ftir['Sample Number'].duplicated(keep=False)].to_csv('ftir_duplicates.csv')

reagent_duplicates = dfs.reagent.loc[dfs.reagent['Sample Number'].duplicated(), 'Sample Number'].values
if reagent_duplicates.size > 0:
    print("### %d duplicated catalog sample numbers ###" % len(reagent_duplicates))    
    dfs.reagent[dfs.reagent['Sample Number'].duplicated(keep=False)].to_csv('reagent_duplicates.csv', encoding = 'utf-8')

hr_duplicates = dfs.hr.loc[dfs.hr['Sample Number'].duplicated(), 'Sample Number'].values
if hr_duplicates.size > 0:
    print("### %d duplicated HR sample numbers ###" % len(hr_duplicates))
    dfs.hr[dfs.hr['Sample Number'].duplicated(keep=False)].to_csv('hr_duplicates.csv', encoding = 'utf-8')

mla_duplicates = dfs.mla.loc[dfs.mla['Sample Number'].duplicated(), 'Sample Number'].values
if mla_duplicates.size > 0:
    print("### %d duplicated MLA sample numbers ###" % len(mla_duplicates))
    dfs.mla[dfs.mla['Sample Number'].duplicated(keep=False)].to_csv('mla_duplicates.csv')
    
if dfs.catalog['Sample Number'].duplicated().any() or \
    dfs.ftir['Sample Number'].duplicated().any() or \
    dfs.reagent['Sample Number'].duplicated().any() or \
    dfs.hr['Sample Number'].duplicated().any() or \
    dfs.mla['Sample Number'].duplicated().any():

    outs = 'Please fix duplicated values'
    print(outs)
#     raise RuntimeError(outs)

### 1 duplicated catalog sample numbers ###
### 2 duplicated FTIR sample numbers ###
Please fix duplicated values


In [18]:
# Check there are no sample numbers in any of the other spreadsheets that aren't in the cataolog sheet
catalog_unique = set(dfs.catalog['Sample Number'].unique())

ftir_unique = set(dfs.ftir['Sample Number'].unique())
ftir_orphan = ftir_unique.difference(catalog_unique)
if ftir_orphan:
    print("Orphaned FTIR sample numbers: %s" % sorted(ftir_orphan))

reagent_unique = set(dfs.reagent['Sample Number'].unique())
reagent_orphan = reagent_unique.difference(catalog_unique)
if reagent_orphan:
    print("Orphaned Reagent Test sample numbers: %s" % sorted(reagent_orphan))

hr_unique = set(dfs.hr['Sample Number'].unique())
hr_orphan = hr_unique.difference(catalog_unique)
if hr_orphan:
    print("Orphaned HR sample numbers: %s" % sorted(hr_orphan))
    
mla_unique = set(dfs.mla['Sample Number'].unique()).difference(catalog_unique)
mla_orphan = mla_unique.difference(catalog_unique)
if mla_orphan:
    print("Orphaned MLA sample numbers: %s" % sorted(mla_orphan))
    
# Check for any that are only in the catalog
outside_catalog = set.union(ftir_unique, reagent_unique, hr_unique, mla_unique)
catalog_orphan = catalog_unique.difference(outside_catalog)
if catalog_orphan:
    print("Orphaned catalog sample numbers: %s" % sorted(catalog_orphan))
    
# Check for any that aren't in FTIR and don't have anything in reagent test
ftir_missing = catalog_unique.difference(ftir_unique).difference(reagent_unique).difference(catalog_orphan)
if len(ftir_missing):
    print("Samples not in FTIR or Reagent: %s" % sorted(ftir_missing))

all_unique = copy.copy(ftir_unique)
all_unique.update(reagent_unique, hr_unique, mla_unique)
if (all_unique or catalog_only):
    outs = "### Please fix orphaned/catalog only samples ###"
    print(outs)
    #raise RuntimeError(outs)

Orphaned catalog sample numbers: ['A0051', 'A0052', 'A0053', 'A0054', 'A0056', 'F0006']
Samples not in FTIR or Reagent: ['F0004', 'F0010', 'F0014', 'F0033', 'F0042', 'F0072']
### Please fix orphaned/catalog only samples ###


In [42]:
# Standardise names
d = {
    'Sample Advertised/Acquired/Sold As' : 'Sold As',
    'Sample Sold As' : 'Sold As',
    
    'Has the Service User or a close friend tried this batch?' : 'Already Tried',
    
    'Your initials' : 'Tester',
    'Your name and first initial' : 'Tester',
    'Your name and surname initial' : 'Tester'
}
dfs.catalog.rename(columns=d, inplace=True)
dfs.ftir.rename(columns=d, inplace=True)
dfs.reagent.rename(columns=d, inplace=True)
dfs.hr.rename(columns=d, inplace=True)

def clean_df(df):
    sample_form_d = { 'pill' : ['Ecstasy Tablet',
                                'ecstacy pill',
                                'Non-pharmaceutical tablet (ecstasy etc)',
                                'other recreational pill',
                                 'Whole pill',
                                'Other pill',
                                'Pharmaceutical'],
                  'partial pill' : ['Partial ecstasy pill',
                                    'Partial 2C-B pill',
                                    'Crushed tablet'],
                  'powder' : ['Powder/capsule/bomb/crystal',
                              'Powder or crushed pill',
                              'Crystal, Capsule or Powder'],
                  'liquid' : ['*Cannabinoid liquid',
                               '*Viscous liquid',
                              'Dissolved in Propylene Glycol',
                              'Oil'],
                   'tab' : ['blotter', 'LSD Tab']
                      }


    # Firstly convert all columns to lower case and remove any spaces
    def clean(value):
        if type(value) in [str, unicode]:
            value = value.strip().lower()
        return value

    for column in ['Sample Form']:
        df[column] = df[column].map(clean, na_action='ignore')
    
    replace_d = {}
    for column in ['Sample Form']:
        replace_d[column] = {}
        for drug, names in sample_form_d.items():
            for name in names:
                replace_d[column][name.lower()] = drug
    
    # Replace values
    df.replace(replace_d, inplace=True)
    
clean_df(dfs.catalog)
print(dfs.catalog['Sample Form'].unique())

['partial pill' 'powder' 'pill' 'tab' u'brown lump' 'liquid' u'paste'
 u'gel' u'fruit pastille']


In [24]:

# c_sn = 'F0516'
# f_sn = 'A1439'
# c = dfs.catalog.loc[dfs.catalog['Sample Number'] == c_sn, ['Sold As', 'Sample Form', 'Already Tried', 'User Suspicion']]
# ct = dfs.catalog.loc[dfs.catalog['Sample Number'] == c_sn, ['Timestamp']].iat[0,0]
# f = dfs.ftir.loc[dfs.ftir['Sample Number'] == f_sn, ['Sold As', 'Sample Form', 'Already Tried', 'User Suspicion']]
# ft = dfs.ftir.loc[dfs.ftir['Sample Number'] == f_sn, ['Timestamp']].iat[0,0]
# if np.all(c.values == f.values) and ft > ct:
#     print("Catalog %s matches FTIR %s")

for sample_number in catalog_unique.union(ftir_unique):
    print("CHECKING ", sample_number)
    c = dfs.catalog.loc[dfs.catalog['Sample Number'] == sample_number, ['Sold As', 'Sample Form', 'Already Tried', 'User Suspicion']]
    ct = dfs.catalog.loc[dfs.catalog['Sample Number'] == sample_number, ['Timestamp']].iat[0,0]
    f = dfs.ftir.loc[dfs.ftir['Sample Number'] == sample_number, ['Sold As', 'Sample Form', 'Already Tried', 'User Suspicion']]
    ft = dfs.ftir.loc[dfs.ftir['Sample Number'] == sample_number, ['Timestamp']].iat[0,0]
    print("%s - %s" % (c.values == f.values, ft > ct))
    if np.all(c.values == f.values) and ft > ct:
        print("Catalog matches FTIR for %s" % sample_number)
    else:
        print("NO MATCH Catalog matches FTIR for %s" % sample_number)
        print("%s vs\n%s" % (c.values, f.values))

if False:
    for c_sn in sorted(list(catalog_orphan) + catalog_duplicates.tolist()):
        for f_cn in sorted(list(ftir_orphan) + ftir_duplicates.tolist()):
            print("CHECKING ",c_sn, f_sn)
            c = dfs.catalog.loc[dfs.catalog['Sample Number'] == c_sn, ['Sold As', 'Sample Form', 'Already Tried', 'User Suspicion']]
            ct = dfs.catalog.loc[dfs.catalog['Sample Number'] == c_sn, ['Timestamp']].iat[0,0]
            f = dfs.ftir.loc[dfs.ftir['Sample Number'] == f_sn, ['Sold As', 'Sample Form', 'Already Tried', 'User Suspicion']]
            ft = dfs.ftir.loc[dfs.ftir['Sample Number'] == f_sn, ['Timestamp']].iat[0,0]
            print("%s - %s" % (c.values == f.values, ft > ct))
            if np.all(c.values == f.values) and ft > ct:
                print("Catalog %s matches FTIR %s" % c_sn, f_sn)



('CHECKING ', 'F0050')
[[ True False  True  True]] - True
NO MATCH Catalog matches FTIR for F0050
[[u'MDMA' u'Ecstasy pill' u'No' u'']] vs
[[u'MDMA' u'Non-pharmaceutical tablet (ecstasy etc)' u'No' u'']]
('CHECKING ', 'F0059')
[[ True False  True  True]] - True
NO MATCH Catalog matches FTIR for F0059
[[u'MDMA' u'Ecstasy pill' u'Yes' u'Roughly what it was sold as']] vs
[[u'MDMA' u'Non-pharmaceutical tablet (ecstasy etc)' u'Yes'
  u'Roughly what it was sold as']]
('CHECKING ', 'F0018')
[[ True False  True  True]] - True
NO MATCH Catalog matches FTIR for F0018
[[u'MDMA' u'Partial ecstasy pill' u'Yes' u'Roughly what it was sold as']] vs
[[u'MDMA' u'Non-pharmaceutical tablet (ecstasy etc)' u'Yes'
  u'Roughly what it was sold as']]
('CHECKING ', 'F0014')
[[ True False  True  True]] - True
NO MATCH Catalog matches FTIR for F0014
[[u'MDMA' u'Partial ecstasy pill' u'No' u'']] vs
[[u'MDMA' u'Crystal, Capsule or Powder' u'No' u'']]
('CHECKING ', 'F0015')
[[ True False  True  True]] - True
NO MATC

IndexError: index 0 is out of bounds for axis 0 with size 0

In [32]:
print(dfs.catalog['Sample Form'].unique())
print(dfs.ftir['Sample Form'].unique())
print(dfs.reagent['Sample Form'].unique())
print(dfs.hr['Sample Form'].unique())



['partial pill' 'powder' u'Other recreational pill' u'Ecstasy pill' 'tab'
 u'Brown lump' u'Blotter' u'Tab' 'liquid' u'Ecstacy pill' 'pill' u'Liquid'
 u'Paste' u'Gel' u'LSD TAB' u'Fruit pastille' u'BLOTTER']
[u'Crystal, Capsule or Powder' u'Non-pharmaceutical tablet (ecstasy etc)'
 u'Hash' u'Dissolved in Propylene Glycol' u'Pharmaceutical' u'Extract'
 u'Oil']
[u'Ecstasy Tablet' u'Crushed tablet' u'tab' u'Tab' u'Partial pill'
 u'Crystal, Capsule or Powder' u'pill' u'Blotter' u'Other pill'
 u'Whole pill' u'partial pill' u'Rizla' u'Full bottle' u'Gel capsule'
 u'Pharmaceutical' u'Jelly ' u'Sweets']


KeyError: 'Sample Form'

In [None]:
for c_sn in sorted(list(catalog_orphan)):
    for f_cn in sorted(list(ftir_orphan)):
        print("CHECKING ",c_sn, f_sn)
        c = dfs.catalog.loc[dfs.catalog['Sample Number'] == c_sn, ['Sold As', 'Sample Form', 'Already Tried', 'User Suspicion']]
        ct = dfs.catalog.loc[dfs.catalog['Sample Number'] == c_sn, ['Timestamp']].iat[0,0]
        f = dfs.ftir.loc[dfs.ftir['Sample Number'] == f_sn, ['Sold As', 'Sample Form', 'Already Tried', 'User Suspicion']]
        ft = dfs.ftir.loc[dfs.ftir['Sample Number'] == f_sn, ['Timestamp']].iat[0,0]
        print("%s - %s" % (c.values == f.values, ft > ct))
        if np.all(c.values == f.values) and ft > ct:
            print("Catalog %s matches FTIR %s" % c_sn, f_sn)

In [6]:
# Clean up catalog
# Drop all unwanted columns

#  or 'Your initials'
l = set(['Your initials',
         'Your name and first initial',
         'Which device was a photo taken with? Who does it belong to?',
         'Is a breakline present?',
         'Unusual appearance'
        ])

to_drop = set(dfs.catalog.columns).intersection(l)
dfs.catalog.drop(to_drop, axis=1, inplace=True)

d = {
    'Timestamp' : 'Catalog timestamp',
    'Sample Advertised/Acquired/Sold As': 'Catalog Sold As',
    'Sample Form' : 'Catalog form',
    'Has the Service User or a close friend tried this batch?': 'Catalog tried',
    'What is the mass? (mg)': 'Full pill mass',
    'What is the shape of the pill?': 'Pill shape',
    'What is the logo?': 'Pill logo',
    'What colour is the pill?': 'Pill colour'
}
dfs.catalog.rename(columns=d, inplace=True)

In [7]:
# For FTIR columns need to merge the data from the 'Compound detected', 'Hit Confidence.1' columns into the
# 'Substance detected', 'Hit Confidence' column where the substance detected was 'other'
print("COLS ",dfs.ftir.columns)
print("SS ",dfs.ftir['Substance detected'][:5])
mask = dfs.ftir['Substance detected'] != 'Other'
dfs.ftir['Substance detected'].where(mask, dfs.ftir['Compound detected'], inplace=True) # Copy values from 'Compound detected'
dfs.ftir['Hit Confidence'].where(mask, dfs.ftir['Hit Confidence.1'], inplace=True)
dfs.ftir.drop(['Compound detected', 'Hit Confidence.1', 'Brief Note'], axis=1, inplace=True)

mask = dfs.ftir['Compound detected (Subtraction)'] != 'Other'
dfs.ftir['Compound detected (Subtraction)'].where(mask, df_ftir['Substance detected.1'], inplace=True) # Copy values from 'Compound detected'
dfs.ftir['Hit Confidence.2'].where(mask, dfs.ftir['Hit Confidence.3'], inplace=True)
dfs.ftir.drop(['Substance detected.1', 'Hit Confidence.3', 'Brief Note.1'], axis=1, inplace=True)

# Drop all unwanted columns
l = ['Your name and surname initial',
     'User Suspicion',
     'Is anything detected after subtraction analysis?',
     'Analysis required', 
     'Next action(s)',
     'Send to HR team'
    ]
#'Note for harm reduction worker'
to_drop = set(dfs.ftir.columns).intersection(l)
dfs.ftir.drop(to_drop, axis=1, inplace=True)

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
d = {
    'Timestamp' : 'FTIR timestamp',
    'Sample Sold As': 'FTIR Sold As',
    'Sample Form' : 'FTIR form',
    'Has the Service User or a close friend tried this batch?': 'FTIR tried',
    'Substance(s) detected' : 'FTIR final result',
    'Substance detected' : 'FTIR result1',
    'Hit Confidence' :  'FTIR hit1',
    'Is anything detected after subtraction analysis?' : 'FTIR subtraction positive',
    'Compound detected (Subtraction)' :  'FTIR result2',
    'Hit Confidence.2' :  'FTIR hit2',
    '"Strength" of powdered substance' : 'FTIR Powder Strength',
    'Does the substance detected match the substance that was advertised?' : 'FTIR Matches Sold As',
}
dfs.ftir.rename(columns=d, inplace=True)

('COLS ', Index([                                                           u'Timestamp',
                                                              u'Sample Number',
                                                                     u'Tester',
                                                                    u'Sold As',
                                                                u'Sample Form',
                                                              u'Already Tried',
                                                             u'User Suspicion',
                                                         u'Substance detected',
                                                             u'Hit Confidence',
                                                          u'Compound detected',
                                                           u'Hit Confidence.1',
                                                                 u'Brief Note',
                           u'I

NameError: name 'df_ftir' is not defined

In [None]:
# Clean up HR form

# Drop all unwanted columns
l = ['HR worker name:']
dfs.hr.drop(l, axis=1, inplace=True)

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
d = {
    'Timestamp' : 'HR timestamp',
    'You submitted a substance for analysis. What were you told it was when you got it?': 'HR Sold as',
    'Had you already tried this substance before getting it tested?': 'HR tried',
    'What was your first sample number at this event? Did you take a photo or keep the ticket?': 'Previous Sample Number'
}
dfs.hr.rename(columns=d, inplace=True)

In [None]:
# Catalog and FTIR data frames
df_all = pd.merge(dfs.catalog, dfs.ftir, how='left', on=['Sample Number'])

In [None]:
# Merge in any reagent test data
df_all = pd.merge(df_all, dfs.reagent[['Sample Number', 'Reagent Result']], how='left', on=['Sample Number'])

In [None]:
# Merge in any pill strength data
df_all = pd.merge(df_all, dfs.mla[['Sample Number', 'MDMA / tablet (mg)', '% MDMA content']], how='left', on=['Sample Number'])

In [None]:
# Merge in HR data
df_all = pd.merge(df_all, dfs.hr, how='left', on=['Sample Number'])

In [None]:
# Fix column orders
prefix = ['Sample Number',
          'Catalog timestamp', 'FTIR timestamp', 'HR timestamp',
          'Catalog Sold As', 'FTIR Sold As','HR Sold as', 
          'Catalog form', 'FTIR form',
          'Catalog tried', 'FTIR tried', 'HR tried']
columns = [c for c in df_all.columns if c not in prefix]
columns = prefix + columns
df_all = df_all[columns]
df_all.to_csv('foo.csv')