Issues with MADE dataset:
FTIR
* Dodgy sample numbers
* 2 Hit confidence columns
* 2 substance detected columns

In [25]:
# Module imports
import datetime
import os
import numpy as np
import pandas as pd
from collections import OrderedDict

def fix_sample_number(x):
    """Normalise a raw sample number to a prefixed ID such as ``'F0012'``.

    Values that can be converted with ``int()`` are given an ``F`` prefix and
    zero-padded to four digits (``12`` -> ``'F0012'``). Anything else is
    converted to a string, stripped of surrounding whitespace and passed
    through ``str.capitalize()`` (first character upper-cased, the remainder
    lower-cased). The digits of such string IDs are NOT validated; only the
    prefix letter is checked.

    Parameters
    ----------
    x : int, float or str
        Raw sample number, e.g. a cell value from one of the spreadsheets.

    Returns
    -------
    str or float
        The normalised sample number. A float NaN is returned unchanged.

    Raises
    ------
    AssertionError
        If the normalised value is empty or does not start with one of the
        prefix letters ``A``, ``F`` or ``W``.
    """
    if isinstance(x, float) and np.isnan(x):
        return x  # leave NaNs alone so missing values stay missing
    try:
        sn = 'F{:04d}'.format(int(x))
    except (TypeError, ValueError):
        # Not a plain number: treat it as an already-prefixed ID. Stripping
        # first means stray whitespace from the spreadsheet is tolerated.
        sn = str(x).strip().capitalize()
    # Slice (sn[:1]) rather than index (sn[0]) so that an empty ID is reported
    # as a bad ID instead of raising IndexError.
    assert sn[:1] in ('A', 'F', 'W'), "Bad ID %s" % sn
    return sn

def now():
    """Return the current local date and time as a ``dd/mm/yy HH:MM:SS`` string."""
    timestamp_format = "%d/%m/%y %H:%M:%S"
    current = datetime.datetime.now()
    return current.strftime(timestamp_format)

In [26]:
# Locations of the four MADE spreadsheets (exported as CSV)
ftir_csv = 'MADE/FTIR Analysis Data Recording Form.csv'
catalog_csv = 'MADE/Sample Cataloguing Form.csv'
reagent_csv = 'MADE/Reagent Outcomes.csv'
hr_csv = 'MADE/MADE MAST Intervention Questionnaire.csv'

# Every sheet is read the same way: python parser engine, 'Timestamp' parsed as a date
date_cols = ['Timestamp']
read_options = dict(engine="python", parse_dates=date_cols)
df_ftir = pd.read_csv(ftir_csv, **read_options)
df_catalog = pd.read_csv(catalog_csv, **read_options)
df_reagent = pd.read_csv(reagent_csv, **read_options)
df_hr = pd.read_csv(hr_csv, **read_options)

# The reagent and HR sheets label the sample-number column differently;
# bring them in line with the other two sheets.
df_reagent = df_reagent.rename(columns={'Sample Code': 'Sample Number'})
df_hr = df_hr.rename(columns={'Sample Number:': 'Sample Number'})

# Normalise the sample numbers in every sheet (see fix_sample_number: bare
# numbers become 'F' + four digits, prefixed IDs are capitalised and checked).
for frame in (df_ftir, df_catalog, df_reagent, df_hr):
    frame['Sample Number'] = frame['Sample Number'].apply(fix_sample_number)

In [27]:
# Every sample number used in the FTIR, reagent and HR sheets should also be
# present in the catalogue sheet. Report any that are not.
csn = set(df_catalog['Sample Number'].unique())

ftir_unique = set(df_ftir['Sample Number'].unique()) - csn
reagent_unique = set(df_reagent['Sample Number'].unique()) - csn
hr_unique = set(df_hr['Sample Number'].unique()) - csn

for label, orphans in (("FTIR", ftir_unique),
                       ("Reagent Test", reagent_unique),
                       ("HR", hr_unique)):
    if orphans:
        print("Orphaned %s sample numbers: %s" % (label, orphans))

# NB: from here on ftir_unique holds the orphans of ALL three sheets, not just FTIR.
ftir_unique |= reagent_unique | hr_unique
if ftir_unique:
    # Only warn for now; the hard failure is deliberately left disabled.
    print("Please fix orphaned samples")
    #raise RuntimeError("Please fix orphaned samples")

Orphaned FTIR sample numbers: set(['F0547'])
Orphaned HR sample numbers: set(['F0000'])
Please fix orphaned samples


Add FTIR, Reagent and MAST results to a single spreadsheet
From FTIR:
'Sample Sold As'
'Sample Form'
'Has the Service User or a close friend tried this batch?'

'Substance detected'
'Hit Confidence'
'Compound detected'
'Hit Confidence'
'Is anything detected after subtraction analysis?'
'Compound detected (Subtraction)'
'Hit Confidence'
'Analysis required'
'Substance(s) detected'
'"Strength" of powdered substance'
'Does the substance detected match the substance that was advertised?'
'Note for harm reduction worker'

From Reagent
'Sample Sold As'
'Sample Form'

'Froehde'
'Froehde possible substances'
'Liebermann'
'Liebermann possible substances'
'Marquis'
'Marquis possible substances'
'Mandelin'
'Mandelin possible substances'
'Ehrlich'
'Likely drug or class'
'Substance(s) detected'
'Does the substance detected match the substance that was advertised?'

MAST
'You submitted a substance for analysis. What were you told it was when you got it?'
'Had you already tried this substance before getting it tested?'


In [28]:
# Where 'Substance detected' is 'Other', the actual result is held in 'Compound detected'
# (and its confidence in 'Hit Confidence.1'), so fold those values into the primary columns.
# Series.where keeps the existing value where the mask is True and takes the replacement elsewhere.
# The result is assigned back to the column rather than using .where(..., inplace=True) on
# df_ftir[col]: that form is a chained in-place update, which does not reliably modify df_ftir
# (it warns on recent pandas and has no effect under Copy-on-Write).
mask = df_ftir['Substance detected'] != 'Other'
df_ftir['Substance detected'] = df_ftir['Substance detected'].where(mask, df_ftir['Compound detected'])
df_ftir['Hit Confidence'] = df_ftir['Hit Confidence'].where(mask, df_ftir['Hit Confidence.1'])
df_ftir.drop(['Compound detected', 'Hit Confidence.1', 'Brief Note'], axis=1, inplace=True)

# Same again for the subtraction analysis: 'Other' entries take their value from
# 'Substance detected.1' and their confidence from 'Hit Confidence.3'.
mask = df_ftir['Compound detected (Subtraction)'] != 'Other'
df_ftir['Compound detected (Subtraction)'] = df_ftir['Compound detected (Subtraction)'].where(mask, df_ftir['Substance detected.1'])
df_ftir['Hit Confidence.2'] = df_ftir['Hit Confidence.2'].where(mask, df_ftir['Hit Confidence.3'])
df_ftir.drop(['Substance detected.1', 'Hit Confidence.3', 'Brief Note.1'], axis=1, inplace=True)

# Drop all unwanted columns
l = ['Your name and surname initial',
    'Is anything detected after subtraction analysis?',
    'Analysis required',
    'Note for harm reduction worker'
    ]
df_ftir.drop(l, axis=1, inplace=True)

# Rename the remaining columns so that their FTIR origin is explicit once the sheets are combined.
# 'Is anything detected after subtraction analysis?' is intentionally absent from this map:
# it has just been dropped above, so a rename entry for it could never match.
# NOTE(review): 'Hit Confidence.2' (the confidence paired with 'FTIR result2') and
# 'User Suspicion' keep their original names - confirm whether they should be renamed
# (e.g. to 'FTIR hit2') or dropped.
d = {
    'Timestamp' : 'FTIR timestamp',
    'Sample Sold As': 'FTIR Sold As',
    'Sample Form' : 'FTIR form',
    'Has the Service User or a close friend tried this batch?': 'FTIR tried',
    'Substance(s) detected' : 'FTIR final result',
    'Substance detected' : 'FTIR result1',
    'Hit Confidence' :  'FTIR hit1',
    'Compound detected (Subtraction)' :  'FTIR result2',
    '"Strength" of powdered substance' : 'Powder Strength',
    'Does the substance detected match the substance that was advertised?' : 'FTIR Matches Sold As',
}

df_ftir.rename(columns=d, inplace=True)
# ftir_columns = ['Sample Number'] + list(d.values())
print("Columns now: %s" % df_ftir.columns)



Columns now: Index([u'FTIR timestamp', u'Sample Number', u'FTIR Sold As', u'FTIR form',
       u'FTIR tried', u'User Suspicion', u'FTIR result1', u'FTIR hit1',
       u'FTIR result2', u'Hit Confidence.2', u'FTIR final result',
       u'Powder Strength', u'FTIR Matches Sold As'],
      dtype='object')


In [29]:
# Save the tidied FTIR frame (with its renamed columns) to a CSV file
ftir_output_csv = 'foo.csv'
df_ftir.to_csv(ftir_output_csv)

In [None]:
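# # Merge all the data frames together
# # TODO: placeholder - `df`, `lab_df` and the 'Festival' key are not defined in the cells shown above
# dfall = pd.merge(df, lab_df, how='left', on=['Festival'])
# print("%d entries were merged" % len(dfall))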