Issues with MADE dataset:
FTIR
* Inconsistently formatted sample numbers
* Two 'Hit Confidence' columns
* Two 'Substance detected' columns

In [1]:
# Module imports
import copy
import datetime
import os
import numpy as np
import pandas as pd
from collections import OrderedDict

def fix_sample_number(x):
    """Normalise a raw sample number to the form ADDDD.

    A is one of the prefix letters A, F or W and DDDD is a number zero-padded
    to (at least) four digits, e.g. 'F0023'.

    Args:
        x: Raw cell value. May be a float NaN (missing), a number or numeric
            string (e.g. 23, 23.0, '23'), or an already-prefixed string such
            as 'f0023'. Surrounding whitespace on strings is ignored.

    Returns:
        x itself when it is a float NaN, otherwise the normalised string.
        Values that convert to int get the 'F' prefix and are zero-padded to
        four digits; any other value is stringified, stripped and capitalised.

    Raises:
        AssertionError: if the normalised value does not start with one of
            'A', 'F' or 'W' (this includes an empty/blank string).
    """
    if isinstance(x, float) and np.isnan(x):
        return x  # leave NaN's alone
    try:
        # Bare numbers are assumed to be 'F' samples
        sn = 'F{:04d}'.format(int(x))
    except ValueError:
        # Assume string so make sure it's of the right format. Strip first:
        # capitalize() only upper-cases the very first character, so leading
        # whitespace would otherwise leave the prefix letter in lower case.
        sn = str(x).strip().capitalize()
    # sn[:1] (rather than sn[0]) so a blank value is reported as a bad ID
    # instead of raising IndexError. Raised explicitly so the check still runs
    # when Python is started with -O.
    if sn[:1] not in ('A', 'F', 'W'):
        raise AssertionError("Bad ID %s" % sn)
    return sn

def now():
    """Return the current local date and time as a 'dd/mm/yy HH:MM:SS' string."""
    timestamp_format = "%d/%m/%y %H:%M:%S"
    current = datetime.datetime.now()
    return current.strftime(timestamp_format)

In [2]:
# Paths of the CSV exports of each data-entry form
ftir_csv = 'MADE/FTIR Analysis Data Recording Form.csv'
catalog_csv = 'MADE/Sample Cataloguing Form.csv'
reagent_csv = 'MADE/Reagent Outcomes.csv'
hr_csv = 'MADE/MADE MAST Intervention Questionnaire.csv'

# Every form has a 'Timestamp' column that should be parsed as a datetime
date_cols = ['Timestamp']
df_ftir, df_catalog, df_reagent, df_hr = [
    pd.read_csv(csv_path, engine="python", parse_dates=date_cols)
    for csv_path in (ftir_csv, catalog_csv, reagent_csv, hr_csv)
]

# Give the sample number (and the reagent result) the same column name in every frame
df_reagent = df_reagent.rename(columns={'Sample Code':'Sample Number', 'Substance(s) detected' : 'Reagent Result'})
df_hr = df_hr.rename(columns={'Sample Number:':'Sample Number'})

# Normalise the sample numbers in every form (see fix_sample_number)
for form_df in (df_ftir, df_catalog, df_reagent, df_hr):
    form_df['Sample Number'] = form_df['Sample Number'].apply(fix_sample_number)

In [3]:
# Load the 'MLA' worksheet of the event results workbook.
mla_excel = 'MADE/MADE - Loop 2018 event results sheet_.xlsx'
# 'sheet_name' is the supported keyword; the old spelling 'sheetname' is
# deprecated and is not accepted by current pandas releases.
# header=1: the column names are taken from the second row of the sheet.
df_mla = pd.read_excel(mla_excel, sheet_name='MLA', header=1)
df_mla.rename(columns={'Sample Num':'Sample Number'}, inplace=True)
df_mla['Sample Number'] = df_mla['Sample Number'].apply(fix_sample_number)
# Prune down to valid sample numbers (fix_sample_number leaves missing values as NaN)
df_mla = df_mla[df_mla['Sample Number'].notnull()]

In [4]:
# Check for duplicates
# Report every row whose sample number occurs more than once within a single
# form, labelling the output with the form the rows actually came from.
duplicate_checks = [
    ('catalog', df_catalog),
    ('FTIR', df_ftir),
    ('reagent', df_reagent),
    ('HR', df_hr),
    ('MLA', df_mla),
]

has_duplicates = False
for label, frame in duplicate_checks:
    # keep=False marks every member of a duplicated group, not just the repeats,
    # so the printed rows can be compared side by side.
    dup_mask = frame['Sample Number'].duplicated(keep=False)
    if dup_mask.any():
        has_duplicates = True
        print("Duplicated %s sample numbers" % label)
        print(frame[dup_mask])

if has_duplicates:
    outs = 'Please fix duplicated values'
    print(outs)
    # Currently only a warning; uncomment to make duplicates a hard failure.
#     raise RuntimeError(outs)

Duplicated catalog sample numbers
             Timestamp Sample Number Your initials Sample Source  \
22 2018-07-28 14:34:58         F0023            JF        Public   
27 2018-07-28 15:10:42         F0023            JF        Public   

   Sample Advertised/Acquired/Sold As  \
22                               MDMA   
27                               MDMA   

   Has the Service User or a close friend tried this batch? User Suspicion  \
22                                                 No                  NaN   
27                                                 No                  NaN   

             Sample Form What is the logo?  \
22  Partial ecstasy pill               n/a   
27  Partial ecstasy pill               NaN   

   Which device was a photo taken with? Who does it belong to?  \
22   N/A: fragment too small, doesn't warrant a photo            
27                                                NaN            

    What is the mass? (mg) What is the shape of the pill?  \
22 

In [5]:
# Check there are no sample numbers in any of the other spreadsheets that aren't in the catalog sheet
catalog_unique = set(df_catalog['Sample Number'].unique())
ftir_ids = set(df_ftir['Sample Number'].unique())
reagent_ids = set(df_reagent['Sample Number'].unique())
hr_ids = set(df_hr['Sample Number'].unique())
mla_ids = set(df_mla['Sample Number'].unique())

# Sample numbers present in a form but missing from the catalog ("orphans")
ftir_unique = ftir_ids - catalog_unique
if ftir_unique:
    print("Orphaned FTIR sample numbers: %s" % ftir_unique)

reagent_unique = reagent_ids - catalog_unique
if reagent_unique:
    print("Orphaned Reagent Test sample numbers: %s" % reagent_unique)

hr_unique = hr_ids - catalog_unique
if hr_unique:
    print("Orphaned HR sample numbers: %s" % hr_unique)

mla_unique = mla_ids - catalog_unique
if mla_unique:
    print("Orphaned MLA sample numbers: %s" % mla_unique)

# Check for any that are only in the catalog
outside_catalog = ftir_ids | reagent_ids | hr_ids | mla_ids
catalog_only = catalog_unique - outside_catalog
if catalog_only:
    print("Sample numbers only in catalog: %s" % catalog_only)

# Any orphan in any form, or any catalog-only sample, needs manual attention
all_unique = ftir_unique | reagent_unique | hr_unique | mla_unique
if all_unique or catalog_only:
    outs = "Please fix orphaned/catalog only samples"
    print(outs)
    #raise RuntimeError(outs)

Orphaned FTIR sample numbers: set(['F0547'])
Orphaned HR sample numbers: set(['F0000'])
Sample numbers only in catalog: set(['F0006', 'A0056', 'A0054', 'A0053', 'A0052', 'A0051'])
Please fix orphaned/catalog only samples


In [6]:
# Clean up catalog
# Columns that are not carried over into the merged data
unwanted_catalog_columns = [
    'Your initials',
    'Which device was a photo taken with? Who does it belong to?',
    'Is a breakline present?',
    'Unusual appearance',
]
df_catalog = df_catalog.drop(unwanted_catalog_columns, axis=1)

# Shorter names; columns that other forms also record get a 'Catalog' prefix
catalog_renames = {
    'Timestamp' : 'Catalog timestamp',
    'Sample Advertised/Acquired/Sold As': 'Catalog Sold As',
    'Sample Form' : 'Catalog form',
    'Has the Service User or a close friend tried this batch?': 'Catalog tried',
    'What is the mass? (mg)': 'Full pill mass',
    'What is the shape of the pill?': 'Pill shape',
    'What is the logo?': 'Pill logo',
    'What colour is the pill?': 'Pill colour',
}
df_catalog = df_catalog.rename(columns=catalog_renames)

In [7]:
# For FTIR columns need to merge the data from the 'Compound detected', 'Hit Confidence.1' columns into the
# 'Substance detected', 'Hit Confidence' column where the substance detected was 'Other'.
#
# The results are assigned back to the frame explicitly. Calling
# .where(..., inplace=True) on a column selected from the frame is chained
# in-place mutation, which is not guaranteed to update the parent DataFrame
# (and does not under pandas Copy-on-Write).
mask = df_ftir['Substance detected'] != 'Other'
df_ftir['Substance detected'] = df_ftir['Substance detected'].where(mask, df_ftir['Compound detected']) # Copy values from 'Compound detected'
df_ftir['Hit Confidence'] = df_ftir['Hit Confidence'].where(mask, df_ftir['Hit Confidence.1'])
df_ftir.drop(['Compound detected', 'Hit Confidence.1', 'Brief Note'], axis=1, inplace=True)

# Same again for the subtraction analysis: where the compound was 'Other', take the
# values from the 'Substance detected.1' / 'Hit Confidence.3' columns instead.
mask = df_ftir['Compound detected (Subtraction)'] != 'Other'
df_ftir['Compound detected (Subtraction)'] = df_ftir['Compound detected (Subtraction)'].where(mask, df_ftir['Substance detected.1']) # Copy values from 'Substance detected.1'
df_ftir['Hit Confidence.2'] = df_ftir['Hit Confidence.2'].where(mask, df_ftir['Hit Confidence.3'])
df_ftir.drop(['Substance detected.1', 'Hit Confidence.3', 'Brief Note.1'], axis=1, inplace=True)

# Drop all unwanted columns
l = ['Your name and surname initial',
     'User Suspicion',
     'Is anything detected after subtraction analysis?',
     'Analysis required',
     'Note for harm reduction worker'
    ]
df_ftir.drop(l, axis=1, inplace=True)

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
d = {
    'Timestamp' : 'FTIR timestamp',
    'Sample Sold As': 'FTIR Sold As',
    'Sample Form' : 'FTIR form',
    'Has the Service User or a close friend tried this batch?': 'FTIR tried',
    'Substance(s) detected' : 'FTIR final result',
    'Substance detected' : 'FTIR result1',
    'Hit Confidence' :  'FTIR hit1',
    # NOTE(review): this column is dropped just above, so this entry currently has no
    # effect (rename ignores missing keys) - confirm whether it should be kept or dropped.
    'Is anything detected after subtraction analysis?' : 'FTIR subtraction positive',
    'Compound detected (Subtraction)' :  'FTIR result2',
    'Hit Confidence.2' :  'FTIR hit2',
    '"Strength" of powdered substance' : 'FTIR Powder Strength',
    'Does the substance detected match the substance that was advertised?' : 'FTIR Matches Sold As',
}
df_ftir.rename(columns=d, inplace=True)

In [8]:
# Clean up HR form

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
hr_renames = {
    'Timestamp' : 'HR timestamp',
    'You submitted a substance for analysis. What were you told it was when you got it?': 'HR Sold as',
    'Had you already tried this substance before getting it tested?': 'HR tried',
    'What was your first sample number at this event? Did you take a photo or keep the ticket?': 'Previous Sample Number',
}

# Drop the unwanted worker-name column first, then apply the shorter names
df_hr = (
    df_hr
    .drop(['HR worker name:'], axis=1)
    .rename(columns=hr_renames)
)

In [9]:
# Start the combined frame from the catalog and attach the FTIR results;
# the left join keeps every catalog row, with or without an FTIR match
df_all = df_catalog.merge(df_ftir, how='left', on='Sample Number')

In [10]:
# Attach the reagent test result (only that one column) to each sample
reagent_results = df_reagent[['Sample Number', 'Reagent Result']]
df_all = df_all.merge(reagent_results, how='left', on='Sample Number')

In [11]:
# Attach the pill strength columns from the MLA sheet
mla_strength = df_mla[['Sample Number', 'MDMA / tablet (mg)', '% MDMA content']]
df_all = df_all.merge(mla_strength, how='left', on='Sample Number')

In [12]:
# Attach all remaining HR questionnaire columns
df_all = df_all.merge(df_hr, how='left', on='Sample Number')

In [13]:
# Fix column orders
# Put the sample number first, followed by the columns that several forms
# record, grouped by topic so they can be compared side by side.
prefix = [
    'Sample Number',
    'Catalog timestamp', 'FTIR timestamp', 'HR timestamp',
    'Catalog Sold As', 'FTIR Sold As', 'HR Sold as',
    'Catalog form', 'FTIR form',
    'Catalog tried', 'FTIR tried', 'HR tried',
]
# All other columns follow in their existing order
columns = prefix + [name for name in df_all.columns if name not in prefix]
df_all = df_all[columns]
df_all.to_csv('foo.csv')