In [415]:
# Module imports
import copy
import datetime
import os
import numpy as np
import pandas as pd

def fix_sample_number(x):
    """Make sure all samples numbers are of form: AXXX (where A is one of A, F, W and X is a digit)"""
    if isinstance(x, float) and np.isnan(x):
        return x # leave NaN's alone
    if (isinstance(x, str) or isinstance(x, unicode)) and len(x) == 0:
        return np.nan
    try:
        sn = 'F{:04d}'.format(int(x))
    except ValueError:
        # Assume string so make sure it's of the right format
        sn = str(x).strip().capitalize()
    if len(sn) != 5 or sn[0] not in ['A', 'F', 'W']:
        print("!!! Bad ID \'%s\'" % sn)
    return sn

def now():
    return datetime.datetime.now().strftime("%d/%m/%y %H:%M:%S")

def enumerate_duplicates(row):
    """Append a counter to duplicate labels"""
    SEPARATOR = '.'
    duplicates = {}
    updated_row = []
    for r in row:
        count = duplicates.get(r, 0)
        if count > 0:
            label = "{}{}{}".format(r, SEPARATOR, count)
        else:
            label = r
        updated_row.append(label)
        duplicates[r] = count + 1
    return updated_row


In [426]:
pd.options.mode.chained_assignment = 'raise'

# Need to define in main or we can't pickle the data objects
class DataFrames(object):
    def __init__(self):
        catalog = None
        ftir = None
        reagent = None
        mla = None
        hr = None

def gsheets_service():
    from googleapiclient.discovery import build
    from httplib2 import Http
    from oauth2client import file, client, tools
    # If modifying these scopes, delete the file token.json.
    CREDS_FILE = '/opt/random/MADE/JensDataExportJupyter_client_secret.json'
    SCOPES = 'https://www.googleapis.com/auth/spreadsheets.readonly'
    store = file.Storage('token.json')
    creds = store.get()
    if not creds or creds.invalid:
        import argparse
        flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args([])
        flow = client.flow_from_clientsecrets(CREDS_FILE, SCOPES)
        creds = tools.run_flow(flow, store, flags)
    service = build('sheets', 'v4', http=creds.authorize(Http()))
    return service

def get_df(service, SPREADSHEET_ID, SS_RANGE, mla=False):
    # Call the Sheets API
    result = service.spreadsheets().values().get(spreadsheetId=SPREADSHEET_ID,
                                                range=SS_RANGE).execute()
    values = result.get('values', [])
    if not values:
        print('*** No data found ***')
        return None

    # mla has irrelevant stuff in columns 1 and 3 and sample numbers in first column
    if mla:
        values.pop(0)
        values.pop(1)
        def not_blank(row):
            return len(row[0]) > 0       
    else:
        def not_blank(row):
            return sum(map(len, row[:6])) > 0

    rows = filter(not_blank, values)
    if not rows:
        print('*** No data found after pruning rows! ***')
        return None
    
    columns = enumerate_duplicates(rows[0])
    ncols = len(rows[0])
    row_max = max(map(len, rows[1:]))
    width = min(ncols, row_max)
    return pd.DataFrame(rows[1:], columns=columns[:width])

def canonicalise_df(df, source=None):
    """Initial cleaning of all dataframes"""
    #from pandas._libs.tslib import OutOfBoundsDatetime
    if source:
        print("Canonicalising %s" % source)
    # Standardise names
    d = {
        'Sample Code':'SampleNumber',
        'Sample Number:':'SampleNumber',
        'Sample Number':'SampleNumber',
        'Sample number':'SampleNumber',
        'Sample Num':'SampleNumber',
        'Sample Number i.e F0XXX' : 'SampleNumber',
        
        'Sample Advertised/Acquired/Sold As' : 'SoldAs',
        'Sample Sold As' : 'SoldAs',
        'You submitted a substance for analysis. What were you told it was when you got it?':  'SoldAs',
        
        
        'Sample Source' :'SampleSource',

        'User Suspicion' :'UserSuspicion',

        'Sample Form' :'SampleForm',

        'Has the Service User or a close friend tried this batch?' : 'AlreadyTried',
        'Had you already tried this substance before getting it tested?' : 'AlreadyTried',

        'Your initials' : 'Tester',
        'Your name and first initial' : 'Tester',
        'Your name and surname initial' : 'Tester'
    }
    df.rename(columns=d, inplace=True)
    
    def fix_timestamp(x):
        return pd.to_datetime(str(x), format='%d/%m/%Y %H:%M:%S')
    if 'Timestamp' in df.columns:
        df.loc[:, 'Timestamp'] = df['Timestamp'].map(fix_timestamp)
    df.loc[:, 'SampleNumber'] = df['SampleNumber'].apply(fix_sample_number)
    df.dropna(subset=['SampleNumber'])
    #df.sort_values(['Sample Number'], ascending=True, inplace=True)
    return df

def get_data(service, SPREADSHEET_ID):

    CATALOG_RANGE = 'Catalog!A:R'
    FTIR_RANGE = 'FTIR!A:X'
    REAGENT_RANGE = 'Reagent!A:W'
    MLA_RANGE = 'MLA!A:R'
    HR_RANGE = 'Interventions!A:BJ'

    df_catalog = get_df(service, SPREADSHEET_ID, CATALOG_RANGE)
    df_catalog = canonicalise_df(df_catalog, source='catalog')
    df_ftir = get_df(service, SPREADSHEET_ID, FTIR_RANGE)
    df_ftir = canonicalise_df(df_ftir, source='ftir')
    df_reagent = get_df(service, SPREADSHEET_ID, REAGENT_RANGE)
    df_reagent = canonicalise_df(df_reagent, source='reagent')
    df_mla = get_df(service, SPREADSHEET_ID, MLA_RANGE, mla=True)
    df_mla = canonicalise_df(df_mla, source='mla')
    try:
        df_hr = get_df(service, SPREADSHEET_ID, HR_RANGE)
    except ValueError:
        df_hr = None
    if df_hr is not None:
        df_hr = canonicalise_df(df_hr, source='hr')

    df = DataFrames()
    df.catalog = df_catalog
    df.ftir = df_ftir
    df.reagent = df_reagent
    df.mla = df_mla
    df.hr = df_hr
    
    return df


In [462]:

# The ID and range of a sample spreadsheet.
BOOMTOWN2018_SPREADSHEET_ID = '1RiA-FwG_954Ger2VPsOSA3JLh-7sEoTYr40eVS0mp24'
MADE2018_SPREADSHEET_ID = '1daXdyL6uL8qnMsEsP0RLZE9nDzt6J7Zr1ygQdguvi-E'
BOARDMASTERS2018_SPREADSHEET_ID = '1U1lhUWLazDBN-wb2eZM8YV674f46npVfQK3XUVZjPow'
SW42018_SPREADSHEET_ID = '1agpMmJ9XukeWXS5_mwrDSKeshUaFtYwOzsPiR1DKsPU'
LOSTVILLAGE2018_SPREADSHEET_ID = '1OL0gyXrpZnJ8e7yR7eF6S2OaBYBiPDoVp5xGpdK4wlA'
BESTIVAL2018_SPREADSHEET_ID = '184qudGcw4PB0SMtOo0ZBDtckeGaH0RCLUXbA-u3BiHE'
YNOT2018_SPREADSHEET_ID = '1D01cj-Mra06TuoG_MsKuLq9OdtvKzrvRdiE255po_ag'
TRUCKFEST2018_SPREADSHEET_ID = '1sGG9WJxKyD2CGUjzJAXul3g9hVnRz6HbTiqKV5cUAyA'
LSTD2018_SPREADSHEET_ID = '1R8YqDnrhvuVMwPFShwaaAUIyCXQMeozA230OXsFsDQM'
KENDALCALLING2018_SPREADSHEET_ID = '16-PfwBOaUxwod3X75LGk1VAjBblkNsTJpCsX825aghI'
PARKLIFE2018_SPREADSHEET_ID = '1oO5sHcUhUn_7M1Hap73sOZHNEfWFMcDkQuWDRFf4d-w'


data = {}
service = gsheets_service()
print "PROCESSING BOOMTOWN"
data['boomtown'] = get_data(service, BOOMTOWN2018_SPREADSHEET_ID)
# print "PROCESSING BOARDMASTERS"
# data['boardmasters'] = get_data(service, BOARDMASTERS2018_SPREADSHEET_ID)
# print "PROCESSING MADE"
# data['made'] = get_data(service, MADE2018_SPREADSHEET_ID)
# print "PROCESSING SW4"
# data['sw4'] = get_data(service, SW42018_SPREADSHEET_ID)
# print "PROCESSING LOST VILLAGE"
# data['lostvillage'] = get_data(service, LOSTVILLAGE2018_SPREADSHEET_ID)
# print "PROCESSING BESTIVAL"
# data['bestival'] = get_data(service, BESTIVAL2018_SPREADSHEET_ID)
# print "PROCESSING YNOT"
# data['ynot'] = get_data(service, YNOT2018_SPREADSHEET_ID)
# print "PROCESSING TRUCKFEST"
# data['truckfest'] = get_data(service, TRUCKFEST2018_SPREADSHEET_ID)
# print "PROCESSING LSTD"
# data['lstd'] = get_data(service, LSTD2018_SPREADSHEET_ID)
# print "PROCESSING KENDAL CALLING"
# data['kc'] = get_data(service, KENDALCALLING2018_SPREADSHEET_ID)
# print "PROCESSING PARKLIFE"
# data['parklife'] = get_data(service, PARKLIFE2018_SPREADSHEET_ID)

import pickle
with open('foo_multi.pkl','w') as w:
    pickle.dump(data, w)
    
dfs = data['boomtown']

PROCESSING BOOMTOWN
Canonicalising catalog
!!! Bad ID 'R0876'
Canonicalising ftir
!!! Bad ID 'F00129'
!!! Bad ID 'Z1000'
!!! Bad ID 'B0076'
Canonicalising reagent
!!! Bad ID 'Z1000'
!!! Bad ID 'Yellow > green'
!!! Bad ID 'F1819 (or 1827?)'
Canonicalising mla
Canonicalising hr
!!! Bad ID 'G0037'
!!! Bad ID 'G0242'
!!! Bad ID 'G0153'
!!! Bad ID 'G0024'
!!! Bad ID 'G0652'
!!! Bad ID 'G9999'
!!! Bad ID 'G0877'
!!! Bad ID 'G0878'
!!! Bad ID 'G0811'
!!! Bad ID 'G1441'
!!! Bad ID 'G1216'
!!! Bad ID 'G1228'
!!! Bad ID 'G1229'
!!! Bad ID 'G1398'
!!! Bad ID 'G1284'
!!! Bad ID 'G1572'
!!! Bad ID 'G0875'
!!! Bad ID 'G1833'
!!! Bad ID 'G1703'
!!! Bad ID 'G1860'
!!! Bad ID 'G1859'
!!! Bad ID 'G1699'
!!! Bad ID 'G1686'
!!! Bad ID 'G1312'
!!! Bad ID 'G0420'
!!! Bad ID 'G0983'
!!! Bad ID 'G0981'


In [463]:
# with open('foo_multi.pkl') as f:
#     data = pickle.load(f)
# dfs = data['boomtown']

# Check for duplicates
catalog_duplicates = dfs.catalog['SampleNumber'].duplicated()
if catalog_duplicates.any():
    catalog_duplicates = list(dfs.catalog.loc[catalog_duplicates, 'SampleNumber'].values)
    print("### %d duplicated catalog SampleNumbers %s ###" % (len(catalog_duplicates), catalog_duplicates))
    dfs.catalog[dfs.catalog['SampleNumber'].duplicated(keep=False)].to_csv('catalog_duplicates.csv')
else:
    catalog_duplicates = None
    
ftir_duplicates = dfs.ftir['SampleNumber'].duplicated()
if ftir_duplicates.any():
    ftir_duplicates = list(dfs.ftir.loc[dfs.ftir['SampleNumber'].duplicated(), 'SampleNumber'].values)
    print("### %d duplicated FTIR SampleNumbers %s ###" % (len(ftir_duplicates), ftir_duplicates))
    dfs.ftir[dfs.ftir['SampleNumber'].duplicated(keep=False)].to_csv('ftir_duplicates.csv')
else:
    ftir_duplicates = None

reagent_duplicates = dfs.reagent['SampleNumber'].duplicated()
if reagent_duplicates.any():
    reagent_duplicates = list(dfs.reagent.loc[dfs.reagent['SampleNumber'].duplicated(), 'SampleNumber'].values)
    print("### %d duplicated reagent SampleNumbers %s ###" % (len(reagent_duplicates), reagent_duplicates))    
    dfs.reagent[dfs.reagent['SampleNumber'].duplicated(keep=False)].to_csv('reagent_duplicates.csv', encoding = 'utf-8')
else:
    reagent_duplicates = None

hr_duplicates = None
if dfs.hr is not None:
    hr_duplicates = dfs.hr['SampleNumber'].duplicated()
    if hr_duplicates.any():
        hr_duplicates = list(dfs.hr.loc[dfs.hr['SampleNumber'].duplicated(), 'SampleNumber'].values)
        print("### %d duplicated HR SampleNumbers %s ###" % (len(hr_duplicates), hr_duplicates))
        dfs.hr[dfs.hr['SampleNumber'].duplicated(keep=False)].to_csv('hr_duplicates.csv', encoding = 'utf-8')
    else:
        hr_duplicates = None

mla_duplicates = dfs.mla['SampleNumber'].duplicated()
if mla_duplicates.any():
    mla_duplicates = list(dfs.mla.loc[dfs.mla['SampleNumber'].duplicated(), 'SampleNumber'].values)
    print("### %d duplicated MLA SampleNumbers %s ###" % (len(mla_duplicates), mla_duplicates))
    dfs.mla[dfs.mla['SampleNumber'].duplicated(keep=False)].to_csv('mla_duplicates.csv')
else:
    mla_duplicates = None
    
if catalog_duplicates or \
    ftir_duplicates or \
    reagent_duplicates or \
    hr_duplicates or \
    mla_duplicates:
    outs = 'Please fix duplicated values'
    print(outs)
#     raise RuntimeError(outs)

### 44 duplicated catalog SampleNumbers ['F0109', 'F0076', 'F0024', 'F0158', 'F0255', 'F0275', 'F0344', 'F0373', 'F0446', 'F0690', 'F0579', 'F0833', 'F0939', 'F0870', 'F0934', 'F0930', 'F0873', 'F0857', 'F0945', 'F0858', 'F0853', 'F0943', 'F0944', 'F0855', 'F0848', 'F0695', 'F0862', 'F0831', 'F0852', 'F0889', 'F0846', 'F0904', 'F0922', 'F1122', 'F0976', 'F1170', 'F0012', 'F1207', 'F1227', 'F1585', 'F1665', 'F1660', 'F1856', 'F1873'] ###
### 47 duplicated FTIR SampleNumbers ['F0017', 'F0247', 'F0266', 'F0376', 'F0158', 'F0019', 'F0019', 'F0546', 'F0446', 'F0005', 'F0599', 'F0659', 'F0688', 'F0748', 'F1137', 'F0983', 'F0938', 'F0869', 'F0838', 'F0982', 'F0865', 'F0816', 'F0878', 'F0885', 'F0833', 'F0815', 'F1196', 'F1122', 'F1292', 'F1253', 'F1313', 'F1393', 'F1665', 'F1215', 'F1392', 'F1640', 'F1172', 'F1606', 'F1433', 'F1431', 'F1609', 'F1660', 'F1623', 'F1792', 'F1846', 'F0912', 'F1830'] ###
### 7 duplicated reagent SampleNumbers ['F0446', 'F0565', 'F0874', 'F0932', 'F0930', 'F1561', 

In [464]:
# Check there are no SampleNumbers in any of the other spreadsheets that aren't in the cataolog sheet
catalog_unique = set(dfs.catalog['SampleNumber'].unique())

ftir_unique = set(dfs.ftir['SampleNumber'].unique())
ftir_orphan = ftir_unique.difference(catalog_unique)
if ftir_orphan:
    print("Orphaned FTIR SampleNumbers: %s" % sorted(ftir_orphan))

reagent_unique = set(dfs.reagent['SampleNumber'].unique())
reagent_orphan = reagent_unique.difference(catalog_unique)
if reagent_orphan:
    print("Orphaned Reagent Test SampleNumbers: %s" % sorted(reagent_orphan))

hr_orphan = None
if dfs.hr is not None:
    hr_unique = set(dfs.hr['SampleNumber'].unique())
    hr_orphan = hr_unique.difference(catalog_unique)
    if hr_orphan:
        print("Orphaned HR SampleNumbers: %s" % sorted(hr_orphan))
    
mla_unique = set(dfs.mla['SampleNumber'].unique()).difference(catalog_unique)
mla_orphan = mla_unique.difference(catalog_unique)
if mla_orphan:
    print("Orphaned MLA SampleNumbers: %s" % sorted(mla_orphan))
    
# Check for any that are only in the catalog
outside_catalog = set.union(ftir_unique, reagent_unique, hr_unique, mla_unique)
catalog_orphan = catalog_unique.difference(outside_catalog)
if catalog_orphan:
    print("Orphaned catalog SampleNumbers: %s" % sorted(catalog_orphan))
    
# Check for any that aren't in FTIR and don't have anything in reagent test
ftir_missing = catalog_unique.difference(ftir_unique).difference(reagent_unique).difference(catalog_orphan)
if len(ftir_missing):
    print("Samples not in FTIR or Reagent: %s" % sorted(ftir_missing))

all_unique = copy.copy(ftir_unique)
all_unique.update(reagent_unique, hr_unique, mla_unique)
if (all_unique or catalog_only):
    outs = "### Please fix orphaned/catalog only samples ###"
    print(outs)
    #raise RuntimeError(outs)

Orphaned FTIR SampleNumbers: ['A1439', 'A1904', 'A1910', 'B0076', 'F00129', 'F0054', 'F0057', 'F0225', 'F0272', 'F0285', 'F0343', 'F0378', 'F0571', 'F0876', 'F0883', 'F0967', 'F0983', 'F1179', 'F1197', 'F1201', 'F1277', 'F1413', 'F1518', 'F1878', 'F1880', 'Z1000']
Orphaned Reagent Test SampleNumbers: ['F0225', 'F0526', 'F0983', 'F1437', 'F1819 (or 1827?)', 'Yellow > green', 'Z1000']
Orphaned HR SampleNumbers: [nan, 'F0054', 'F0154', 'F0225', 'F0272', 'F0285', 'F0378', 'F0612', 'F0616', 'F0876', 'F0883', 'F0967', 'F1076', 'F1179', 'F1201', 'F1880', 'F2901', 'F9271', 'F9999', 'G0024', 'G0037', 'G0153', 'G0242', 'G0420', 'G0652', 'G0811', 'G0875', 'G0877', 'G0878', 'G0981', 'G0983', 'G1216', 'G1228', 'G1229', 'G1284', 'G1312', 'G1398', 'G1441', 'G1572', 'G1686', 'G1699', 'G1703', 'G1833', 'G1859', 'G1860', 'G9999']
Orphaned catalog SampleNumbers: ['F0516', 'F1012', 'F1104', 'F1168', 'F1243', 'F1488', 'F1910', 'R0876']
Samples not in FTIR or Reagent: ['F0368', 'F0369', 'F0871', 'F0879', 'F

In [465]:
def find_duplicate_matches(duplicates, df1, df2, df1_name='DataFrame1', df2_name='DataFrame2'):
    hr = False
    if df1_name.lower()[:2] == 'hr':
        hr = True
    duplicate_matches = {}
    min_stage_delay = 60 * 1
    max_stage_delay = 60 * 60
    for sample_number in duplicates:
        duplicate_matches[sample_number] = {}
        for df1_idx, df1_row in df1.loc[df1['SampleNumber'] == sample_number].iterrows():
            for df2_idx, df2_row in df2.loc[df2['SampleNumber'] == sample_number].iterrows():
                df1_data = df1_row.loc[['SoldAs', 'AlreadyTried']].values
                if not hr:
                    df1_data.append(df1_row.SampleForm.values)
                df1_time = df1_row.Timestamp
                df2_data = df2_row.loc[['SoldAs', 'AlreadyTried']].values
                if not hr:
                    df2_data.append(df2_row.SampleForm.values)
                df2_time = df2_row.Timestamp
                delta_t = (df2_time - df1_time).seconds
                if np.all(df1_data == df2_data) and min_stage_delay < delta_t <= max_stage_delay:
                    print("%s SampleNumber %s duplicate (line: %d) MATCHES %s sample (line: %d)" % \
                          (df1_name, sample_number, df1_idx + 1, df2_name, df2_idx + 1))
                    duplicate_matches[sample_number][df1_idx] = True
                else:
                    print("%s SampleNumber %s duplicate (line: %d) DIFFERENT %s sample (line: %d)\n%s %s\n%s %s" % \
                          (df1_name, sample_number, df1_idx + 1, df2_name, df2_idx + 1,
                           df1_data, df1_time,
                           df2_data, df2_time))
                    duplicate_matches[sample_number][df1_idx] = False
    return duplicate_matches

duplicate_matches = find_duplicate_matches(catalog_duplicates, dfs.catalog, dfs.ftir, df1_name='Catalog', df2_name='FTIR')
#duplicate_matches = find_duplicate_matches(hr_duplicates, dfs.hr, dfs.catalog, df1_name='HR', df2_name='Catalog')

AttributeError: 'numpy.ndarray' object has no attribute 'append'

In [413]:
def match_orphans_to_duplicates(df1_orphans, duplicate_matches, df1, df2):
    for orphan_sample_number in df1_orphans:
        df1_data = df1.loc[df1['SampleNumber'] == orphan_sample_number, ['SampleForm', 'SoldAs', 'AlreadyTried', 'Timestamp']]
        df1_data = df1_data.values.tolist()[0]
        df1_time = df1_data.pop()
        for sample_number, indexd in duplicate_matches.items():
            for k, v in indexd.items():
                if not v:
                    df2_data = dfs.catalog.iloc[k][['SampleForm', 'SoldAs', 'AlreadyTried', 'Timestamp']].values.tolist()
                    df2_time = df2_data.pop()
                    delta_t = (df1_time - df2_time).seconds
                    if df2_data == df1_data and min_stage_delay < delta_t <= max_stage_delay:
                        print "Orphan {} could be match for duplicate {} (line: {})\n{} {}\n{} {}".format(orphan_sample_number,
                                                                                                  sample_number, k+1,
                                                                                                  df2_data, df2_time,
                                                                                                  df1_data, df1_time)

match_orphans_to_duplicates(ftir_orphan, duplicate_matches, dfs.ftir, dfs.catalog)
# match_orphans_to_duplicates(catalog_orphan, duplicate_matches, dfs.catalog, dfs.ftir)

In [423]:
# Check orphans against the FTIR sheet using just their numbers

def match_orphans_with_sample_integer(orphans, orphan_df, ref_df):
#     min_stage_delay = 60 * 1
#     max_stage_delay = 60 * 60
    def to_int(sn):
        if type(sn) in [str, unicode]:
            try:
                sn = int(sn[1:])
            except ValueError:
                print "Bad SampleNumber %s" % sn
        return sn
    orphan_df['SampleInteger'] = orphan_df['SampleNumber'].apply(to_int)
    ref_df['SampleInteger'] = ref_df['SampleNumber'].apply(to_int)
    orphan_ints = map(to_int, orphans)
    
    for orphan_sample_number, oint in zip(orphans, orphan_ints):
        for i, orphan_row in enumerate(orphan_df.loc[orphan_df['SampleNumber'] == orphan_sample_number].itertuples()):
            for j, ref_row in enumerate(ref_df.loc[ref_df['SampleInteger'] == oint].itertuples()):
                i += 1
                j += 1
                orphan_data = [orphan_row.SampleForm, orphan_row.SoldAs, orphan_row.AlreadyTried]
                orphan_time = orphan_row.Timestamp
                orphan_idx = orphan_row.Index + 1
                ref_sample_number = ref_row.SampleNumber
                ref_data = [ref_row.SampleForm, ref_row.SoldAs, ref_row.AlreadyTried]
                ref_time = ref_row.Timestamp
                ref_idx = ref_row.Index + 1            
                delta_t = (ref_time - orphan_time).seconds
#                 if orphan_data == ref_data and min_stage_delay < delta_t <= max_stage_delay:
                if orphan_data == ref_data:
                    print("FTIR orphan %s (line: %d) could be match for Catalog SampleNumber %s (line: %d)\n%s %s\n%s %s" % \
                          (orphan_sample_number, orphan_idx, ref_sample_number, ref_idx, orphan_data, orphan_time, ref_data, ref_time))

# match_orphans_with_sample_integer(catalog_orphan, dfs.catalog, dfs.ftir)
match_orphans_with_sample_integer(hr_orphan, dfs.hr, dfs.ftir)

In [6]:
# Clean up catalog
# Drop all unwanted columns

#  or 'Your initials'
l = set(['Your initials',
         'Your name and first initial',
         'Which device was a photo taken with? Who does it belong to?',
         'Is a breakline present?',
         'Unusual appearance'
        ])

to_drop = set(dfs.catalog.columns).intersection(l)
dfs.catalog.drop(to_drop, axis=1, inplace=True)

d = {
    'Timestamp' : 'Catalog timestamp',
    'Sample Advertised/Acquired/Sold As': 'Catalog_SoldAs',
    'Sample Form' : 'Catalog_Form',
    'Has the Service User or a close friend tried this batch?': 'Catalog_Tried',
    'What is the mass? (mg)': 'FullPillMass',
    'What is the shape of the pill?': 'PillShape',
    'What is the logo?': 'PillLogo',
    'What colour is the pill?': 'PillColour'
}
dfs.catalog.rename(columns=d, inplace=True)

In [7]:
# For FTIR columns need to merge the data from the 'Compound detected', 'Hit Confidence.1' columns into the
# 'Substance detected', 'Hit Confidence' column where the substance detected was 'other'
print("COLS ",dfs.ftir.columns)
print("SS ",dfs.ftir['Substance detected'][:5])
mask = dfs.ftir['Substance detected'] != 'Other'
dfs.ftir['Substance detected'].where(mask, dfs.ftir['Compound detected'], inplace=True) # Copy values from 'Compound detected'
dfs.ftir['Hit Confidence'].where(mask, dfs.ftir['Hit Confidence.1'], inplace=True)
dfs.ftir.drop(['Compound detected', 'Hit Confidence.1', 'Brief Note'], axis=1, inplace=True)

mask = dfs.ftir['Compound detected (Subtraction)'] != 'Other'
dfs.ftir['Compound detected (Subtraction)'].where(mask, df_ftir['Substance detected.1'], inplace=True) # Copy values from 'Compound detected'
dfs.ftir['Hit Confidence.2'].where(mask, dfs.ftir['Hit Confidence.3'], inplace=True)
dfs.ftir.drop(['Substance detected.1', 'Hit Confidence.3', 'Brief Note.1'], axis=1, inplace=True)

# Drop all unwanted columns
l = ['Your name and surname initial',
     'User Suspicion',
     'Is anything detected after subtraction analysis?',
     'Analysis required', 
     'Next action(s)',
     'Send to HR team'
    ]
#'Note for harm reduction worker'
to_drop = set(dfs.ftir.columns).intersection(l)
dfs.ftir.drop(to_drop, axis=1, inplace=True)

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
d = {
    'Timestamp' : 'FTIR timestamp',
    'Sample Sold As': 'FTIR Sold As',
    'Sample Form' : 'FTIR form',
    'Has the Service User or a close friend tried this batch?': 'FTIR tried',
    'Substance(s) detected' : 'FTIR final result',
    'Substance detected' : 'FTIR result1',
    'Hit Confidence' :  'FTIR hit1',
    'Is anything detected after subtraction analysis?' : 'FTIR subtraction positive',
    'Compound detected (Subtraction)' :  'FTIR result2',
    'Hit Confidence.2' :  'FTIR hit2',
    '"Strength" of powdered substance' : 'FTIR Powder Strength',
    'Does the substance detected match the substance that was advertised?' : 'FTIR Matches Sold As',
}
dfs.ftir.rename(columns=d, inplace=True)

('COLS ', Index([                                                           u'Timestamp',
                                                              u'Sample Number',
                                                                     u'Tester',
                                                                    u'Sold As',
                                                                u'Sample Form',
                                                              u'Already Tried',
                                                             u'User Suspicion',
                                                         u'Substance detected',
                                                             u'Hit Confidence',
                                                          u'Compound detected',
                                                           u'Hit Confidence.1',
                                                                 u'Brief Note',
                           u'I

NameError: name 'df_ftir' is not defined

In [None]:
# Clean up HR form

# Drop all unwanted columns
l = ['HR worker name:']
dfs.hr.drop(l, axis=1, inplace=True)

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
d = {
    'Timestamp' : 'HR timestamp',
    'You submitted a substance for analysis. What were you told it was when you got it?': 'HR Sold as',
    'Had you already tried this substance before getting it tested?': 'HR tried',
    'What was your first sample number at this event? Did you take a photo or keep the ticket?': 'Previous Sample Number'
}
dfs.hr.rename(columns=d, inplace=True)

In [None]:
# Catalog and FTIR data frames
df_all = pd.merge(dfs.catalog, dfs.ftir, how='left', on=['Sample Number'])

In [None]:
# Merge in any reagent test data
df_all = pd.merge(df_all, dfs.reagent[['Sample Number', 'Reagent Result']], how='left', on=['Sample Number'])

In [None]:
# Merge in any pill strength data
df_all = pd.merge(df_all, dfs.mla[['Sample Number', 'MDMA / tablet (mg)', '% MDMA content']], how='left', on=['Sample Number'])

In [None]:
# Merge in HR data
df_all = pd.merge(df_all, dfs.hr, how='left', on=['Sample Number'])

In [None]:
# Fix column orders
prefix = ['Sample Number',
          'Catalog timestamp', 'FTIR timestamp', 'HR timestamp',
          'Catalog Sold As', 'FTIR Sold As','HR Sold as', 
          'Catalog form', 'FTIR form',
          'Catalog tried', 'FTIR tried', 'HR tried']
columns = [c for c in df_all.columns if c not in prefix]
columns = prefix + columns
df_all = df_all[columns]
df_all.to_csv('foo.csv')