### Prepping Data Challenge: Spread the Knowledge for TC (week 45)

### Requirements
- Input the Data
- Create a DateTime field for each Session
- Create a row for each Attendee and Join on the Lookup Table
- Create a Direct Contact Field for each Attendee 
  - These are people they directly meet in the brain dates they attend
- Make sure Attendees don't have their own names listed as Direct Contacts
- Create an Indirect Contact field for each Attendee
  - These will be the Direct Contacts for each Attendee's Direct Contacts, for sessions that have happened prior to the session where they meet
  - Remember: order of sessions is important!
- If another attendee is classified as both a Direct and an Indirect Contact, classify them as a Direct Contact
- Reshape the data so that each row represents an attendee and a contact they've made, either Directly or Indirectly, for each subject matter
  - Ensure there are no duplicates!
- Output the Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input the data
# Every worksheet except 'Attendees' is read and stacked into df1; the sheet
# name is kept in a `date` column (presumably one sheet per day — confirm
# against the workbook). The 'Attendees' sheet is read on its own into df2.
with pd.ExcelFile(r"\Dataprep\2021\TC Input.xlsx") as xl:
    df1 = pd.concat(
        [
            pd.read_excel(xl, sheet_name=sheet).assign(date=sheet)
            for sheet in xl.sheet_names
            if sheet != 'Attendees'
        ]
    ).rename(columns={'Attendee IDs': 'Attendee ID'})
    df2 = pd.read_excel(xl, sheet_name='Attendees')

In [3]:
# Create a DateTime field for each Session
# The timestamp text is built as "<date> 2021 <time>": `date` is the sheet name
# stored at load time and the year is hard-coded. ':00:00' is appended to the
# session time and the result is cut to 8 characters (presumably so both a bare
# hour and a full HH:MM:SS value end up parseable — confirm against the data).
# The two helper columns are removed once DateTime exists.
df1 = (
    df1
    .assign(
        DateTime=lambda sessions: pd.to_datetime(
            sessions['date']
            + ' 2021 '
            + (sessions['Session Time'].astype(str) + ':00:00').str.slice(0, 8)
        )
    )
    .drop(columns=['date', 'Session Time'])
)

In [4]:
# Create a row for each Attendee (the lookup-table join is done later, when
# the attendee names are attached).
# 'Attendee ID' is split on ', ' into a list, the list is fanned out to one row
# per element, and the resulting IDs are cast to int.
df1 = (
    df1
    .assign(**{'Attendee ID': df1['Attendee ID'].str.split(', ')})
    .explode('Attendee ID')
    .astype({'Attendee ID': 'int'})
)

In [5]:
# Direct contacts: self-join the session rows on 'Session ID' so every attendee
# is paired with every attendee of the same session ('Attendee ID_2'), then
# keep only the rows where the two IDs differ (nobody is their own contact).
df = (
    df1
    .merge(df1[['Session ID', 'Attendee ID']], on='Session ID', suffixes=('', '_2'))
    .loc[lambda pairs: pairs['Attendee ID'] != pairs['Attendee ID_2']]
)

In [6]:
# Preview the first five attendee / direct-contact pairs
df.head(n=5)

Unnamed: 0,Session ID,Subject,Attendee ID,DateTime,Attendee ID_2
1,1030,Desktop,1,2021-11-10 09:00:00,4
2,1030,Desktop,1,2021-11-10 09:00:00,17
3,1030,Desktop,1,2021-11-10 09:00:00,19
4,1030,Desktop,1,2021-11-10 09:00:00,30
5,1030,Desktop,4,2021-11-10 09:00:00,1


In [7]:
# Indirect contacts: look up the direct contacts of each direct contact.
# The pair table is joined to itself within the same Subject, matching the
# contact ('Attendee ID_2') against the other side's 'Attendee ID'; columns
# coming from the contact's own rows carry the '_ic' suffix.
# A candidate row is discarded when
#   * the indirect contact is the attendee themselves, or
#   * the attendee's session is earlier than the contact's session.
# NOTE(review): rows with equal timestamps are kept; the requirement says
# "prior to the session where they meet" — confirm a strict comparison is not
# needed.
df3 = (
    df[['Subject', 'DateTime', 'Attendee ID', 'Attendee ID_2']]
    .merge(
        df,
        left_on=['Subject', 'Attendee ID_2'],
        right_on=['Subject', 'Attendee ID'],
        suffixes=('', '_ic'),
    )
    .loc[
        lambda pairs: ~(
            (pairs['Attendee ID'] == pairs['Attendee ID_2_ic'])
            | (pairs['DateTime'] < pairs['DateTime_ic'])
        )
    ]
    [['Subject', 'Attendee ID', 'Attendee ID_2_ic']]
    .rename(columns={'Attendee ID_2_ic': 'Attendee ID_2'})
)

In [8]:
# Union direct and indirect contacts, add the attendee names, remove duplicates.
cols = ['Subject', 'Attendee ID', 'Attendee ID_2']

df_all = (
    pd.concat(
        [
            df[cols].drop_duplicates(subset=cols).assign(Contact_Type='Direct Contact'),
            df3[cols].assign(Contact_Type='Indirect Contact'),
        ],
        axis=0,
    )
    # 'Direct Contact' sorts ahead of 'Indirect Contact', so keep='first'
    # retains the direct label whenever a pair appears under both.
    .sort_values(by='Contact_Type')
    .drop_duplicates(subset=cols, keep='first')
    # Attach the attendee's name.
    .merge(df2, on='Attendee ID')
    # Attach the contact's name. The lookup is renamed before the join so the
    # key is already called 'Attendee ID_2' and the name lands in 'Contact';
    # joining the un-renamed lookup with a '_2' suffix would add a second
    # column labelled 'Attendee ID_2' next to the existing one.
    .merge(
        df2.rename(columns={'Attendee ID': 'Attendee ID_2', 'Attendee': 'Contact'}),
        on='Attendee ID_2',
        suffixes=('', '_2'),
    )
    .rename(columns={'Contact_Type': 'Contact Type'})
    # The numeric IDs are no longer needed once the names are attached.
    .drop(columns=['Attendee ID', 'Attendee ID_2'])
)


In [9]:
# Display the final contact table inline (the CSV export is done in the next cell)
df_all

Unnamed: 0,Subject,Contact Type,Attendee,Contact
0,Desktop,Direct Contact,Kate,Tim
1,Prep,Direct Contact,Andy,Jack
2,Prep,Direct Contact,Jack,Rosario
3,Prep,Direct Contact,Jack,Jenny
4,Prep,Direct Contact,Jack,Jonathan
...,...,...,...,...
1418,Desktop,Indirect Contact,Will,John
1419,DataDev,Indirect Contact,Michael,Anya
1420,DataDev,Indirect Contact,Michael,David
1421,DataDev,Indirect Contact,Jonathan,David


In [10]:
# Output the data: write the final table to CSV without the DataFrame index
df_all.to_csv(path_or_buf='wk45-output.csv', index=False)