This notebook merges people, degrees and jobs. They will be aggregated for each organization, so there will be much fewer rows produced. <br>
Setup: place `people.csv`, `degrees.csv`, `jobs.csv` in the same folder as this notebook.

In [6]:
# import library
import pandas as pd
import numpy as np
import datetime as dt

In [8]:
# Stardust ver unique keyword search
def unique_keyword_search(df_col: pd.Series, num_selected: int) -> list:
    '''
    Performs unique keyword search on a dataframe's column for its most common keywords

    :param pd.Series df_col: column of a pd.DataFrame (e.g. df['col'])
    :param int num_selected: number of keywords
    :return: list of keywords in decreasing occurrence
    :rtype: list
    '''
    keywords = []
    for entry in df_col:
        words = entry.split(',')
        [keywords.append(word) for word in words]
    sorted_keywords = Counter(keywords).most_common()
    output_keywords = []
    for i in range(num_selected):
        output_keywords.append(sorted_keywords[i][0])
    return output_keywords

In [10]:
# Stardust ver one-hot encoder
def onehot_encoder(df: pd.DataFrame, col_name: str, num_selected: int) -> pd.DataFrame:
    '''
    Performs one-hot encoding on a dataframe's column for its values with most occurrences

    :param pd.DataFrame df: dataframe to be processed
    :param str col_name: name of the encoded column
    :param int num_selected: number of values with most occurrences
    :return: processed dataframe
    :rtype: pd.DataFrame
    '''
    series = df[col_name].value_counts()
    selected_col_name = series.head(num_selected).index.tolist()

    for item in selected_col_name:
        df[col_name + "_" + item] = np.where(df[col_name] == item, 1, 0) # whenever df[col_name] == cat replace it with 1 else 0
        
    df.drop(columns=col_name, inplace=True)
    return df

In [12]:
# Stardust ver one-hot encoder V2
def onehot_encoder_v2(df: pd.DataFrame, col_name: str, list_selected: list) -> pd.DataFrame:
    '''
    Performs one-hot encoding on a dataframe's column for its values with most occurrences

    :param pd.DataFrame df: dataframe to be processed
    :param str col_name: name of the encoded column
    :param list list_selected: list of most common values
    :return: processed dataframe
    :rtype: pd.DataFrame
    '''
    for item in list_selected:
        df[item] = np.where(df[col_name] == item, 1, 0) # whenever df[col_name] == cat replace it with 1 else 0
    df.drop(col_name, axis=1, inplace=True)

    return df

First process `people.csv`.

In [14]:
# read the CSV
ppl_df = pd.read_csv("people.csv")
# change the cols to keep
keep_col = ['uuid','gender','featured_job_organization_uuid']
ppl_df = ppl_df[keep_col]
# ppl_df.info()

In [16]:
# if a person has no associated organization, the entry has no use
ppl_df.drop(ppl_df[ppl_df['featured_job_organization_uuid'].isnull()].index, inplace=True)
# ppl_df.info()

In [18]:
ppl_df = onehot_encoder(ppl_df, 'gender', 2) # male and female
# ppl_df.info()

In [20]:
ppl_df.reset_index(inplace=True)
ppl_df.drop(axis=1, labels='index', inplace=True)

In [22]:
ppl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676013 entries, 0 to 676012
Data columns (total 5 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   uuid                            676013 non-null  object
 1   name                            676013 non-null  object
 2   featured_job_organization_uuid  676013 non-null  object
 3   gender_male                     676013 non-null  int32 
 4   gender_female                   676013 non-null  int32 
dtypes: int32(2), object(3)
memory usage: 20.6+ MB


Then process `people.csv`.

In [24]:
# read the CSV
deg_df = pd.read_csv("degrees.csv")
# change the cols to keep
keep_col = ['person_uuid','degree_type','subject','started_on','completed_on','is_completed']
deg_df = deg_df[keep_col]

In [26]:
deg_df.drop_duplicates(inplace=True)

degree_type_to_drop = (deg_df['degree_type'] == 'unknown') | (deg_df['degree_type'] == 'Unknown') | (deg_df['degree_type'] == 'Specialization') | (deg_df['degree_type'] == 'Certificate') | (deg_df['degree_type'] == 'Certification')
deg_df = deg_df.drop(deg_df[degree_type_to_drop].index)
deg_df = deg_df.drop(deg_df[deg_df['subject'] == 'unknown'].index)

In [28]:
# use keyword to search for relevant degrees
bachelor_keyword = ['Bachelor', 'Degree', 'BS', 'BSc', 'B.S.', 'Bsc', 'B.S',
'BENG', 'BEng', 'B.Eng.', 'Beng', 'B.Eng', 'BE', 'B.E', 'BA', 'Ba', 'B.A.', 'B.A', 'A.B.', 'AB',
'BBA', 'B.B.A.', 'B.B.A', 'B.Tech', 'B.Tech.', 'B.Com.', 'J.D.', 'JD', 'Juris Doctor']

master_keyword = ['Master', 'Postgraduate', 'Graduate', 'MPHIL', 'MPhil', 'Mphil', 'M.Phil.', 'M.phil', 'M.Phil', 'M.S',
'MS', 'MSc', 'M.Sc', 'Msc', 'MENG', 'MEng', 'M.Eng.', 'M.eng.',
'MA', 'M.A', 'MBA', 'M.B.A.', 'M.B.A', 'Mba', 'M.BA.', 'M.Ba.', 'LLM']

phd_keyword = ['PHD', 'Phd', 'PhD', 'P.HD', 'P.Hd', 'P.hd', 'P.H.D', 'Ph.D.', 'Ph.D.', 'PhD']

# categories can be formed from one or more above elementary keywords
# df.dropna(axis=0, subset=['degree_type'], inplace=True)
deg_df['degree_type'].fillna(value='N/A', inplace=True)
deg_df.loc[deg_df['degree_type'].str.contains('|'.join(phd_keyword)),'degree_type']='PhD'
deg_df.loc[deg_df['degree_type'].str.contains('|'.join(master_keyword)),'degree_type']='Master'
deg_df.loc[deg_df['degree_type'].str.contains('|'.join(bachelor_keyword)),'degree_type']='Bachelor'

degree_type = []

# perform ordinal encoding: bachelor = 1, master = 2, PhD = 3, no/others = 0
for degree in deg_df['degree_type']:
    if degree == 'Bachelor':
        degree_type.append(1)
    elif degree == 'Master':
        degree_type.append(2)
    elif degree == 'PhD':
        degree_type.append(3)
    else:
        degree_type.append(0)
        
deg_df['degree_type'] = degree_type

In [30]:
deg_df = onehot_encoder(deg_df, 'subject', 30)

In [32]:
deg_df.sort_values(by='degree_type', ascending=False, inplace=True) # PhD > Master > Bachelor
deg_df = deg_df.drop_duplicates(subset='person_uuid', keep="first") # only consider the highest degree obtained

In [34]:
degree_date = ['started_on', 'completed_on']
has_degree = np.where(deg_df['degree_type'] != 0, 1, 0)

# ignore for now
# for col in degree_date:
#     deg_df[col] = pd.to_datetime(deg_df[col], errors='coerce', format='%Y-%m-%d') # 'coerce' converts NaN to NaT
#     mean = deg_df[col].mean()
#     deg_df.loc[has_degree,col].fillna(value=mean, inplace=True)

In [36]:
deg_df['is_completed'].fillna(value=0, inplace=True)
deg_df['is_completed'] = deg_df['is_completed']*1

In [38]:
deg_df.reset_index(inplace=True)
deg_df.drop(axis=1, labels='index', inplace=True)

In [40]:
# deg_df.info()

In [42]:
# join people and degrees
ppl_join = ppl_df.set_index('uuid').join(deg_df.set_index('person_uuid'))

In [44]:
ppl_join.head()

Unnamed: 0_level_0,name,featured_job_organization_uuid,gender_male,gender_female,degree_type,started_on,completed_on,is_completed,subject_Computer Science,subject_Economics,...,subject_Biology,subject_International Business,subject_Information Technology,subject_Business Management,subject_English,subject_Industrial Engineering,subject_Civil Engineering,subject_Biochemistry,subject_Medicine,subject_Philosophy
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ed13cd36-fe2b-3707-197b-0c2d56e37a71,Ben Elowitz,cf253887-5eac-21a2-28d3-47db7311f7e9,1,0,1.0,,1994-01-01,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5ceca97b-493c-1446-6249-5aaa33464763,Kevin Flaherty,789e5e4d-0c90-d06e-92a0-b800b461c3da,1,0,,,,,,,...,,,,,,,,,,
9f99a98a-aa97-b30b-0d36-db67c1d277e0,Raju Vegesna,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,1,0,,,,,,,...,,,,,,,,,,
6e1bca72-a865-b518-b305-31214ce2d1b0,Ian Wenig,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,1,0,1.0,,1986-01-01,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80d25c23-9726-9dda-5852-39cdf4810ea5,Ron Gorodetzky,aa3bf156-06af-5b6e-215d-9e7211fc173b,1,0,,,,,,,...,,,,,,,,,,


In [46]:
# some cols will use 0 as the fillna() value
col_nan_to_zero = ['degree_type','is_completed']
for col in col_nan_to_zero:
    ppl_join[col].fillna(value=0, inplace=True)
ppl_join.iloc[9:].fillna(value=0, inplace=True)

In [48]:
ppl_join.reset_index(inplace=True)

In [50]:
# read the CSV
job_df = pd.read_csv("jobs.csv")
# change the cols to keep
keep_col = ['person_uuid','org_uuid','started_on','ended_on','is_current']
job_df = job_df[keep_col]
job_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1589222 entries, 0 to 1589221
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   person_uuid  1589222 non-null  object
 1   org_uuid     1589222 non-null  object
 2   started_on   786367 non-null   object
 3   ended_on     293932 non-null   object
 4   is_current   1589222 non-null  bool  
dtypes: bool(1), object(4)
memory usage: 50.0+ MB


In [None]:
job_df.head()

In [45]:
job_is_current = job_df['is_current'].array
job_df.loc[job_is_current,'ended_on'] = job_df.loc[job_is_current,'ended_on'].fillna('2020-12-30')
job_df.drop(columns='is_current', inplace=True)

In [47]:
# find the duration of the job
job_df['job_duration'] = pd.to_datetime(job_df['ended_on'], errors='coerce', format='%Y-%m-%d') - pd.to_datetime(job_df['started_on'], errors='coerce', format='%Y-%m-%d')
# take mean for NaN
mean = job_df['job_duration'].mean()
job_df['job_duration'].fillna(value=mean, inplace=True)
# convert TimeDelta to int
job_df['job_duration'] = job_df['job_duration'].apply(lambda x: x.days)

In [55]:
job_df.drop(columns=['started_on','ended_on'], inplace=True)

In [56]:
# join jobs and ppl_join (= ppl + degrees)
job_join = job_df.set_index('person_uuid').join(ppl_join.set_index('uuid'))

In [57]:
job_join.head()

Unnamed: 0,org_uuid,is_current,name,featured_job_organization_uuid,gender_male,gender_female,degree_type,started_on,completed_on,is_completed,...,subject_Biology,subject_International Business,subject_Information Technology,subject_Business Management,subject_English,subject_Industrial Engineering,subject_Civil Engineering,subject_Biochemistry,subject_Medicine,subject_Philosophy
0000125e-5faf-822d-72d4-b7701e250550,75685ae6-5ba1-0ffb-80ba-3c0bf539e04d,True,Tony Wahl,75685ae6-5ba1-0ffb-80ba-3c0bf539e04d,1.0,0.0,0.0,,,0.0,...,,,,,,,,,,
000020dc-18ce-7f7b-e8c4-8f5d716ad09d,867f0af5-a1d0-143d-bbed-5cc252ca40d6,True,Sara Kintzle,867f0af5-a1d0-143d-bbed-5cc252ca40d6,0.0,1.0,3.0,2008-01-01,2012-01-01,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00002852-4f2a-473a-ae63-810fa8d3f31f,655ff5a2-33d2-dfe5-af13-20866a58a5c0,True,,,,,,,,,...,,,,,,,,,,
00006713-e0f6-e8d0-58a3-0063bb243a50,bf0eb7c9-a5c4-014f-fb7c-f6530084e4d5,True,Susan Moore,bf0eb7c9-a5c4-014f-fb7c-f6530084e4d5,0.0,1.0,0.0,,,0.0,...,,,,,,,,,,
00006aa5-68cc-7430-eb3d-9bd8305dcb4d,cab0141c-849d-56e9-8173-56fcd95c5dd1,False,,,,,,,,,...,,,,,,,,,,


In [58]:
job_join.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1589222 entries, 0000125e-5faf-822d-72d4-b7701e250550 to fffff797-41b1-4358-a991-7d9c51be798c
Data columns (total 40 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   org_uuid                         1589222 non-null  object 
 1   is_current                       1589222 non-null  bool   
 2   name                             1221322 non-null  object 
 3   featured_job_organization_uuid   1221322 non-null  object 
 4   gender_male                      1221322 non-null  float64
 5   gender_female                    1221322 non-null  float64
 6   degree_type                      1221322 non-null  float64
 7   started_on                       165218 non-null   object 
 8   completed_on                     263875 non-null   object 
 9   is_completed                     1221322 non-null  float64
 10  subject_Computer Science         457804 non-null   floa

In [None]:
# the next step is to get the list of fintech companies
# so that unique org_uuid drastically decreases

In [53]:
# df.to_csv("degrees_cleaned.csv", index=False)