This notebook merges people, degrees and jobs. They are aggregated per organization, so the result has far fewer rows than the inputs. <br>
Setup: place `people.csv`, `degrees.csv`, `jobs.csv` in the same folder as this notebook.

In [1]:
# import library
import pandas as pd
import numpy as np

In [None]:
# Stardust ver one-hot encoder
def onehot_encoder(df: pd.DataFrame, col_name: str, num_selected: int) -> pd.DataFrame:
    '''
    One-hot encodes the most frequent values of a column, then removes that column.

    The ``num_selected`` most frequent values of ``df[col_name]`` (ranked by
    ``value_counts``) each get an indicator column named ``<col_name>_<value>``
    that holds 1 where the row equals that value and 0 everywhere else. Rows
    whose value is not among the selected ones therefore get 0 in every
    indicator column. The dataframe is modified in place and the same object
    is returned.

    :param pd.DataFrame df: dataframe to be processed (mutated in place)
    :param str col_name: name of the encoded column
    :param int num_selected: number of values with most occurrences
    :return: processed dataframe, without the original ``col_name`` column
    :rtype: pd.DataFrame
    '''
    top_values = df[col_name].value_counts().head(num_selected).index.tolist()

    for value in top_values:
        # f-string formatting keeps this working when the categories are not
        # strings (plain "+" concatenation raises TypeError for ints/floats)
        df[f"{col_name}_{value}"] = np.where(df[col_name] == value, 1, 0)

    # DataFrame.drop returns a new frame unless inplace=True; without it the
    # encoded column was silently left in the result
    df.drop(columns=col_name, inplace=True)
    return df

First process `people.csv`.

In [2]:
# Columns of people.csv that are retained; edit this list to keep different ones.
keep_col = ['uuid', 'name', 'rank', 'gender', 'featured_job_organization_uuid']
# Load the raw file and narrow it to the retained columns in one step.
ppl_df = pd.read_csv("people.csv").loc[:, keep_col]

In [3]:
# Summarize dtypes and non-null counts of the retained columns, to see how many
# rows have featured_job_organization_uuid populated.
ppl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1106474 entries, 0 to 1106473
Data columns (total 5 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   uuid                            1106474 non-null  object 
 1   name                            1106474 non-null  object 
 2   rank                            1104160 non-null  float64
 3   gender                          1084964 non-null  object 
 4   featured_job_organization_uuid  676013 non-null   object 
dtypes: float64(1), object(4)
memory usage: 42.2+ MB


In [None]:

# Load degrees.csv and keep only the listed columns, mirroring how people.csv
# is handled. The unfinished `jobs_df = ` statement that ended this cell was a
# syntax error; jobs.csv is read in its own cell.
deg_df = pd.read_csv("degrees.csv")
keep_col = ['uuid','person_uuid','degree_type','subject','started_on','completed_on','is_completed']
deg_df = deg_df[keep_col]

In [None]:
# Load jobs.csv and keep only the listed columns. The column list used to be a
# bare expression that was never applied to the frame. It gets its own name so
# that `keep_col` from other cells is not rebound here.
jobs_df = pd.read_csv("jobs.csv")
jobs_keep_col = ['uuid','person_uuid','org_uuid','org_name','started_on','ended_on','is_current','job_type']
jobs_df = jobs_df[jobs_keep_col]

In [3]:
# Exploratory checks, left commented out: frequency of each raw `degree_type` value.
# df['degree_type'].value_counts()
# df['degree_type'].value_counts().head(100)

In [4]:
# Exploratory check, left commented out: frequency of each raw `subject` value.
# df['subject'].value_counts()

In [5]:
# use keyword to search for relevant degrees
bachelor_keyword = ['Bachelor', 'Degree', 'BS', 'BSc', 'B.S.', 'Bsc', 'B.S',
'BENG', 'BEng', 'B.Eng.', 'Beng', 'B.Eng', 'BE', 'B.E', 'BA', 'Ba', 'B.A.', 'B.A', 'A.B.', 'AB',
'BBA', 'B.B.A.', 'B.B.A', 'B.Tech', 'B.Tech.', 'B.Com.', 'J.D.', 'JD', 'Juris Doctor']

master_keyword = ['Master', 'Postgraduate', 'Graduate', 'MPHIL', 'MPhil', 'Mphil', 'M.Phil.', 'M.phil', 'M.Phil', 'M.S',
'MS', 'MSc', 'M.Sc', 'Msc', 'MENG', 'MEng', 'M.Eng.', 'M.eng.',
'MA', 'M.A', 'MBA', 'M.B.A.', 'M.B.A', 'Mba', 'M.BA.', 'M.Ba.', 'LLM']

phd_keyword = ['PHD', 'Phd', 'PhD', 'P.HD', 'P.Hd', 'P.hd', 'P.H.D', 'Ph.D.', 'Ph.D.', 'PhD']

# categories can be formed from one or more above elementary keywords

# `df` was never assigned in this notebook, so this cell failed on a fresh
# kernel. It is now derived from the degrees frame: rows without a degree_type
# are removed and the index is renumbered from 0, leaving `deg_df` untouched so
# the cell can be re-run.
# NOTE(review): assumes `deg_df` is the table read from degrees.csv - confirm.
df = deg_df.dropna(axis=0, subset=['degree_type']).reset_index(drop=True)

# The passes run in this order and each one tests the column as already
# rewritten by the previous passes, so a row ends up with the label of the last
# pattern that matches it.
# The joined keywords are used as a case-sensitive regular expression searched
# anywhere in the string (pandas defaults), so '.' in a keyword matches any
# character and short keywords such as 'MA' or 'BE' also hit longer words.
for label, keywords in (('PhD', phd_keyword), ('Master', master_keyword), ('Bachelor', bachelor_keyword)):
    df.loc[df['degree_type'].str.contains('|'.join(keywords)), 'degree_type'] = label

# df

In [6]:
# Write `df` (the frame whose degree_type was normalised in the preceding cell)
# to degrees_cleaned.csv, omitting the index column.
df.to_csv("degrees_cleaned.csv", index=False)