In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = None

In [2]:
df=pd.read_csv("org_final_joined_new.csv")

In [3]:
# df.columns.get_loc("event_names")

In [4]:
df.drop(df.iloc[:, 0:80], inplace = True, axis = 1)

In [5]:
# df.info()

In [6]:
# some cols will be deleted, for being too complicated / containing little useful info
col_to_drop = ['event_short_description', 'event_venue_name', 'event_description', 'person_personal_featured_job_organization_name']
df.drop(columns=col_to_drop, inplace=True)

In [7]:
# some cols will use 0 as the fillna() value
col_nan_to_zero = ['person', 'organization', 'speaker', 'sponsor', 'exhibitor', 'organizer', 'contestant', 'job_executive', 'job_employee', 'job_board_member', 'job_advisor', 'job_board_observer', 'person_person', 'person_organization', 'person_speaker', 'person_sponsor', 'person_exhibitor', 'person_organizer', 'person_contestant']
for col in col_nan_to_zero:
    df[col].fillna(value=0, inplace=True)

In [8]:
df.info(max_cols=1000)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1018183 entries, 0 to 1018182
Data columns (total 51 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   event_names                         860952 non-null   object 
 1   event_rank                          860952 non-null   float64
 2   event_started_on                    860952 non-null   object 
 3   event_ended_on                      860952 non-null   object 
 4   event_country_code                  850218 non-null   object 
 5   event_region                        850218 non-null   object 
 6   event_city                          850218 non-null   object 
 7   event_event_roles                   860952 non-null   object 
 8   participant_name                    860952 non-null   object 
 9   person                              1018183 non-null  float64
 10  organization                        1018183 non-null  float64
 11  speaker    

In [9]:
# CD: event_rank
event_rank_mean = df['event_rank'].mean()
df['event_rank'] = df['event_rank'].fillna(value=event_rank_mean, inplace=True)

In [10]:
# CF, CG: event_started_on, event_ended_on
event_date = ['event_started_on', 'event_ended_on']
has_event = ~df['event_names'].isnull()

for col in event_date:
    df[col] = pd.to_datetime(df[col])
    mean = df[col].mean()
    df.loc[has_event,col] = df.loc[has_event,col].fillna(value=mean, inplace=True)

In [11]:
# Stardust ver one-hot for country_code, stock_exchange_symbol
def onehot_encoder(df: pd.DataFrame, col_name: str, num_selected: int) -> pd.DataFrame:
    '''
    Performs one-hot encoding on a dataframe's column for its values with most occurrences

    :param pd.DataFrame df: dataframe to be processed
    :param str col_name: name of the encoded column
    :param int num_selected: number of values with most occurrences
    :return: processed dataframe
    :rtype: pd.DataFrame
    '''
    series = df[col_name].value_counts()
    selected_col_name = series.head(num_selected).index.tolist()

    for item in selected_col_name:
        df[col_name + "_" + item] = np.where(df[col_name] == item, 1, 0) # whenever df[col_name] == cat replace it with 1 else 0
        # df[item].add_prefix(col_name + "_")
    df.drop(columns=col_name)
    return df

In [12]:
# CJ, CL: event_country_code, event_city
df = onehot_encoder(df, 'event_country_code', 20)
df = onehot_encoder(df, 'event_city', 20)

In [13]:
# CX: job_is_current
df['job_is_current'].fillna(value=0, inplace=True)
df['job_is_current'] = df['job_is_current']*1

In [14]:
# DE, DF, DH: person_personal_gender, person_personal_country_code, person_personal_city
df = onehot_encoder(df, 'person_personal_gender', 2)
df = onehot_encoder(df, 'person_personal_country_code', 20)
df = onehot_encoder(df, 'person_personal_city', 20)

In [15]:
# DK: person_degree_degree_type
# use keyword to search for relevant degrees
bachelor_keyword = ['Bachelor', 'Degree', 'BS', 'BSc', 'B.S.', 'Bsc', 'B.S',
'BENG', 'BEng', 'B.Eng.', 'Beng', 'B.Eng', 'BE', 'B.E', 'BA', 'Ba', 'B.A.', 'B.A', 'A.B.', 'AB',
'BBA', 'B.B.A.', 'B.B.A', 'B.Tech', 'B.Tech.', 'B.Com.', 'J.D.', 'JD', 'Juris Doctor']

master_keyword = ['Master', 'Postgraduate', 'Graduate', 'MPHIL', 'MPhil', 'Mphil', 'M.Phil.', 'M.phil', 'M.Phil', 'M.S',
'MS', 'MSc', 'M.Sc', 'Msc', 'MENG', 'MEng', 'M.Eng.', 'M.eng.',
'MA', 'M.A', 'MBA', 'M.B.A.', 'M.B.A', 'Mba', 'M.BA.', 'M.Ba.', 'LLM']

phd_keyword = ['PHD', 'Phd', 'PhD', 'P.HD', 'P.Hd', 'P.hd', 'P.H.D', 'Ph.D.', 'Ph.D.', 'PhD']

df['person_degree_degree_type'].fillna(value='N/A', inplace=True)
df.loc[df['person_degree_degree_type'].str.contains('|'.join(phd_keyword)),'person_degree_degree_type']='PhD'
df.loc[df['person_degree_degree_type'].str.contains('|'.join(master_keyword)),'person_degree_degree_type']='Master'
df.loc[df['person_degree_degree_type'].str.contains('|'.join(bachelor_keyword)),'person_degree_degree_type']='Bachelor'

degree_type = []

# perform ordinal encoding: bachelor = 1, master = 2, PhD = 3, no/others = 0
for degree in df['person_degree_degree_type']:
    if degree == 'Bachelor':
        degree_type.append(1)
    elif degree == 'Master':
        degree_type.append(2)
    elif degree == 'PhD':
        degree_type.append(3)
    else:
        degree_type.append(0)
        
df['person_degree_degree_type'] = degree_type

In [16]:
# Stardust ver unique keyword search
def unique_keyword_search(df_col: pd.Series, num_selected: int) -> list:
    '''
    Performs unique keyword search on a dataframe's column for its most common keywords

    :param pd.Series df_col: column of a pd.DataFrame (e.g. df['col'])
    :param int num_selected: number of keywords
    :return: list of keywords in decreasing occurrence
    :rtype: list
    '''
    keywords = []
    for entry in df_col:
        words = entry.split(',')
        [keywords.append(word) for word in words]
    sorted_keywords = Counter(keywords).most_common()
    output_keywords = []
    for i in range(num_selected):
        output_keywords.append(sorted_keywords[i][0])
    return output_keywords

In [17]:
# Stardust ver one-hot encoder V2
def onehot_encoder_v2(df: pd.DataFrame, col_name: str, list_selected: list) -> pd.DataFrame:
    '''
    Performs one-hot encoding on a dataframe's column for its values with most occurrences

    :param pd.DataFrame df: dataframe to be processed
    :param str col_name: name of the encoded column
    :param list list_selected: list of most common values
    :return: processed dataframe
    :rtype: pd.DataFrame
    '''
    for item in list_selected:
        df[item] = np.where(df[col_name] == item, 1, 0) # whenever df[col_name] == cat replace it with 1 else 0
    df.drop(col_name, axis=1, inplace=True)

    return df

In [18]:
# can't run
# degree_subject = []

# # simplify the subject areas
# for subject in df['person_degree_subject']:
#     if 'Computer Science' in subject:
#         degree_type.append(1)
#     elif 'Engineering' == 'Master':
#         degree_type.append(2)
#     elif degree == 'PhD':
#         degree_type.append(3)
#     else:
#         degree_type.append(0)
# subject_keywords = unique_keyword_search(df['person_degree_subject'], 50)

In [19]:
# onehot_encoder_v2(df, 'person_degree_subject', subject_keywords)
# df.head(50)

In [20]:
# # DM, DN: person_degree_started_on, person_degree_completed_on
# degree_date = ['person_degree_started_on', 'person_degree_completed_on']

# has_degree = np.where(df['person_degree_degree_type'] != 0, 1, 0)

# for col in degree_date:
#     df[col] = pd.to_datetime(df[col])
#     mean = df[col].mean()
#     df.loc[has_degree,col] = df.loc[has_degree,col].fillna(value=mean, inplace=True)

In [21]:
# DO: person_degree_is_completed
df['person_degree_is_completed'].fillna(value=0, inplace=True)
df['person_degree_is_completed'] = df['person_degree_is_completed']*1

In [22]:
# DQ: person_event_rank
person_event_rank_mean = df['person_event_rank'].mean()
df['person_event_rank'] = df['person_event_rank'].fillna(value=person_event_rank_mean, inplace=True)

In [23]:
# DQ, DR: person_event_started_on, person_event_ended_on
event_date = ['person_event_started_on', 'person_event_ended_on']
has_event = ~df['person_event_name'].isnull()

for col in event_date:
    df[col] = pd.to_datetime(df[col])
    mean = df[col].mean()
    df.loc[has_event,col] = df.loc[has_event,col].fillna(value=mean, inplace=True)

In [24]:
# DV: person_event_city
df = onehot_encoder(df, 'person_event_city', 20)

In [25]:
df.head()

Unnamed: 0,event_names,event_rank,event_started_on,event_ended_on,event_country_code,event_region,event_city,event_event_roles,participant_name,person,organization,speaker,sponsor,exhibitor,organizer,contestant,organization_name,job_started_on,job_ended_on,job_is_current,job_executive,job_employee,job_board_member,job_advisor,job_board_observer,person_personal_name,person_personal_gender,person_personal_country_code,person_personal_region,person_personal_city,person_personal_featured_job_title,person_degree_degree_type,person_degree_subject,person_degree_started_on,person_degree_completed_on,person_degree_is_completed,person_event_name,person_event_rank,person_event_started_on,person_event_ended_on,person_event_country_code,person_event_region,person_event_city,person_event_roles,person_person,person_organization,person_speaker,person_sponsor,person_exhibitor,person_organizer,person_contestant,event_country_code_USA,event_country_code_GBR,event_country_code_DEU,event_country_code_ESP,event_country_code_SGP,event_country_code_CAN,event_country_code_FRA,event_country_code_DNK,event_country_code_HKG,event_country_code_IRL,event_country_code_EST,event_country_code_NLD,event_country_code_CZE,event_country_code_ISR,event_country_code_PRT,event_country_code_BGR,event_country_code_AUT,event_country_code_IDN,event_country_code_ARE,event_country_code_FIN,event_city_San Francisco,event_city_New York,event_city_Las Vegas,event_city_London,event_city_Orlando,event_city_Berlin,event_city_Singapore,event_city_Boston,event_city_Cambridge,event_city_Copenhagen,event_city_Toronto,event_city_Paris,event_city_Santa Clara,event_city_Berkeley,event_city_Washington,event_city_Dublin,event_city_Madrid,event_city_Santa Monica,event_city_Cyberport,event_city_Tallinn,person_personal_gender_male,person_personal_gender_female,person_personal_country_code_USA,person_personal_country_code_GBR,person_personal_country_code_DEU,person_personal_country_code_ISR,person_personal_country_code_FRA,person_personal_country_code_JPN,person_personal_country_code_CAN,person_personal_country_code_SGP,person_personal_country_code_IND,person_personal_country_code_ESP,person_personal_country_code_CHE,person_personal_country_code_HKG,person_personal_country_code_CHN,person_personal_country_code_AUT,person_personal_country_code_AUS,person_personal_country_code_NLD,person_personal_country_code_BRA,person_personal_country_code_EST,person_personal_country_code_THA,person_personal_country_code_SWE,person_personal_city_San Francisco,person_personal_city_London,person_personal_city_New York,person_personal_city_Washington,person_personal_city_Palo Alto,person_personal_city_Los Angeles,person_personal_city_Berlin,person_personal_city_Paris,person_personal_city_Chicago,person_personal_city_San Jose,person_personal_city_Menlo Park,person_personal_city_Boston,person_personal_city_Hamburg,person_personal_city_Atlanta,person_personal_city_Seattle,person_personal_city_Singapore,person_personal_city_Toronto,person_personal_city_Jerusalem,person_personal_city_Mountain View,person_personal_city_Vancouver,person_event_city_San Francisco,person_event_city_New York,person_event_city_London,person_event_city_Las Vegas,person_event_city_Berlin,person_event_city_Paris,person_event_city_Madrid,person_event_city_Dublin,person_event_city_Lisbon,person_event_city_Singapore,person_event_city_Redwood City,person_event_city_Santa Clara,person_event_city_Copenhagen,person_event_city_Mountain View,person_event_city_Amsterdam,person_event_city_Hong Kong,person_event_city_Los Angeles,person_event_city_Santa Monica,person_event_city_Toronto,person_event_city_Helsinki
0,,,NaT,NaT,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SOICO,,,1,1.0,0.0,0.0,0.0,0.0,Doki Ayaka,female,,,,,0,,,,0,,,NaT,NaT,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,,,NaT,NaT,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SOICO,,,1,1.0,0.0,0.0,0.0,0.0,Motoshi Shimizu,male,,,,,0,,,,0,,,NaT,NaT,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,,,NaT,NaT,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SOICO,,,1,0.0,0.0,1.0,0.0,0.0,Tetsuya Sanada,male,,,,Chairman and CEO,0,,,,0,,,NaT,NaT,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,,,NaT,NaT,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SOICO,,,1,1.0,0.0,0.0,0.0,0.0,Junichi Kayahara,male,,,,,0,,,,0,,,NaT,NaT,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,,,NaT,NaT,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NFX,2015-02-01,,1,0.0,0.0,0.0,1.0,0.0,Josh Elman,male,USA,California,San Francisco,Venture Partner,1,Symbolic Systems,1993-01-01,1997-01-01,1,TechCrunch Disrupt SF 2016,,NaT,NaT,USA,California,San Francisco,"competition,conference,expo,hackathon,meetup,n...",1.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
