In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = None

In [2]:
df=pd.read_csv("org_final_joined_new.csv")

In [3]:
# df.columns.get_loc("event_names")

In [4]:
df.drop(df.iloc[:, 0:80], inplace = True, axis = 1)

In [5]:
# df.info()

In [6]:
# some cols will be deleted, for being too complicated / containing little useful info
col_to_drop = ['event_short_description', 'event_venue_name', 'event_description', 'person_personal_featured_job_organization_name']
df.drop(columns=col_to_drop, inplace=True)

In [7]:
# some cols will use 0 as the fillna() value
col_nan_to_zero = ['person', 'organization', 'speaker', 'sponsor', 'exhibitor', 'organizer', 'contestant', 'job_executive', 'job_employee', 'job_board_member', 'job_advisor', 'job_board_observer', 'person_person', 'person_organization', 'person_speaker', 'person_sponsor', 'person_exhibitor', 'person_organizer', 'person_contestant']
for col in col_nan_to_zero:
    df[col].fillna(0)

In [8]:
df.info(max_cols=1000)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1018183 entries, 0 to 1018182
Data columns (total 51 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   event_names                         860952 non-null   object 
 1   event_rank                          860952 non-null   float64
 2   event_started_on                    860952 non-null   object 
 3   event_ended_on                      860952 non-null   object 
 4   event_country_code                  850218 non-null   object 
 5   event_region                        850218 non-null   object 
 6   event_city                          850218 non-null   object 
 7   event_event_roles                   860952 non-null   object 
 8   participant_name                    860952 non-null   object 
 9   person                              860952 non-null   float64
 10  organization                        860952 non-null   float64
 11  speaker    

In [9]:
# CD: event_rank
event_rank_mean = df['event_rank'].mean()
df['event_rank'] = df['event_rank'].fillna(event_rank_mean)

In [21]:
# CF, CG: event_started_on, event_ended_on
event_date = ['event_started_on', 'event_ended_on']
has_event = ~df['event_names'].isnull()

for col in event_date:
    df[col] = pd.to_datetime(df[col])
    mean = df[col].mean()
    df.loc[has_event,col] = df.loc[has_event,col].fillna(mean)

In [25]:
# Stardust ver one-hot for country_code, stock_exchange_symbol
def onehot_encoder(df: pd.DataFrame, col_name: str, num_selected: int) -> pd.DataFrame:
    '''
    Performs one-hot encoding on a dataframe's column for its values with most occurrences

    :param pd.DataFrame df: dataframe to be processed
    :param str col_name: name of the encoded column
    :param int num_selected: number of values with most occurrences
    :return: processed dataframe
    :rtype: pd.DataFrame
    '''
    series = df[col_name].value_counts()
    selected_col_name = series.head(num_selected).index.tolist()

    for item in selected_col_name:
        df[item] = np.where(df[col_name] == item, 1, 0) # whenever df[col_name] == cat replace it with 1 else 0
        # df[item].add_prefix(col_name + "_")
    df.drop(columns=col_name)
    return df

In [26]:
# CJ, CL: event_country_code, event_city
df = onehot_encoder(df, 'event_country_code', 20)
df = onehot_encoder(df, 'event_city', 20)

In [27]:
df.head()

Unnamed: 0,event_names,event_rank,event_started_on,event_ended_on,event_country_code,event_region,event_city,event_event_roles,participant_name,person,organization,speaker,sponsor,exhibitor,organizer,contestant,organization_name,job_started_on,job_ended_on,job_is_current,job_executive,job_employee,job_board_member,job_advisor,job_board_observer,person_personal_name,person_personal_gender,person_personal_country_code,person_personal_region,person_personal_city,person_personal_featured_job_title,person_degree_degree_type,person_degree_subject,person_degree_started_on,person_degree_completed_on,person_degree_is_completed,person_event_name,person_event_rank,person_event_started_on,person_event_ended_on,person_event_country_code,person_event_region,person_event_city,person_event_roles,person_person,person_organization,person_speaker,person_sponsor,person_exhibitor,person_organizer,person_contestant,USA,GBR,DEU,ESP,SGP,CAN,FRA,DNK,HKG,IRL,EST,NLD,CZE,ISR,PRT,BGR,AUT,IDN,ARE,FIN,San Francisco,New York,Las Vegas,London,Orlando,Berlin,Singapore,Boston,Cambridge,Copenhagen,Toronto,Paris,Santa Clara,Berkeley,Washington,Dublin,Madrid,Santa Monica,Cyberport,Tallinn
0,,8293.305429,NaT,NaT,,,,,,,,,,,,,SOICO,,,True,1.0,0.0,0.0,0.0,0.0,Doki Ayaka,female,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,,8293.305429,NaT,NaT,,,,,,,,,,,,,SOICO,,,True,1.0,0.0,0.0,0.0,0.0,Motoshi Shimizu,male,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,,8293.305429,NaT,NaT,,,,,,,,,,,,,SOICO,,,True,0.0,0.0,1.0,0.0,0.0,Tetsuya Sanada,male,,,,Chairman and CEO,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,,8293.305429,NaT,NaT,,,,,,,,,,,,,SOICO,,,True,1.0,0.0,0.0,0.0,0.0,Junichi Kayahara,male,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,,8293.305429,NaT,NaT,,,,,,,,,,,,,NFX,2015-02-01,,True,0.0,0.0,0.0,1.0,0.0,Josh Elman,male,USA,California,San Francisco,Venture Partner,Bachelor,Symbolic Systems,1993-01-01,1997-01-01,True,TechCrunch Disrupt SF 2016,2255.0,12/9/2016,14/9/2016,USA,California,San Francisco,"competition,conference,expo,hackathon,meetup,n...",1.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# DE, DF, DH: person_personal_gender, person_personal_country, person_personal_city
df = onehot_encoder(df, 'person_personal_gender', 2)
df = onehot_encoder(df, 'person_personal_country', 20)
df = onehot_encoder(df, 'person_personal_city', 20)

male                 856774
female               122996
not_provided           3218
transgender_woman        67
agender                   8
mtf                       6
other                     3
androgynous               2
neutrois                  1
Name: person_personal_gender, dtype: int64

In [12]:
# df['person_degree_degree_type'] # not done

Unnamed: 0,event_names,event_rank,event_started_on,event_ended_on,event_country_code,event_region,event_city,event_event_roles,participant_name,person,organization,speaker,sponsor,exhibitor,organizer,contestant,organization_name,job_started_on,job_ended_on,job_is_current,job_executive,job_employee,job_board_member,job_advisor,job_board_observer,person_personal_name,person_personal_gender,person_personal_country_code,person_personal_region,person_personal_city,person_personal_featured_job_title,person_degree_degree_type,person_degree_subject,person_degree_started_on,person_degree_completed_on,person_degree_is_completed,person_event_name,person_event_rank,person_event_started_on,person_event_ended_on,person_event_country_code,person_event_region,person_event_city,person_event_roles,person_person,person_organization,person_speaker,person_sponsor,person_exhibitor,person_organizer,person_contestant,USA,GBR,DEU,ESP,SGP,CAN,FRA,DNK,HKG,IRL,EST,NLD,CZE,ISR,PRT,BGR,AUT,IDN,ARE,FIN
0,,8293.305429,,,,,,,,,,,,,,,SOICO,,,True,1.0,0.0,0.0,0.0,0.0,Doki Ayaka,female,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,,8293.305429,,,,,,,,,,,,,,,SOICO,,,True,1.0,0.0,0.0,0.0,0.0,Motoshi Shimizu,male,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,,8293.305429,,,,,,,,,,,,,,,SOICO,,,True,0.0,0.0,1.0,0.0,0.0,Tetsuya Sanada,male,,,,Chairman and CEO,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,,8293.305429,,,,,,,,,,,,,,,SOICO,,,True,1.0,0.0,0.0,0.0,0.0,Junichi Kayahara,male,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,,8293.305429,,,,,,,,,,,,,,,NFX,2015-02-01,,True,0.0,0.0,0.0,1.0,0.0,Josh Elman,male,USA,California,San Francisco,Venture Partner,Bachelor,Symbolic Systems,1993-01-01,1997-01-01,True,TechCrunch Disrupt SF 2016,2255.0,12/9/2016,14/9/2016,USA,California,San Francisco,"competition,conference,expo,hackathon,meetup,n...",1.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
# DL_person_degree_subject
# df['person_degree_subject'].value_counts()
# df['person_degree_subject'].value_counts().head(50)

unknown                                                                99644
Computer Science                                                       67812
Business                                                               32044
Economics                                                              25396
Law                                                                    23316
Electrical Engineering                                                 20777
Finance                                                                19128
Business Administration                                                12813
Government                                                             11828
General Management                                                      9188
Philosophy                                                              9088
Mechanical Engineering                                                  8105
Engineering                                                             6975