In [1]:
# import library
import pandas as pd

### Preparation of Dataframes

In [2]:
org_df = pd.read_csv("cleaned_csv/organizations_cleaned.csv")
ppl_df = pd.read_csv("people_processed.csv")

fund_rd_df = pd.read_csv("cleaned_csv/funding_rounds_cleaned.csv") # each funding round in the dataset
fund_rd_df = fund_rd_df.drop(columns=['name','permalink','cb_url', 'created_at', 'updated_at', 'org_name'])

In [3]:
acq_df = pd.read_csv("acquisitions_processed.csv")
fund_df = pd.read_csv("funds_processed.csv") #investors' investment funds

ipo_df = pd.read_csv("cleaned_csv/ipos_cleaned.csv") 
deg_df = pd.read_csv("cleaned_csv/degrees_cleaned.csv") #people's education background
cate_gp_df = pd.read_csv("cleaned_csv/category_groups_cleaned.csv") #Company name with its group

event_app_df = pd.read_csv("cleaned_csv/event_appearances_cleaned.csv") # Event participation details

event_df= pd.read_csv("events_processed.csv") #Event details

In [4]:
df1 = pd.read_csv("cleaned_csv/jobs_cleaned_1.csv")
df2 = pd.read_csv("cleaned_csv/jobs_cleaned_2.csv")
df3 = pd.read_csv("cleaned_csv/jobs_cleaned_3.csv")
job_df = df1.append(df2)
job_df= job_df.append(df3)
job_df= job_df.reset_index()
job_df= job_df.drop(columns=["index"]) #all job and advisory roles

invest_partner_df= pd.read_csv("investment_partners_processed.csv") #Partners who are responsible for their firm's investments
investor_df= pd.read_csv("investors_processed.csv") #Active investors 
org_parent_df= pd.read_csv("cleaned_csv/org_parents_cleaned.csv") #Mapping between parent organizations and subsidiaries

#All dataframes are now ready, org_df is the master dataframe
org_df

Unnamed: 0,uuid,country_code,region,city,status,category_list,category_groups_list,num_funding_rounds,total_funding_usd,founded_on,last_funding_on,closed_on,employee_count,primary_role,num_exits
0,cf3a40e6-920e-edfd-569e-371f84e0a4e4,USA,California,San Francisco,operating,"Finance,Financial Services,Health Care,Venture...","Financial Services,Health Care,Lending and Inv...",,,2015-01-01,,,101-250,investor,24.0
1,fa65a572-1621-dd22-57a8-92bb49217ac5,GBR,England,London,operating,"Financial Services,FinTech,Venture Capital","Financial Services,Lending and Investments",,,2018-01-01,,,11-50,investor,10.0
2,74a20af3-f4dd-6188-de60-c4ee6cd0ca4a,CHN,Zhejiang,Hangzhou,operating,"Banking,E-Commerce,Financial Services,FinTech,...","Commerce and Shopping,Financial Services,Inter...",4.0,2.200000e+10,2014-10-01,2018-06-08,,5001-10000,company,12.0
3,f33a3674-ec6b-14ca-16dc-437f280dc10b,USA,Virginia,Reston,operating,"E-Commerce,Financial Services,Information Tech...","Commerce and Shopping,Financial Services,Infor...",,,2015-01-01,,,1-10,investor,19.0
4,8fa7fd0d-d5cc-425d-52cc-a2019e7d42a3,USA,California,Santa Monica,operating,"Cyber Security,Developer APIs,FinTech,Software","Financial Services,Information Technology,Priv...",1.0,,2013-08-20,2014-01-23,,1-10,company,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37193,ea942ecf-e2bf-43ce-9ccf-5e435b702b35,USA,Ohio,Delaware,operating,Financial Services,Financial Services,,,2014-01-01,,,11-50,company,
37194,1b05416c-7c71-442b-8d1c-a63400279c68,USA,California,San Jose,operating,"Analytics,Artificial Intelligence,Financial Se...","Artificial Intelligence,Data and Analytics,Fin...",,,2016-01-01,,,11-50,company,
37195,90b4a15b-69ed-4ab0-a257-e1e10b39d3c5,CAN,Ontario,Toronto,operating,"Financial Services,FinTech,Service Industry","Financial Services,Other",,,2018-08-12,,,11-50,company,
37196,a6060229-6086-438e-a5e3-d6580d0105cf,USA,Maryland,Baltimore,operating,"Communities,Financial Services,Impact Investing","Community and Lifestyle,Financial Services,Len...",,,2020-01-01,,,1-10,investor,


### Handling Naming Issues, and dropping some overlapped columns

In [5]:
# Some prefixes are needed due to duplicate coulmn names with other csv
# Some repeated columns existing info in other dataframes can be deleted

acq_df = acq_df.drop(columns=['acquiree_name', 'acquiree_country_code', 'acquiree_region', 'acquiree_city', 'rank'])
acq_df = acq_df.add_prefix('acquisitions_')
fund_df = fund_df.add_prefix('funds_')
ipo_df = ipo_df.add_prefix('ipo_')
fund_rd_df = fund_rd_df.drop(columns=['rank'])
fund_rd_df = fund_rd_df.add_prefix('fund_rd_')
cate_gp_df = cate_gp_df.drop(columns=['name'])
cate_gp_df = cate_gp_df.add_prefix('cat_')
job_df = job_df.add_prefix('job_')
org_parent_df = org_parent_df.add_prefix('parent_org_')

laed_investor_df = investor_df.add_prefix('lead_investor_')
partner_df = investor_df.add_prefix('partner_')
investor_df = investor_df.add_prefix('investor_')
event_df = event_df.add_prefix('event_')
event_df = event_df.rename(columns={'event_name': 'event_names'})
event_app_df = event_app_df.drop(columns=['uuid'])

deg_df = deg_df.add_prefix('degree_')
ppl_df = ppl_df.add_prefix('personal_')

### Firstly, handle investor, partners and fundings. As a partner is also an investor himself, two joinings are needed

In [6]:
# Joining Investor and Partners
invest_join_df = invest_partner_df.set_index('partner_uuid').join(partner_df.set_index('partner_uuid'))
invest_join_df = invest_join_df.set_index('investor_uuid').join(investor_df.set_index('investor_uuid'))
invest_join_df.drop(columns=['uuid'], inplace = True)

In [7]:
# Joining investor and funding rounds
fund_rd_df = fund_rd_df.set_index('fund_rd_uuid').join(invest_join_df.set_index('funding_round_uuid'))

In [8]:
# Joining lead investor and funding rounds
fund_rd_df = laed_investor_df.set_index('lead_investor_uuid').join(fund_rd_df.set_index('fund_rd_lead_investor_uuids'))

### Next, we handle the event. Link the event participants to the events

In [9]:
# Joining event and event_app
event_join_df = event_df.set_index('event_uuid').join(event_app_df.set_index('event_uuid'))

people_event_df = event_join_df.loc[event_join_df['person'] == 1]
org_event_df = event_join_df.loc[event_join_df['organization'] == 1]

### After sorting out all dataframes that are replated to "people entity", we are ready to build up a large dataframe that consists of people_uuid as index

In [10]:
#Joining People and Degrees
ppl_join = ppl_df.set_index('personal_uuid').join(deg_df.set_index('degree_person_uuid'))

In [11]:
#Joining People and People Event Participant 
ppl_join = ppl_join.join(people_event_df.set_index('participant_uuid'))

In [12]:
#Add prefix to avoid overlap of column names
ppl_join = ppl_join.add_prefix('person_')

### Now, it is time to handle organizations, put org_uuid as index

In [13]:
#Joining Organizations and fund_rounds
org_join = org_df.set_index('uuid').join(fund_rd_df.set_index('fund_rd_org_uuid'))

In [14]:
#Joining Organizations and IPO
org_join = org_join.join(ipo_df.set_index('ipo_org_uuid'))

In [15]:
#Joining Organizations and funds
org_join = org_join.join(fund_df.set_index('funds_entity_uuid'))

In [16]:
#Joining Organizations and acquisitions
org_join = org_join.join(acq_df.set_index('acquisitions_acquiree_uuid'))

In [17]:
#Joining Organizations and categories
org_join = org_join.join(cate_gp_df.set_index('cat_uuid'))

In [18]:
#Joining Organizations and its parents, if any
org_join = org_join.join(org_parent_df.set_index('parent_org_uuid'))

In [19]:
#Joining Organizations and Organization Event Participant
org_join = org_join.join(org_event_df.set_index('participant_uuid'))

### Have a look at the current large dataframes and drop some columns (org_join)

In [20]:
#Drop away emtpy columns to improve efficiency
ppl_join.dropna(how = 'all', axis = 1, inplace = True)
#Drop away emtpy columns to improve efficiency
org_join.dropna(how = 'all', axis = 1, inplace = True)

In [21]:
org_join.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54720 entries, 0001a6ec-e7e9-4d1b-8a77-adb5ac815420 to ffffabce-6d4a-b3d1-13c0-4e90cedf5270
Columns: 102 entries, country_code to contestant
dtypes: float64(27), object(75)
memory usage: 43.0+ MB


In [22]:
#Drop away columns that are without any use to improve efficiency
org_join = org_join.drop(columns = ['ipo_uuid', 'ipo_org_name', 'funds_uuid', \
                'acquisitions_uuid', 'parent_org_parent_uuid', 'ipo_stock_symbol'])

### Finally, it is about the linkage between people and organization, through "job"

In [23]:
#Join Job with People
job_join = job_df.set_index('job_person_uuid').join(ppl_join)

In [24]:
#Drop away columns that are without any use to improve efficiency
job_join = job_join.drop(columns=['job_uuid', 'person_participant_name',\
                                 'person_degree_uuid', 'person_degree_institution_uuid', 'person_event_venue_name',\
                                 'person_event_short_description', 'person_event_description'])
job_join = job_join.rename(columns={'job_person_name': 'person_name','person_event_event_roles':'person_event_roles',\
                                   'event_event_roles':'event_roles', 'event_names':'event_name', 'person_event_names': \
                                    'person_event_name', 'job_org_name': 'organization_name'})

In [25]:
#Join Organization with Job
org_join = org_join.join(job_join.set_index('job_org_uuid'))

In [26]:
#Drop away emtpy columns to simplify the final dataframe
org_join.dropna(how = 'all', axis = 1, inplace = True)

### Final preparation for the overall joint dataframe

In [28]:
#Extract the uuid from index and put back into a proper column
org_join['uuid'] = org_join.index
x = org_join.columns.get_loc('organization_name')
#Move the uuid column and the organization name column to the front for easier references
cols = org_join.columns.tolist()
cols = cols[-1:] + cols[x:x+1] + cols[0:x] + cols[x+1:-1]
org_join = org_join[cols]

In [29]:
#Reindex the dataframe so that the index column is not uuid any more, but integer values "0, 1, 2..."
new_index = []
for i in range(len(org_join)):
    new_index.append(i)
org_join.index = new_index

### Have a look at the joint dataframe

In [30]:
#Let's see the info of the final DataFrame
org_join.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1018183 entries, 0 to 1018182
Columns: 133 entries, uuid to person_contestant
dtypes: float64(40), object(93)
memory usage: 1.0+ GB


In [31]:
#The final DataFrame
org_join

Unnamed: 0,uuid,organization_name,country_code,region,city,status,category_list,category_groups_list,num_funding_rounds,total_funding_usd,...,person_event_region,person_event_city,person_event_roles,person_person,person_organization,person_speaker,person_sponsor,person_exhibitor,person_organizer,person_contestant
0,0001a6ec-e7e9-4d1b-8a77-adb5ac815420,SOICO,JPN,Tokyo,Tokyo,operating,"Finance,Financial Services,Recruiting","Financial Services,Professional Services",,,...,,,,,,,,,,
1,0001a6ec-e7e9-4d1b-8a77-adb5ac815420,SOICO,JPN,Tokyo,Tokyo,operating,"Finance,Financial Services,Recruiting","Financial Services,Professional Services",,,...,,,,,,,,,,
2,0001a6ec-e7e9-4d1b-8a77-adb5ac815420,SOICO,JPN,Tokyo,Tokyo,operating,"Finance,Financial Services,Recruiting","Financial Services,Professional Services",,,...,,,,,,,,,,
3,0001a6ec-e7e9-4d1b-8a77-adb5ac815420,SOICO,JPN,Tokyo,Tokyo,operating,"Finance,Financial Services,Recruiting","Financial Services,Professional Services",,,...,,,,,,,,,,
4,0003f244-79d0-6178-353e-33dabaf3b2c6,NFX,USA,California,San Francisco,operating,"Finance,Financial Services,Venture Capital","Financial Services,Lending and Investments",,,...,California,San Francisco,"competition,conference,expo,hackathon,meetup,n...",1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018178,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,ERNIT,USA,New York,New York,operating,"Banking,Enterprise Applications,Finance,Financ...","Apps,Financial Services,Lending and Investment...",6.0,1087000.0,...,,,,,,,,,,
1018179,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,ERNIT,USA,New York,New York,operating,"Banking,Enterprise Applications,Finance,Financ...","Apps,Financial Services,Lending and Investment...",6.0,1087000.0,...,,,,,,,,,,
1018180,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,ERNIT,USA,New York,New York,operating,"Banking,Enterprise Applications,Finance,Financ...","Apps,Financial Services,Lending and Investment...",6.0,1087000.0,...,,,,,,,,,,
1018181,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,ERNIT,USA,New York,New York,operating,"Banking,Enterprise Applications,Finance,Financ...","Apps,Financial Services,Lending and Investment...",6.0,1087000.0,...,,,,,,,,,,


In [33]:
"""
#Export out as CSV, the file is about 216MB
org_join.to_csv('org_final_joined_new.csv')
"""

"\n#Export out as CSV, the file is about 216MB\norg_join.to_csv('org_final_joined_new.csv')\n"

In [34]:
"""
# (Primary for W2) Problem 1: Is the following items the same as organization itself? Can we come up a way to check? If yes, we can drop all duplications to simplify our df

# e.g.
#'ipo_country',vs 'fund_rd_country', vs 'country_code'
#'ipo_region', vs 'fund_rd_region', vs 'state_code'
#'ipo_city', vs 'fund_rd_city', vs 'city'
#'person_personal_rank', vs 'personal_event_rank'
#'total_funding_usd', vs 'total_funding', vs 'funds_raised_amount_usd' 
#'category_list', vs 'category_groups_list' (what should we use eaxctly......)
"""
"""
# (TBC) Problem 2: There are multiple rows for some organization eitites, due to their multiple funding rounds, or multiple people, etc.
#            I plan to keep here first, but we may have to come up with a way to handle this situation
"""
"""
# (TBC) Problem 3: The dataframe consists of a column 'Acquirer'. I chose to join using acquiriee id instead of acquirier id,
#            due to the intuition that venture companies are more likely to be acquired than to acquire others;
#            However, may need to find a way to handle the situation if there is really a startup acquiring others
"""
"""
# (TBC) Problem 4: If a startup company has a parent company, do we also need the information of its parent company in
#            our dataframes. My answer is yes and know, coz not sure how much could its parent company affect it;
#            however, after all we value a startup company all by itself. If we have to incorporate its parents' data,
#            how do we valuate its parents? We may have to build up another dataframe, or else it would be too large. 
#            Also, if its parent is an acquirer, the situation will be much more complex.
"""
"""
# (Primary for W2) Problem 5: Now, all data are raw, many NaN data and are unprocessed, we have to preprocess all the data here at once
#            Also, we have to extract those that are within 5 years and fintech field to be our target data.
"""

'\n# (Primary for W2) Problem 5: Now, all data are raw, many NaN data and are unprocessed, we have to preprocess all the data here at once\n#            Also, we have to extract those that are within 5 years and fintech field to be our target data.\n'