In [1]:
#Import basic packages
import os
import numpy as np
import pandas as pd
import csv
import regex as re
import datetime as dt 

In [2]:
#import outcome file with org_uuid
main_df = pd.read_csv(R"d:/msc-project/data/final/outcome_final.csv",encoding='utf-8',
                         index_col=False,
                         parse_dates= ['founded_on','seed_date','series_a_date','series_b_date','series_c_date'])
main_processing_df = main_df[['org_uuid','name','outcome']]

In [3]:
main_processing_df.head(3)

Unnamed: 0,org_uuid,name,outcome
0,00107fd1-b65b-85cd-7d60-c00c4b2ae2fb,CloudAptitude,0
1,0013719b-13cb-9419-c604-597100dd642f,Win Win Slots,0
2,001c3ed9-6eb1-63b6-eeba-dcdb97f9444f,PetaData Labs SocialDNA,0


In [4]:
len(main_processing_df)

76806

In [5]:
#import jobs file
jobs_df = pd.read_csv(R"d:/msc-project/data/pre-processed/jobs_preprocessed.csv",encoding='utf-8',
                         index_col=False,
                         parse_dates= ['started_on','ended_on'])
jobs_df.head(3)

Unnamed: 0,job_uuid,job_name,people_uuid,person_name,org_uuid,org_name,started_on,ended_on,is_current,title,job_type,is_founder
0,697b6934-fc1f-9d63-cfb2-1a10759b378e,Ben Elowitz Co-Founder/CEO/Board of Directors ...,ed13cd36-fe2b-3707-197b-0c2d56e37a71,Ben Elowitz,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,Co-Founder/CEO/Board of Directors,executive,1
1,b1de3765-442e-b556-9304-551c2a055901,Kevin Flaherty VP Marketing @ Wetpaint,5ceca97b-493c-1446-6249-5aaa33464763,Kevin Flaherty,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,VP Marketing,executive,0
2,1319cd30-f5e8-c700-0af6-64029c6f7124,Raju Vegesna Chief Evangelist @ Zoho,9f99a98a-aa97-b30b-0d36-db67c1d277e0,Raju Vegesna,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,2000-11-01,,True,Chief Evangelist,employee,0


# Assign Number of founder for each organisation

In [6]:
num_founder = pd.DataFrame(jobs_df.groupby('org_uuid')['is_founder'].sum())
main_processing_df_v1 = pd.merge(main_processing_df,
                         num_founder,
                         on='org_uuid',
                         how='left'
                         )
main_processing_df_v1.rename(columns = {'is_founder':'num_founder'},inplace=True)
main_processing_df_v1.head(3)

Unnamed: 0,org_uuid,name,outcome,num_founder
0,00107fd1-b65b-85cd-7d60-c00c4b2ae2fb,CloudAptitude,0,1.0
1,0013719b-13cb-9419-c604-597100dd642f,Win Win Slots,0,1.0
2,001c3ed9-6eb1-63b6-eeba-dcdb97f9444f,PetaData Labs SocialDNA,0,0.0


In [7]:
main_processing_df_v1[main_processing_df_v1['num_founder'].isna()]

Unnamed: 0,org_uuid,name,outcome,num_founder
7,004c2baa-36be-4d84-8731-5418ce248ab5,Altitude Products,0,
10,00662293-6e6e-a876-9070-f1a35e5ae110,Coupsta,0,
14,00a36480-eaaa-4c18-9ee1-ab9d50cacb12,Southern Robotic Foods,0,
15,00a9ee84-66bd-0807-2e70-d4fb5c81cc58,Atlantis,0,
43,020964c6-3ba9-b83a-d2c6-f34440cacc2b,UeeeU.com,0,
...,...,...,...,...
76763,ffcbc4ca-0236-4b6a-a3ac-835f94740f80,Jagger,0,
76767,ffd10ff4-06da-49b3-9d42-91ec518fe3b6,Caihong Lushi,0,
76775,ffd958cc-7c71-4fa8-bee5-ff54cb0ae194,Cutie Pie,0,
76784,ffe01926-79ee-4d17-984c-c9a4f74ccbab,KTVme,1,


In [8]:
#drop companies with no founder info
main_processing_df_v1.dropna(subset=['num_founder'],inplace=True)
main_processing_df_v2 = main_processing_df_v1[main_processing_df_v1['num_founder'] > 0]

In [9]:
len(main_processing_df_v2)

50236

In [10]:
main_processing_df_v2.head(5)

Unnamed: 0,org_uuid,name,outcome,num_founder
0,00107fd1-b65b-85cd-7d60-c00c4b2ae2fb,CloudAptitude,0,1.0
1,0013719b-13cb-9419-c604-597100dd642f,Win Win Slots,0,1.0
3,002194d5-e7cf-cc78-86ca-4213f8edf62b,Smart Lanes,0,2.0
4,002542fe-a20c-aab2-1bcf-6f495fac9ab1,Clay Piggy,0,2.0
5,002f3cd0-9655-4f2d-834f-af6fc7a8a437,Kids on 45th,0,1.0


# Assign Gender Diversity Features

In [11]:
founder_df = jobs_df[jobs_df['is_founder'] == 1]
founder_gb = founder_df.groupby('org_uuid')

In [12]:
people_df = pd.read_csv(R"d:/msc-project/data/pre-processed/people_preprocessed.csv",encoding='utf-8',
                         index_col=False)

In [13]:
people_df.columns

Index(['people_uuid', 'people_name', 'first_name', 'last_name', 'gender',
       'country_code', 'state_code', 'region', 'city', 'org_uuid',
       'featured_job_organization_name', 'featured_job_title', 'facebook_url',
       'linkedin_url', 'twitter_url', 'logo_url', 'description'],
      dtype='object')

In [14]:
founder_gender_df = pd.merge(founder_df[['people_uuid','org_uuid']],
                             people_df[['people_uuid','gender']],
                             on='people_uuid',
                             how='left')

In [15]:
founder_gender_df.head(3)

Unnamed: 0,people_uuid,org_uuid,gender
0,ed13cd36-fe2b-3707-197b-0c2d56e37a71,e1393508-30ea-8a36-3f96-dd3226033abd,male
1,a01b8d46-d311-3333-7c34-aa3ae9c03f22,df662812-7f97-0b43-9d3e-12f64f504fbb,male
2,084aaa07-0795-1fe8-9c46-98bbeb02cd64,df662812-7f97-0b43-9d3e-12f64f504fbb,male


In [16]:
founder_gender_gb = founder_gender_df.groupby('org_uuid')

In [17]:
is_founder_gender_diversity = list()

for org_uuid in list(main_processing_df_v2['org_uuid']):
    founders_gender = list(founder_gender_gb.get_group(org_uuid)['gender']) #create list of founder's gender for one organisation
    if 'male' in founders_gender and 'female' in founders_gender: #assign 1 if male and female in gender list
        is_founder_gender_diversity.append(1)
    else:
        is_founder_gender_diversity.append(0)

main_processing_df_v2['is_founder_gender_diversity'] = is_founder_gender_diversity

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [18]:
main_processing_df_v2.shape

(50236, 5)

In [19]:
main_processing_df_v2.head(3)

Unnamed: 0,org_uuid,name,outcome,num_founder,is_founder_gender_diversity
0,00107fd1-b65b-85cd-7d60-c00c4b2ae2fb,CloudAptitude,0,1.0,0
1,0013719b-13cb-9419-c604-597100dd642f,Win Win Slots,0,1.0,0
3,002194d5-e7cf-cc78-86ca-4213f8edf62b,Smart Lanes,0,2.0,1


# Assigning degree type, ivy league, and top100 to organisation based on founder

In [20]:
#import processed degree file
deg_df = pd.read_csv(R"d:/msc-project/data/pre-processed/degrees_preprocessed.csv",encoding='utf-8',
                         index_col=False)
deg_df.head(3)

Unnamed: 0,deg_uuid,deg_name,people_uuid,person_name,institution_uuid,institution_name,degree_type,subject,started_on,completed_on,is_completed,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league
0,205fdfd1-ecac-aa43-262f-219f11755f67,MS Mass Communication @ Boston University,4897dba9-3141-ecc0-2c4b-c9d844e6440f,John Green,1eab62d2-15d9-0db7-930f-2aa77d4688e1,Boston University,MS,Mass Communication,,1992-01-01,True,0,1,0,0,1,0
1,1a2ac288-eb99-3318-fde5-1517bc168f51,"BA English, French @ Washington University in...",4897dba9-3141-ecc0-2c4b-c9d844e6440f,John Green,6ae9957a-8fb4-0ab1-73fa-dd547c4d3da4,Washington University in St. Louis,BA,"English, French",,1990-01-01,True,1,0,0,0,0,0
2,b978d338-7ccc-7469-5ce7-ef98c34155ad,MS Internet Technology @ University of Greenwich,7d187b77-94f7-e6cc-6981-d7468db5968f,Sridhar Gundaiah,b5ea73f6-12a3-576d-ae9b-f4169147f974,University of Greenwich,MS,Internet Technology,,2006-01-01,True,0,1,0,0,1,0


In [21]:
#merge degree with founder_df
deg_type_df = deg_df[['people_uuid','is_bachelor','is_master','is_phd','is_mba','is_stem','is_ivy_league','is_top_100']]
founder_degree_df = pd.merge(founder_df[['people_uuid','org_uuid']],
                             deg_type_df,
                             on='people_uuid',
                             how='left')
founder_degree_df.fillna(value=0,inplace=True)
len(founder_degree_df)

KeyError: "['is_top_100'] not in index"

In [None]:
founder_degree_df.head(3)

In [None]:
#group by org_uuid and merge with main file
founder_degree_df = founder_degree_df[['org_uuid','is_bachelor','is_master','is_phd','is_mba','is_stem',
                                      'is_ivy_league','is_top_100']].groupby('org_uuid').max()

main_processing_df_v3 = pd.merge(main_processing_df_v2,
                                 founder_degree_df,
                                 on='org_uuid',
                                 how = 'left')

In [None]:
#assign 0 to founder with no degree information
main_processing_df_v3.isnull().sum(axis=0)

In [None]:
main_processing_df_v3.head(10)

# Assign number of advisor

In [None]:
jobs_df['job_type'].value_counts()

In [None]:
jobs_advisor_df = jobs_df[jobs_df['job_type'] == 'advisor']

In [None]:
jobs_advisor_df.head(3)

In [None]:
advisor_gb = jobs_advisor_df.groupby('org_uuid')

In [None]:
num_advisor = advisor_gb.size()
num_advisor = pd.DataFrame(num_advisor,
                          columns=['num_advisor'])

In [None]:
type(num_advisor)

In [None]:
main_processing_df_v4 = pd.merge(main_processing_df_v3,
                                 num_advisor,
                                 on='org_uuid',
                                 how='left')

In [None]:
main_processing_df_v4.head(3)

In [None]:
main_processing_df_v4['num_advisor'].fillna(0,inplace=True)

In [None]:
main_processing_df_v4.head(3)

# Board Composition

In [None]:
jobs_board_df = jobs_df[jobs_df['job_type'] == 'board_member']

In [None]:
jobs_board_df.head(3)

In [None]:
jobs_board_df=jobs_board_df[['people_uuid','org_uuid']]

In [None]:
jobs_board_df.head(3)

In [None]:
investors_df = pd.read_csv(R'd:\msc-project\data\pre-processed\investors_preprocessed.csv',encoding='utf-8')

In [None]:
investors_df.head(3)