Sample code for finding ROI (% change of two latest successive funding rounds / time difference).<br>
Put `funding_rounds.csv` here.

In [1]:
# import library
import pandas as pd
import numpy as np
from collections import Counter
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
# read the CSV
# df = pd.read_csv("../bulk_export_processed/cleaned_csv/funding_rounds_cleaned.csv")
df = pd.read_csv("funding_rounds.csv") # place CSV here
# change the cols to keep
keep_col = ['investment_type','raised_amount_usd','org_uuid'] # cols for illustration purpose only. choose more for real model
df = df[keep_col]

In [3]:
df.dropna(inplace=True)

no_series_info = (df['investment_type'] == 'undisclosed') | (df['investment_type'] == 'series_unknown')
df.drop(df[no_series_info].index, inplace=True)

df.drop(df[df['raised_amount_usd'].isnull()].index, inplace=True)

In [4]:
# df.sort_values(by=['org_uuid','raised_amount_usd'])
df.sort_values(by=['raised_amount_usd'])

Unnamed: 0,investment_type,raised_amount_usd,org_uuid
247642,angel,0.000000e+00,3dfb60a7-08bb-63be-8d25-8439c78fc7ff
303873,series_a,0.000000e+00,3dfb60a7-08bb-63be-8d25-8439c78fc7ff
342056,pre_seed,0.000000e+00,33962bf0-400d-47f5-b24f-3c6eacbd719a
269592,pre_seed,0.000000e+00,b6c181b3-299c-6117-1144-9b7689dc5b01
303251,seed,0.000000e+00,ec3dcecf-e6d0-426c-ba17-d300b685368f
...,...,...,...
210794,series_c,1.400000e+10,74a20af3-f4dd-6188-de60-c4ee6cd0ca4a
55313,post_ipo_equity,2.127194e+10,e693e2f0-50bb-05ab-8a27-4078f5dacf11
158549,private_equity,2.180169e+10,44996a96-b3bc-2498-2d15-ea91a4b608e2
293281,post_ipo_equity,2.400000e+10,7ef1577a-8779-8044-4a42-669b796f8e4f


In [5]:
# ordinal_investment_type = []

# # perform ordinal encoding: seed/angel = 0, series_a = 1, series_b = 2, series_c = 3
# for investment in df['investment_type']:
#     if investment == 'seed' or investment == 'angel':
#         ordinal_investment_type.append(0)
#     elif investment == 'series_a':
#         ordinal_investment_type.append(1)
#     elif investment == 'series_b':
#         ordinal_investment_type.append(2)
#     elif investment == 'series_c':
#         ordinal_investment_type.append(3)
#     else:
#         ordinal_investment_type.append(99)
        
# df['investment_type'] = ordinal_investment_type

In [6]:
# df.groupby(['investment_type'])

In [7]:
df = df.merge(df.groupby('org_uuid').agg(investment_list=('investment_type',list), raised_amount_usd_list=('raised_amount_usd',list)).reset_index())

In [8]:
df.drop(columns=['investment_type', 'raised_amount_usd'], inplace=True)
df.drop_duplicates(subset='org_uuid', inplace=True)

In [9]:
investment_list = []

# zip() gives successive difference list
for investment in df['investment_list']:
    # indicate the change as "series_xxx to series_yyy"
    process = [(i + " to " + j) for i, j in zip(investment[: -1], investment[1 :])]
    investment_list.append(process if len(process) else None)

df['investment_delta'] = investment_list

In [10]:
amount_list = []

for amount in df['raised_amount_usd_list']:
    # find change in amount normalized to the smaller amount
    norm_amount_delta = [2 * (j - i)/ (i + j) if (i + j) != 0 else 0 for i, j in zip(amount[: -1], amount[1 :])]
    amount_list.append(norm_amount_delta if len(norm_amount_delta) else None)

df['norm_amount_delta'] = amount_list

In [11]:
df.drop(columns=['investment_list', 'raised_amount_usd_list'], inplace=True)
# for easier checking of results, I dropped rows without delta
df.dropna(inplace=True)
df

Unnamed: 0,org_uuid,investment_delta,norm_amount_delta
0,df662812-7f97-0b43-9d3e-12f64f504fbb,"[angel to series_a, series_a to series_b, seri...","[1.8484848484848484, 0.736318407960199, 1.5887..."
12,f53cb4de-236e-0b1b-dee8-7104a8b018f9,[series_b to series_a],[-1.1111111111111112]
14,4111dc8b-c0df-2d24-ed33-30cd137b3098,[series_b to series_c],[-0.6666666666666666]
16,3d16cb4c-911e-75c0-de5a-15c316b39f98,[seed to series_b],[1.6756756756756757]
18,56b5f0c1-855f-2ebb-083f-16641f1db2e1,"[series_a to series_b, series_b to series_c]","[0.03125, -0.2033898305084746]"
...,...,...,...
220583,93e8e62f-8265-4cd8-861a-c3985e56b57a,[series_a to angel],[-0.9869657395907819]
220585,c8855bd8-1693-47d8-b922-3570bcea7c72,[series_a to angel],[-0.9953259028235109]
220589,7ce96082-3b68-4be6-abd5-aecf9ba6b6ca,"[angel to series_b, series_b to series_a]","[1.4173679047827679, -0.5868641679936706]"
220592,90cfa538-3926-42f8-ab77-7fd3f28a4a0e,[angel to series_a],[1.6259504608017652]


In [12]:
# find max delta
max_amount_delta_list = []

for delta_list in df['norm_amount_delta']:
    max_amount_delta_list.append(np.array(delta_list).max())

df['max_norm_amount_delta'] = max_amount_delta_list

In [13]:
# find mean delta
mean_amount_delta_list = []

for delta_list in df['norm_amount_delta']:
    mean_amount_delta_list.append(np.array(delta_list).mean())

df['mean_norm_amount_delta'] = mean_amount_delta_list

In [14]:
df.reset_index(inplace=True)
df.drop(axis=1, labels='index', inplace=True)
df

Unnamed: 0,org_uuid,investment_delta,norm_amount_delta,max_norm_amount_delta,mean_norm_amount_delta
0,df662812-7f97-0b43-9d3e-12f64f504fbb,"[angel to series_a, series_a to series_b, seri...","[1.8484848484848484, 0.736318407960199, 1.5887...",1.848485,0.603635
1,f53cb4de-236e-0b1b-dee8-7104a8b018f9,[series_b to series_a],[-1.1111111111111112],-1.111111,-1.111111
2,4111dc8b-c0df-2d24-ed33-30cd137b3098,[series_b to series_c],[-0.6666666666666666],-0.666667,-0.666667
3,3d16cb4c-911e-75c0-de5a-15c316b39f98,[seed to series_b],[1.6756756756756757],1.675676,1.675676
4,56b5f0c1-855f-2ebb-083f-16641f1db2e1,"[series_a to series_b, series_b to series_c]","[0.03125, -0.2033898305084746]",0.031250,-0.086070
...,...,...,...,...,...
47797,93e8e62f-8265-4cd8-861a-c3985e56b57a,[series_a to angel],[-0.9869657395907819],-0.986966,-0.986966
47798,c8855bd8-1693-47d8-b922-3570bcea7c72,[series_a to angel],[-0.9953259028235109],-0.995326,-0.995326
47799,7ce96082-3b68-4be6-abd5-aecf9ba6b6ca,"[angel to series_b, series_b to series_a]","[1.4173679047827679, -0.5868641679936706]",1.417368,0.415252
47800,90cfa538-3926-42f8-ab77-7fd3f28a4a0e,[angel to series_a],[1.6259504608017652],1.625950,1.625950
