Sample code for finding ROI (% change of two latest successive funding rounds / time difference).<br>
Put `funding_rounds.csv` here.

In [1]:
# import library
import pandas as pd
import numpy as np
from collections import Counter
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
# read the CSV
df = pd.read_csv("../bulk_export_processed/funding_rounds.csv")
#df = pd.read_csv("funding_rounds.csv") # place CSV here
# change the cols to keep
keep_col = ['investment_type','raised_amount_usd','org_uuid','announced_on'] # cols for illustration purpose only. choose more for real model
df = df[keep_col]

In [3]:
df.dropna(how = 'any', inplace=True)

no_series_info = (df['investment_type'] == 'undisclosed') | (df['investment_type'] == 'series_unknown')
df.drop(df[no_series_info].index, inplace=True)

df.drop(df[df['raised_amount_usd'].isnull()].index, inplace=True)

In [4]:
# df.sort_values(by=['org_uuid','raised_amount_usd'])
df.sort_values(by=['raised_amount_usd'])

Unnamed: 0,investment_type,raised_amount_usd,org_uuid,announced_on
247642,angel,0.000000e+00,3dfb60a7-08bb-63be-8d25-8439c78fc7ff,2018-01-03
303873,series_a,0.000000e+00,3dfb60a7-08bb-63be-8d25-8439c78fc7ff,2020-05-14
342056,pre_seed,0.000000e+00,33962bf0-400d-47f5-b24f-3c6eacbd719a,2020-10-01
269592,pre_seed,0.000000e+00,b6c181b3-299c-6117-1144-9b7689dc5b01,2014-01-16
303251,seed,0.000000e+00,ec3dcecf-e6d0-426c-ba17-d300b685368f,2020-03-09
...,...,...,...,...
210794,series_c,1.400000e+10,74a20af3-f4dd-6188-de60-c4ee6cd0ca4a,2018-06-08
55313,post_ipo_equity,2.127194e+10,e693e2f0-50bb-05ab-8a27-4078f5dacf11,2014-02-28
158549,private_equity,2.180169e+10,44996a96-b3bc-2498-2d15-ea91a4b608e2,2017-03-28
293281,post_ipo_equity,2.400000e+10,7ef1577a-8779-8044-4a42-669b796f8e4f,2020-01-06


In [5]:
# ordinal_investment_type = []

# # perform ordinal encoding: seed/angel = 0, series_a = 1, series_b = 2, series_c = 3
# for investment in df['investment_type']:
#     if investment == 'seed' or investment == 'angel':
#         ordinal_investment_type.append(0)
#     elif investment == 'series_a':
#         ordinal_investment_type.append(1)
#     elif investment == 'series_b':
#         ordinal_investment_type.append(2)
#     elif investment == 'series_c':
#         ordinal_investment_type.append(3)
#     else:
#         ordinal_investment_type.append(99)
        
# df['investment_type'] = ordinal_investment_type

# df.groupby(['investment_type'])

In [6]:
# Here, I convert datetime into number of years
df = df.reset_index(drop=True)
num_data = len(df)
today = ["2021-01-28"] * num_data
df['today'] = pd.DataFrame(today)
df['time'] = (pd.to_datetime(df['today'], errors='coerce', format='%Y-%m-%d') - \
              pd.to_datetime(df['announced_on'], errors='coerce', format='%Y-%m-%d'))


df['time'] = pd.to_numeric(df['time'].dt.days, downcast='integer')
df['time_in_year'] = df['time'].apply(lambda x: round(int(x)/365,4))
df.head()

Unnamed: 0,investment_type,raised_amount_usd,org_uuid,announced_on,today,time,time_in_year
0,angel,500000.0,df662812-7f97-0b43-9d3e-12f64f504fbb,2004-09-01,2021-01-28,5993,16.4192
1,series_a,12700000.0,df662812-7f97-0b43-9d3e-12f64f504fbb,2005-05-01,2021-01-28,5751,15.7562
2,series_b,27500000.0,df662812-7f97-0b43-9d3e-12f64f504fbb,2006-04-01,2021-01-28,5416,14.8384
3,series_b,10500000.0,f53cb4de-236e-0b1b-dee8-7104a8b018f9,2006-05-01,2021-01-28,5386,14.7562
4,series_b,10000000.0,4111dc8b-c0df-2d24-ed33-30cd137b3098,2007-03-01,2021-01-28,5082,13.9233


In [7]:
df = df.drop(columns = ['announced_on', 'today','time'])

In [8]:
#sort by latest to earliest, then group it
df = df.sort_values(by='time_in_year', ascending=False)
df = df.merge(df.groupby('org_uuid').agg(investment_list=('investment_type',list), raised_amount_usd_list=('raised_amount_usd',list), time_in_year_list = ('time_in_year',list)).reset_index())

In [9]:
# convert to include the latest two data only
#df['latest_two_inv_type'] = df['investment_list'].apply(lambda x: x[-2:])
#df['latest_two_funds_amount'] = df['raised_amount_usd_list'].apply(lambda x: x[-2:])
#df['latest_two_time_in_year'] = df['time_in_year_list'].apply(lambda x: x[-2:])

In [10]:
df.drop(columns=['investment_type', 'raised_amount_usd'], inplace=True)
# df.drop(columns=['investment_type', 'raised_amount_usd', 'announced_on'], inplace=True)
df.drop_duplicates(subset='org_uuid', inplace=True)
#df

In [11]:
investment_list = []

# zip() gives successive difference list
for investment in df['investment_list']:
    # indicate the change as "series_xxx to series_yyy"
    process = [(i + " to " + j) for i, j in zip(investment[: -1], investment[1 :])]
    investment_list.append(process if len(process) else None)

df['investment_delta'] = investment_list

In [12]:
amount_list = []

for amount in df['raised_amount_usd_list']:
    # find change in amount normalized to the smaller amount
    norm_amount_delta = [2 * (j - i)/ (i + j) if (i + j) != 0 else 0 for i, j in zip(amount[: -1], amount[1 :])]
    amount_list.append(norm_amount_delta if len(norm_amount_delta) else None)

df['norm_amount_delta'] = amount_list

In [13]:
time_elapsed_successive = []

for time in df['time_in_year_list']:
    time_elapsed = [(i-j) if len(time) != 1 else 0 for i, j in zip(time[: -1], time[1 :])]
    time_elapsed_successive.append(time_elapsed)
    
df['time_elapsed_successive_two_funds'] = time_elapsed_successive
df = df.reset_index(drop=True)

In [14]:
#include calculation of momentum based on latest two funds, and its change

momentum = []
for y in range(len(df['raised_amount_usd_list'])):
    if len(df['time_elapsed_successive_two_funds'].iloc[y]) != 0:
        change = []
        for i, j, k in zip(df['raised_amount_usd_list'].iloc[y][: -1], df['raised_amount_usd_list'].iloc[y][1 :], df['time_elapsed_successive_two_funds'].iloc[y][:]):
            if k != 0:
                change.append((i-j)/k)
            else:
                change.append("inf")
    else:
        change = None

    momentum.append(change)
df['momentum'] = momentum

In [15]:
df

Unnamed: 0,org_uuid,time_in_year,investment_list,raised_amount_usd_list,time_in_year_list,investment_delta,norm_amount_delta,time_elapsed_successive_two_funds,momentum
0,cb8014ea-46f9-2740-f108-e0920fdb2c4b,119.8274,"[grant, grant]","[2050345.0, 53846.0]","[119.8274, 3.9918]",[grant to grant],[-1.8976404708507926],[115.8356],[17235.62531725998]
1,b3486553-1d5f-4c90-bc5a-f0018c1dfd3d,117.1562,[pre_seed],[4000000.0],[117.1562],,,[],
2,170e8dd7-dcfb-41e9-8383-d14f572e8bfd,107.7589,"[angel, series_a, series_b, series_c, series_d]","[362757.0, 1611605.0, 3128304.0, 4599259.0, 72...","[107.7589, 6.6356, 5.4247, 4.937, 3.726]","[angel to series_a, series_a to series_b, seri...","[1.2650648665239708, 0.6399696703037969, 0.380...","[101.1233, 1.2109000000000005, 0.4876999999999...","[-12349.755199840196, -1252538.607647204, -301..."
3,14c94b74-c10a-45c5-b569-6f08f35574d2,100.3562,"[pre_seed, pre_seed]","[28211.0, 6957.0]","[100.3562, 0.9397]",[pre_seed to pre_seed],[-1.208712465878071],[99.4165],[213.78744976940447]
4,65d0628c-d1d8-4699-9e61-6ef3200839ab,70.4055,"[seed, pre_seed, pre_seed]","[59.0, 33.0, 59613.0]","[70.4055, 1.2986, 0.4082]","[seed to pre_seed, pre_seed to pre_seed]","[-0.5652173913043478, 1.9977869429634845]","[69.10690000000001, 0.8904]","[0.3762287123282913, -66913.74663072776]"
...,...,...,...,...,...,...,...,...,...
123363,dc5433a7-5837-42e4-990d-c7f240b69390,0.0849,[series_a],[8600000.0],[0.0849],,,[],
123364,72b016e6-446e-4e37-b220-5b932184f8e3,0.0849,[angel],[20000000.0],[0.0849],,,[],
123365,acd73df8-2c53-403d-8cf1-a7e0d54f7677,0.0849,[seed],[488480.0],[0.0849],,,[],
123366,c88592e1-1a02-4ab2-90ae-4938cd839f11,0.0849,[debt_financing],[1627492.0],[0.0849],,,[],


In [16]:
# # for later use I type the date-related code here first
# date_list = []

# for date in df['date_list']: # assume date is joined into df
#     # find time elapsed between rounds
#     time_delta = [(strptime(j, format='%Y-%m-%d') - strptime(i, format='%Y-%m-%d')) for i, j in zip(date[: -1], date[1 :])]
#     date_list.append(time_delta if len(time_delta) else None)

# df['time_elapsed'] = date_list

In [17]:
# find the duration of the job
# take mean for NaN
#mean = df['time_elapsed'].mean()
#df['time_elapsed'].fillna(value=mean, inplace=True)
# convert TimeDelta to int
#df['time_elapsed'] = df['time_elapsed'].apply(lambda x: x.days)

In [18]:
df.drop(columns=['investment_list', 'raised_amount_usd_list','time_in_year_list', 'time_elapsed_successive_two_funds'], inplace=True)
# df.drop(columns=['investment_list', 'raised_amount_usd_list', 'date_list'], inplace=True)
# for easier checking of results, I dropped rows without delta
df.dropna(inplace=True)
df

Unnamed: 0,org_uuid,time_in_year,investment_delta,norm_amount_delta,momentum
0,cb8014ea-46f9-2740-f108-e0920fdb2c4b,119.8274,[grant to grant],[-1.8976404708507926],[17235.62531725998]
2,170e8dd7-dcfb-41e9-8383-d14f572e8bfd,107.7589,"[angel to series_a, series_a to series_b, seri...","[1.2650648665239708, 0.6399696703037969, 0.380...","[-12349.755199840196, -1252538.607647204, -301..."
3,14c94b74-c10a-45c5-b569-6f08f35574d2,100.3562,[pre_seed to pre_seed],[-1.208712465878071],[213.78744976940447]
4,65d0628c-d1d8-4699-9e61-6ef3200839ab,70.4055,"[seed to pre_seed, pre_seed to pre_seed]","[-0.5652173913043478, 1.9977869429634845]","[0.3762287123282913, -66913.74663072776]"
6,7063d087-96b8-2cc1-ee88-c221288acc2a,43.7370,"[seed to seed, seed to post_ipo_equity, post_i...","[0.6086956521739131, 1.996003996003996, 1.4782...","[-72172.38890607259, -7781383.876411787, -4520..."
...,...,...,...,...,...
123234,756ee776-4210-43db-97fd-6c8ddcb30cd3,0.1178,[debt_financing to series_a],[0.0],[inf]
123239,f02593fd-ee80-4831-aac7-975ca689c8f4,0.1178,[series_a to series_b],[0.8193832599118943],[inf]
123255,b5dcc990-b603-4235-aeb4-1fb25d5ae8ab,0.1178,[seed to debt_financing],[-1.2000001274762053],[inf]
123266,37d71eda-45aa-4b9e-b093-f517eac55fa3,0.1151,[seed to debt_financing],[0.6666666666666666],[inf]


In [19]:
# if using our old defined way of ROI, i.e. the momentum between the latest two succesive fundings
df['ROI'] = df['momentum'].apply(lambda x: x[0])

In [20]:
# find max delta
max_amount_delta_list = []

for delta_list in df['norm_amount_delta']:
    max_amount_delta_list.append(np.array(delta_list).max())

df['max_norm_amount_delta'] = max_amount_delta_list

In [21]:
# find mean delta
mean_amount_delta_list = []

for delta_list in df['norm_amount_delta']:
    mean_amount_delta_list.append(np.array(delta_list).mean())

df['mean_norm_amount_delta'] = mean_amount_delta_list

In [22]:
# # find ROI
# roi_col = []

# for index, row in df.iterrows():
#     roi_list = np.divide(row['norm_amount_delta'], row['time_elapsed'])
#     roi_col.append(roi_list)

# df['roi_list'] = roi_col

In [23]:
# # find max ROI
# max_roi_list = []

# for roi_list in df['roi_list']:
#     max_roi_list.append(np.array(roi_list).max())

# df['max_roi'] = max_roi_list

In [24]:
# # find mean ROI
# mean_roi_list = []

# for roi_list in df['roi_list']:
#     mean_roi_list.append(np.array(roi_list).mean())

# df['mean_roi'] = mean_roi_list

In [25]:
df.reset_index(inplace=True)
df.drop(axis=1, labels='index', inplace=True)
df

Unnamed: 0,org_uuid,time_in_year,investment_delta,norm_amount_delta,momentum,ROI,max_norm_amount_delta,mean_norm_amount_delta
0,cb8014ea-46f9-2740-f108-e0920fdb2c4b,119.8274,[grant to grant],[-1.8976404708507926],[17235.62531725998],17235.6,-1.897640,-1.897640
1,170e8dd7-dcfb-41e9-8383-d14f572e8bfd,107.7589,"[angel to series_a, series_a to series_b, seri...","[1.2650648665239708, 0.6399696703037969, 0.380...","[-12349.755199840196, -1252538.607647204, -301...",-12349.8,1.265065,0.682891
2,14c94b74-c10a-45c5-b569-6f08f35574d2,100.3562,[pre_seed to pre_seed],[-1.208712465878071],[213.78744976940447],213.787,-1.208712,-1.208712
3,65d0628c-d1d8-4699-9e61-6ef3200839ab,70.4055,"[seed to pre_seed, pre_seed to pre_seed]","[-0.5652173913043478, 1.9977869429634845]","[0.3762287123282913, -66913.74663072776]",0.376229,1.997787,0.716285
4,7063d087-96b8-2cc1-ee88-c221288acc2a,43.7370,"[seed to seed, seed to post_ipo_equity, post_i...","[0.6086956521739131, 1.996003996003996, 1.4782...","[-72172.38890607259, -7781383.876411787, -4520...",-72172.4,1.996004,1.354073
...,...,...,...,...,...,...,...,...
47797,756ee776-4210-43db-97fd-6c8ddcb30cd3,0.1178,[debt_financing to series_a],[0.0],[inf],inf,0.000000,0.000000
47798,f02593fd-ee80-4831-aac7-975ca689c8f4,0.1178,[series_a to series_b],[0.8193832599118943],[inf],inf,0.819383,0.819383
47799,b5dcc990-b603-4235-aeb4-1fb25d5ae8ab,0.1178,[seed to debt_financing],[-1.2000001274762053],[inf],inf,-1.200000,-1.200000
47800,37d71eda-45aa-4b9e-b093-f517eac55fa3,0.1151,[seed to debt_financing],[0.6666666666666666],[inf],inf,0.666667,0.666667


In [26]:
df.to_csv("ROI.csv", index=False)