Sample code for finding ROI (% change of two latest successive funding rounds / time difference).<br>
Put `funding_rounds.csv` here.

In [1]:
# import library
import pandas as pd
import numpy as np
import datetime as dt
from dateutil.relativedelta import *
from collections import Counter
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
# Stardust ver one-hot encoder V2
def onehot_encoder_v2(df: pd.DataFrame, col_name: str, list_selected: list) -> pd.DataFrame:
    '''
    Performs one-hot encoding on a dataframe's column for its values with most occurrences

    :param pd.DataFrame df: dataframe to be processed
    :param str col_name: name of the encoded column
    :param list list_selected: list of most common values
    :return: processed dataframe
    :rtype: pd.DataFrame
    '''
    for item in list_selected:
        df[item] = np.where(df[col_name] == item, 1, 0) # whenever df[col_name] == cat replace it with 1 else 0
    df.drop(col_name, axis=1, inplace=True)

    return df

In [3]:
# read the CSV
df = pd.read_csv("../bulk_export_processed/funding_rounds.csv")
#df = pd.read_csv("funding_rounds.csv") # place CSV here
# change the cols to keep
keep_col = ['investment_type','raised_amount_usd','org_uuid','announced_on'] # cols for illustration purpose only. choose more for real model
df = df[keep_col]

In [4]:
df.dropna(inplace=True)

# no_series_info = (df['investment_type'] == 'undisclosed') | (df['investment_type'] == 'series_unknown')
# df.drop(df[no_series_info].index, inplace=True)

df.drop(df[df['raised_amount_usd'].isnull()].index, inplace=True)

In [5]:
df['announced_on'] = pd.to_datetime(df['announced_on'], errors='coerce', format='%d/%m/%Y')

In [6]:
# NEW
# filter by funding round announcement date:  on or after 2010-1-1, by 2020-12-31
# in accordance with company founding date restriction
start_date = dt.datetime(2010, 1, 1)
end_date = dt.datetime(2020, 12, 31)
after_start_date = df["announced_on"] >= start_date
before_end_date = df["announced_on"] <= end_date
df = df.loc[after_start_date & before_end_date]

In [7]:
df.set_index('announced_on', inplace=True)
df.head()

Unnamed: 0_level_0,investment_type,raised_amount_usd,org_uuid
announced_on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-11-06,series_b,15000000.0,8a899f9c-886e-2b9c-378e-49bbeb316cfc
2011-03-31,series_b,1500000.0,31f9d866-3660-5367-8c52-028a7610e441
2014-11-26,series_unknown,5000000.0,95b566c0-b31a-4586-23f8-5eee072ea621
2010-12-01,series_a,1000000.0,e212a930-25b3-2de3-3c6c-19f0965fcb47
2014-01-09,series_a,1000000.0,e62b140b-2321-d4b5-a74c-19d989be276b


In [8]:
ordinal_investment_type = []

# perform ordinal encoding: seed/angel=1, a=2, b=3, c=4, d=5, e=6, f or above=7 
for investment in df['investment_type']:
    if investment == 'seed' or investment == 'angel':
        ordinal_investment_type.append(1)
    elif investment == 'series_a':
        ordinal_investment_type.append(2)
    elif investment == 'series_b':
        ordinal_investment_type.append(3)
    elif investment == 'series_c':
        ordinal_investment_type.append(4)
    elif investment == 'series_d':
        ordinal_investment_type.append(5)
    elif investment == 'series_e':
        ordinal_investment_type.append(6)
    elif investment == 'series_f' or investment == 'series_g' or investment == 'series_h' \
    or investment == 'series_i' or investment == 'series_j':
        ordinal_investment_type.append(7)
    else:
        ordinal_investment_type.append(0)
        
df['latest_investment_type'] = ordinal_investment_type

# df.groupby(['investment_type'])

In [9]:
df

Unnamed: 0_level_0,investment_type,raised_amount_usd,org_uuid,latest_investment_type
announced_on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-11-06,series_b,15000000.0,8a899f9c-886e-2b9c-378e-49bbeb316cfc,3
2011-03-31,series_b,1500000.0,31f9d866-3660-5367-8c52-028a7610e441,3
2014-11-26,series_unknown,5000000.0,95b566c0-b31a-4586-23f8-5eee072ea621,0
2010-12-01,series_a,1000000.0,e212a930-25b3-2de3-3c6c-19f0965fcb47,2
2014-01-09,series_a,1000000.0,e62b140b-2321-d4b5-a74c-19d989be276b,2
...,...,...,...,...
2020-10-01,pre_seed,65000.0,7091dcc8-0886-4fc8-af3b-864a2adf986c,0
2017-02-17,angel,291337.0,33eceaba-87d5-4526-ac4a-18a18caf9f67,1
2014-10-01,angel,142800.0,8004c9c8-2a7a-4e40-a1df-adea3985bf8a,1
2020-05-27,pre_seed,50000.0,9d92669d-1d86-4ef5-97c5-50a8903d3ffa,0


In [10]:
irregular_funding = df['investment_type'].value_counts().drop(['seed','angel','series_a','series_b','series_c','series_d','series_e','series_f','series_g','series_h','series_i','series_j']).index.tolist()
print(irregular_funding)

['series_unknown', 'grant', 'debt_financing', 'pre_seed', 'post_ipo_equity', 'convertible_note', 'equity_crowdfunding', 'private_equity', 'undisclosed', 'post_ipo_debt', 'corporate_round', 'product_crowdfunding', 'non_equity_assistance', 'initial_coin_offering', 'secondary_market', 'post_ipo_secondary']


In [11]:
onehot_encoder_v2(df, 'investment_type', irregular_funding)

Unnamed: 0_level_0,raised_amount_usd,org_uuid,latest_investment_type,series_unknown,grant,debt_financing,pre_seed,post_ipo_equity,convertible_note,equity_crowdfunding,private_equity,undisclosed,post_ipo_debt,corporate_round,product_crowdfunding,non_equity_assistance,initial_coin_offering,secondary_market,post_ipo_secondary
announced_on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2016-11-06,15000000.0,8a899f9c-886e-2b9c-378e-49bbeb316cfc,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-03-31,1500000.0,31f9d866-3660-5367-8c52-028a7610e441,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2014-11-26,5000000.0,95b566c0-b31a-4586-23f8-5eee072ea621,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2010-12-01,1000000.0,e212a930-25b3-2de3-3c6c-19f0965fcb47,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2014-01-09,1000000.0,e62b140b-2321-d4b5-a74c-19d989be276b,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-01,65000.0,7091dcc8-0886-4fc8-af3b-864a2adf986c,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2017-02-17,291337.0,33eceaba-87d5-4526-ac4a-18a18caf9f67,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2014-10-01,142800.0,8004c9c8-2a7a-4e40-a1df-adea3985bf8a,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2020-05-27,50000.0,9d92669d-1d86-4ef5-97c5-50a8903d3ffa,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
groupby_dict = {'latest_investment_type': np.max, 'raised_amount_usd': np.sum}
for key in irregular_funding:
    groupby_dict[key] = np.sum

df = df.groupby([pd.Grouper(freq='Q', closed='left'), 'org_uuid']).agg(groupby_dict)

In [13]:
df.reset_index(inplace=True)

In [14]:
df

Unnamed: 0,announced_on,org_uuid,latest_investment_type,raised_amount_usd,series_unknown,grant,debt_financing,pre_seed,post_ipo_equity,convertible_note,equity_crowdfunding,private_equity,undisclosed,post_ipo_debt,corporate_round,product_crowdfunding,non_equity_assistance,initial_coin_offering,secondary_market,post_ipo_secondary
0,2010-03-31,003b7da7-7e0a-1ebe-c32c-19210e9dcbdd,0,260000000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2010-03-31,003d89d4-47dc-e35a-ddf6-eda9940f3ba7,1,500000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2010-03-31,006d0b80-23fd-e6d6-b916-1fb3e39ecdee,0,2195236.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2010-03-31,009e8957-d102-c674-6c9c-98eb11ac8e32,2,6500000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2010-03-31,00a4c0e1-c056-4aa7-9262-696dc5192466,1,8000000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226300,2020-12-31,fff4c94a-af41-4d44-8f42-eb72aa48dac3,0,3700000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
226301,2021-03-31,1a3bd65c-e63e-470c-8ce9-fe435fa2e3d3,1,400000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
226302,2021-03-31,23f8d0e6-ee4f-4434-91ac-c8d70ee0fced,1,1091824.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
226303,2021-03-31,6fa4d740-ebc0-e419-f7eb-cc38698b9032,1,2000000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


The code above are new, those below are old

In [15]:
df.sort_values(by=['org_uuid','announced_on'])
# df.sort_values(by=['raised_amount_usd'])

Unnamed: 0,announced_on,org_uuid,latest_investment_type,raised_amount_usd,series_unknown,grant,debt_financing,pre_seed,post_ipo_equity,convertible_note,equity_crowdfunding,private_equity,undisclosed,post_ipo_debt,corporate_round,product_crowdfunding,non_equity_assistance,initial_coin_offering,secondary_market,post_ipo_secondary
135718,2017-09-30,00000aa4-ba42-9b68-a9c3-040c9f3bf9b9,0,82607364.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48379,2014-03-31,00002470-bff7-6226-5800-0ca1b3787b6f,1,600000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
84655,2015-09-30,00002470-bff7-6226-5800-0ca1b3787b6f,1,1200000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
116709,2016-12-31,00002470-bff7-6226-5800-0ca1b3787b6f,1,1600000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
122643,2017-03-31,0000d497-c93a-eea3-eeb0-a943dfb4f71e,0,431576.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78001,2015-03-31,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,0,15000.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
84654,2015-06-30,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,1,80000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
91258,2015-09-30,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,0,82000.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
110433,2016-06-30,ffffabce-6d4a-b3d1-13c0-4e90cedf5270,1,210000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# convert datetime into number of years
df = df.reset_index(drop=True)
num_data = len(df)
today = ["2021-01-28"] * num_data
df['today'] = pd.DataFrame(today)
df['time'] = (pd.to_datetime(df['today'], errors='coerce', format='%Y-%m-%d') - \
              pd.to_datetime(df['announced_on'], errors='coerce', format='%Y-%m-%d'))


df['time'] = pd.to_numeric(df['time'].dt.days, downcast='integer')
df['time_in_year'] = df['time'].apply(lambda x: round(int(x)/365,4))
df.head()

Unnamed: 0,announced_on,org_uuid,latest_investment_type,raised_amount_usd,series_unknown,grant,debt_financing,pre_seed,post_ipo_equity,convertible_note,...,post_ipo_debt,corporate_round,product_crowdfunding,non_equity_assistance,initial_coin_offering,secondary_market,post_ipo_secondary,today,time,time_in_year
0,2010-03-31,003b7da7-7e0a-1ebe-c32c-19210e9dcbdd,0,260000000.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,2021-01-28,3956,10.8384
1,2010-03-31,003d89d4-47dc-e35a-ddf6-eda9940f3ba7,1,500000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2021-01-28,3956,10.8384
2,2010-03-31,006d0b80-23fd-e6d6-b916-1fb3e39ecdee,0,2195236.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,2021-01-28,3956,10.8384
3,2010-03-31,009e8957-d102-c674-6c9c-98eb11ac8e32,2,6500000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2021-01-28,3956,10.8384
4,2010-03-31,00a4c0e1-c056-4aa7-9262-696dc5192466,1,8000000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2021-01-28,3956,10.8384


In [17]:
df.drop(columns=['announced_on','today','time'], inplace=True)

In [18]:
#sort by latest to earliest, then group it
df = df.sort_values(by='time_in_year', ascending=False)
df = df.merge(df.groupby('org_uuid').agg(latest_investment_list=('latest_investment_type',list), raised_amount_usd_list=('raised_amount_usd',list), time_in_year_list = ('time_in_year',list)).reset_index())

In [19]:
df.drop(columns=['latest_investment_type', 'raised_amount_usd'], inplace=True)

In [20]:
df

Unnamed: 0,org_uuid,series_unknown,grant,debt_financing,pre_seed,post_ipo_equity,convertible_note,equity_crowdfunding,private_equity,undisclosed,...,corporate_round,product_crowdfunding,non_equity_assistance,initial_coin_offering,secondary_market,post_ipo_secondary,time_in_year,latest_investment_list,raised_amount_usd_list,time_in_year_list
0,003b7da7-7e0a-1ebe-c32c-19210e9dcbdd,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,10.8384,"[0, 2, 0, 0, 0]","[260000000.0, 3689600.0, 10086407.0, 48063500....","[10.8384, 3.0795, 2.0795, 1.3315, 0.0767]"
1,003b7da7-7e0a-1ebe-c32c-19210e9dcbdd,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3.0795,"[0, 2, 0, 0, 0]","[260000000.0, 3689600.0, 10086407.0, 48063500....","[10.8384, 3.0795, 2.0795, 1.3315, 0.0767]"
2,003b7da7-7e0a-1ebe-c32c-19210e9dcbdd,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0795,"[0, 2, 0, 0, 0]","[260000000.0, 3689600.0, 10086407.0, 48063500....","[10.8384, 3.0795, 2.0795, 1.3315, 0.0767]"
3,003b7da7-7e0a-1ebe-c32c-19210e9dcbdd,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1.3315,"[0, 2, 0, 0, 0]","[260000000.0, 3689600.0, 10086407.0, 48063500....","[10.8384, 3.0795, 2.0795, 1.3315, 0.0767]"
4,003b7da7-7e0a-1ebe-c32c-19210e9dcbdd,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0.0767,"[0, 2, 0, 0, 0]","[260000000.0, 3689600.0, 10086407.0, 48063500....","[10.8384, 3.0795, 2.0795, 1.3315, 0.0767]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226300,54a1740d-d88e-45c4-8466-e97829118e82,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0767,[0],[1295615.0],[0.0767]
226301,5499bd62-6c02-4261-9923-c5a7d6c55845,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0767,[0],[6527500.0],[0.0767]
226302,54459a4b-2844-4a87-b88e-4bf820031609,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0767,[3],[100000000.0],[0.0767]
226303,5428b5b6-72f5-44c5-882a-473781c44ba4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0767,[0],[12000000.0],[0.0767]


In [21]:
irregular_funding_df = df[['org_uuid'] + irregular_funding]
irregular_funding_df_max = irregular_funding_df.groupby('org_uuid').sum()

In [22]:
df.drop_duplicates(subset='org_uuid', inplace=True)
df = df[['org_uuid','time_in_year','latest_investment_list','raised_amount_usd_list','time_in_year_list']]
df = df.set_index('org_uuid').join(irregular_funding_df_max)

In [23]:
df

Unnamed: 0_level_0,time_in_year,latest_investment_list,raised_amount_usd_list,time_in_year_list,series_unknown,grant,debt_financing,pre_seed,post_ipo_equity,convertible_note,equity_crowdfunding,private_equity,undisclosed,post_ipo_debt,corporate_round,product_crowdfunding,non_equity_assistance,initial_coin_offering,secondary_market,post_ipo_secondary
org_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
003b7da7-7e0a-1ebe-c32c-19210e9dcbdd,10.8384,"[0, 2, 0, 0, 0]","[260000000.0, 3689600.0, 10086407.0, 48063500....","[10.8384, 3.0795, 2.0795, 1.3315, 0.0767]",2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
9ee87a58-15e9-2640-7578-4096611abd9e,10.8384,"[3, 0, 0, 0, 0]","[4000000.0, 100000.0, 4635000.0, 850000.0, 331...","[10.8384, 9.8384, 8.5863, 7.3342, 5.8356]",1,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0
a96f7ec5-5914-9f06-f38a-39eacdd8c3fb,10.8384,[0],[980000.0],[10.8384],0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
a9c4bceb-711d-4155-bc5e-1698f7cbc972,10.8384,"[2, 3]","[2928804.0, 20331200.0]","[10.8384, 6.0822]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
a9d8ce09-b7aa-5415-7f98-f7b41b046af6,10.8384,"[0, 0]","[3100000.0, 13369999.0]","[10.8384, 3.0795]",0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54a1740d-d88e-45c4-8466-e97829118e82,0.0767,[0],[1295615.0],[0.0767],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5499bd62-6c02-4261-9923-c5a7d6c55845,0.0767,[0],[6527500.0],[0.0767],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
54459a4b-2844-4a87-b88e-4bf820031609,0.0767,[3],[100000000.0],[0.0767],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5428b5b6-72f5-44c5-882a-473781c44ba4,0.0767,[0],[12000000.0],[0.0767],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# convert to include the latest two data only
#df['latest_two_inv_type'] = df['investment_list'].apply(lambda x: x[-2:])
#df['latest_two_funds_amount'] = df['raised_amount_usd_list'].apply(lambda x: x[-2:])
#df['latest_two_time_in_year'] = df['time_in_year_list'].apply(lambda x: x[-2:])

In [25]:
# By Stardust: Pls put anything u want to do with lastest investment here


# investment_list = []

# # zip() gives successive difference list
# for investment in df['investment_list']:
#     # indicate the change as "series_xxx to series_yyy"
#     process = [(i + " to " + j) for i, j in zip(investment[: -1], investment[1 :])]
#     investment_list.append(process if len(process) else None)

# df['investment_delta'] = investment_list

In [26]:
amount_list = []

for amount in df['raised_amount_usd_list']:
    # find change in amount normalized to the smaller amount
    norm_amount_delta = [2 * (j - i)/ (i + j) if (i + j) != 0 else 0 for i, j in zip(amount[: -1], amount[1 :])]
    amount_list.append(norm_amount_delta if len(norm_amount_delta) else None)

df['norm_amount_delta'] = amount_list

In [27]:
time_elapsed_successive = []

for time in df['time_in_year_list']:
    time_elapsed = [(i-j) if len(time) != 1 else 0 for i, j in zip(time[: -1], time[1 :])]
    time_elapsed_successive.append(time_elapsed)
    
df['time_elapsed_successive_two_funds'] = time_elapsed_successive
df = df.reset_index(drop=True)

In [28]:
#include calculation of momentum based on latest two funds, and its change

momentum = []
for y in range(len(df['norm_amount_delta'])):
    if df['norm_amount_delta'].iloc[y] != None:
        change = []
        for j, k in zip(df['norm_amount_delta'].iloc[y][:], df['time_elapsed_successive_two_funds'].iloc[y][:]):
            if k != 0:
                change.append((j)/k)
            else:
                change.append("inf")
    else:
        change = None

    momentum.append(change)
df['momentum'] = momentum

In [29]:
df

Unnamed: 0,time_in_year,latest_investment_list,raised_amount_usd_list,time_in_year_list,series_unknown,grant,debt_financing,pre_seed,post_ipo_equity,convertible_note,...,post_ipo_debt,corporate_round,product_crowdfunding,non_equity_assistance,initial_coin_offering,secondary_market,post_ipo_secondary,norm_amount_delta,time_elapsed_successive_two_funds,momentum
0,10.8384,"[0, 2, 0, 0, 0]","[260000000.0, 3689600.0, 10086407.0, 48063500....","[10.8384, 3.0795, 2.0795, 1.3315, 0.0767]",2,0,1,0,0,0,...,0,0,0,0,0,0,0,"[-1.9440311639139352, 0.9286881169558059, 1.30...","[7.758900000000001, 1.0, 0.748, 1.2548]","[-0.2505549967023592, 0.9286881169558059, 1.74..."
1,10.8384,"[3, 0, 0, 0, 0]","[4000000.0, 100000.0, 4635000.0, 850000.0, 331...","[10.8384, 9.8384, 8.5863, 7.3342, 5.8356]",1,0,2,0,0,0,...,0,0,0,0,0,0,0,"[-1.9024390243902438, 1.9155227032734952, -1.3...","[1.0, 1.2521000000000004, 1.2520999999999995, ...","[-1.9024390243902438, 1.5298480179486418, -1.1..."
2,10.8384,[0],[980000.0],[10.8384],0,0,0,0,1,0,...,0,0,0,0,0,0,0,,[],
3,10.8384,"[2, 3]","[2928804.0, 20331200.0]","[10.8384, 6.0822]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,[1.496336458067677],[4.7562],[0.3146075560463557]
4,10.8384,"[0, 0]","[3100000.0, 13369999.0]","[10.8384, 3.0795]",0,0,0,0,2,0,...,0,0,0,0,0,0,0,[1.2471159227149922],[7.758900000000001],[0.1607335991848061]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127676,0.0767,[0],[1295615.0],[0.0767],1,0,0,0,0,0,...,0,0,0,0,0,0,0,,[],
127677,0.0767,[0],[6527500.0],[0.0767],1,0,0,0,0,0,...,0,0,0,0,0,0,0,,[],
127678,0.0767,[3],[100000000.0],[0.0767],0,0,0,0,0,0,...,0,0,0,0,0,0,0,,[],
127679,0.0767,[0],[12000000.0],[0.0767],1,0,0,0,0,0,...,0,0,0,0,0,0,0,,[],


In [16]:
# # for later use I type the date-related code here first
# date_list = []

# for date in df['date_list']: # assume date is joined into df
#     # find time elapsed between rounds
#     time_delta = [(strptime(j, format='%Y-%m-%d') - strptime(i, format='%Y-%m-%d')) for i, j in zip(date[: -1], date[1 :])]
#     date_list.append(time_delta if len(time_delta) else None)

# df['time_elapsed'] = date_list

In [17]:
# find the duration of the job
# take mean for NaN
#mean = df['time_elapsed'].mean()
#df['time_elapsed'].fillna(value=mean, inplace=True)
# convert TimeDelta to int
#df['time_elapsed'] = df['time_elapsed'].apply(lambda x: x.days)

In [18]:
df.drop(columns=['investment_list', 'raised_amount_usd_list','time_in_year_list', 'time_elapsed_successive_two_funds'], inplace=True)
# df.drop(columns=['investment_list', 'raised_amount_usd_list', 'date_list'], inplace=True)
# for easier checking of results, I dropped rows without delta
#df.dropna(inplace=True)   ** Boris: I think no need to dropna here coz fundings even without delta could be useful
df

Unnamed: 0,org_uuid,time_in_year,investment_delta,norm_amount_delta,momentum
0,cb8014ea-46f9-2740-f108-e0920fdb2c4b,119.8274,[grant to grant],[-1.8976404708507926],[-0.0163821870897271]
1,b3486553-1d5f-4c90-bc5a-f0018c1dfd3d,117.1562,,,
2,170e8dd7-dcfb-41e9-8383-d14f572e8bfd,107.7589,"[angel to series_a, series_a to series_b, seri...","[1.2650648665239708, 0.6399696703037969, 0.380...","[0.012510122459650453, 0.5285074492557574, 0.7..."
3,14c94b74-c10a-45c5-b569-6f08f35574d2,100.3562,[pre_seed to pre_seed],[-1.208712465878071],[-0.012158066979606716]
4,65d0628c-d1d8-4699-9e61-6ef3200839ab,70.4055,"[seed to pre_seed, pre_seed to pre_seed]","[-0.5652173913043478, 1.9977869429634845]","[-0.008178885050615028, 2.2436960275870224]"
...,...,...,...,...,...
123363,dc5433a7-5837-42e4-990d-c7f240b69390,0.0849,,,
123364,72b016e6-446e-4e37-b220-5b932184f8e3,0.0849,,,
123365,acd73df8-2c53-403d-8cf1-a7e0d54f7677,0.0849,,,
123366,c88592e1-1a02-4ab2-90ae-4938cd839f11,0.0849,,,


In [19]:
# if using our old defined way of ROI, i.e. the momentum between the latest two succesive fundings
df['ROI'] = df['momentum'].apply(lambda x: x[0] if x != None else None)

In [20]:
# find max delta
max_amount_delta_list = []

for delta_list in df['norm_amount_delta']:
    max_amount_delta_list.append(np.array(delta_list).max())

df['max_norm_amount_delta'] = max_amount_delta_list

In [21]:
# find mean delta
mean_amount_delta_list = []

for delta_list in df['norm_amount_delta']:
    mean_amount_delta_list.append(np.array(delta_list).mean() if delta_list != None else None)

df['mean_norm_amount_delta'] = mean_amount_delta_list

In [22]:
# # find ROI
# roi_col = []

# for index, row in df.iterrows():
#     roi_list = np.divide(row['norm_amount_delta'], row['time_elapsed'])
#     roi_col.append(roi_list)

# df['roi_list'] = roi_col

In [23]:
# # find max ROI
# max_roi_list = []

# for roi_list in df['roi_list']:
#     max_roi_list.append(np.array(roi_list).max())

# df['max_roi'] = max_roi_list

In [24]:
# # find mean ROI
# mean_roi_list = []

# for roi_list in df['roi_list']:
#     mean_roi_list.append(np.array(roi_list).mean())

# df['mean_roi'] = mean_roi_list

In [25]:
df.reset_index(inplace=True)
df.drop(axis=1, labels='index', inplace=True)
df

Unnamed: 0,org_uuid,time_in_year,investment_delta,norm_amount_delta,momentum,ROI,max_norm_amount_delta,mean_norm_amount_delta
0,cb8014ea-46f9-2740-f108-e0920fdb2c4b,119.8274,[grant to grant],[-1.8976404708507926],[-0.0163821870897271],-0.0163822,-1.897640,-1.897640
1,b3486553-1d5f-4c90-bc5a-f0018c1dfd3d,117.1562,,,,,,
2,170e8dd7-dcfb-41e9-8383-d14f572e8bfd,107.7589,"[angel to series_a, series_a to series_b, seri...","[1.2650648665239708, 0.6399696703037969, 0.380...","[0.012510122459650453, 0.5285074492557574, 0.7...",0.0125101,1.265065,0.682891
3,14c94b74-c10a-45c5-b569-6f08f35574d2,100.3562,[pre_seed to pre_seed],[-1.208712465878071],[-0.012158066979606716],-0.0121581,-1.208712,-1.208712
4,65d0628c-d1d8-4699-9e61-6ef3200839ab,70.4055,"[seed to pre_seed, pre_seed to pre_seed]","[-0.5652173913043478, 1.9977869429634845]","[-0.008178885050615028, 2.2436960275870224]",-0.00817889,1.997787,0.716285
...,...,...,...,...,...,...,...,...
123363,dc5433a7-5837-42e4-990d-c7f240b69390,0.0849,,,,,,
123364,72b016e6-446e-4e37-b220-5b932184f8e3,0.0849,,,,,,
123365,acd73df8-2c53-403d-8cf1-a7e0d54f7677,0.0849,,,,,,
123366,c88592e1-1a02-4ab2-90ae-4938cd839f11,0.0849,,,,,,


In [26]:
# df.to_csv("ROI.csv", index=False)