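Sample code for computing ROI (percentage change between the two latest successive funding rounds, divided by the time difference between them).<br>
Put `funding_rounds.csv` here. Note that the loading cell reads `../bulk_export_processed/cleaned_csv/funding_rounds_cleaned.csv`, so the cleaned file must be available at that path.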

In [1]:
# import library
import pandas as pd
import numpy as np
from collections import Counter
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
# Stardust ver unique keyword search
def unique_keyword_search(df_col: pd.Series, num_selected: int) -> list:
    '''
    Finds the most frequent comma-separated keywords in a dataframe's column

    Every string entry is split on ',' (tokens are neither stripped nor
    case-normalised) and the occurrences of each token are counted over the
    whole column. Entries that are not strings, such as NaN or None, are
    skipped.

    :param pd.Series df_col: column of a pd.DataFrame (e.g. df['col'])
    :param int num_selected: maximum number of keywords to return
    :return: up to num_selected keywords in decreasing occurrence (ties keep
        first-encountered order); fewer when the column holds fewer distinct
        keywords, and an empty list when num_selected is not positive
    :rtype: list
    '''
    if num_selected <= 0:
        return []

    keyword_counts = Counter()
    for entry in df_col:
        # Missing values show up as float NaN / None and have no .split().
        if not isinstance(entry, str):
            continue
        keyword_counts.update(entry.split(','))

    # most_common(n) returns at most as many items as there are distinct keys,
    # so asking for more keywords than exist cannot raise IndexError.
    return [keyword for keyword, _ in keyword_counts.most_common(num_selected)]

In [3]:
# Stardust ver one-hot encoder V2
def onehot_encoder_v2(df: pd.DataFrame, col_name: str, list_selected: list) -> pd.DataFrame:
    '''
    One-hot encodes the selected values of one column, modifying the dataframe in place

    For every value in list_selected a new 0/1 column named after that value
    is added (1 where df[col_name] equals the value, 0 otherwise). The source
    column is removed afterwards, so a row whose value is not in
    list_selected ends up with 0 in every indicator column.

    :param pd.DataFrame df: dataframe to be processed (mutated by this call)
    :param str col_name: name of the encoded column
    :param list list_selected: values that each get their own indicator column
    :return: the same dataframe object that was passed in
    :rtype: pd.DataFrame
    '''
    for category in list_selected:
        is_category = df[col_name] == category
        df[category] = np.where(is_category, 1, 0)  # 1 where the row matches, 0 elsewhere

    # the source column is now represented by its indicator columns
    df.drop(columns=col_name, inplace=True)
    return df

In [4]:
# read the CSV
# Only the columns named in keep_col are parsed (usecols) instead of loading every
# column of the file and discarding most of them afterwards.
# NOTE(review): no date column is kept, so any later "successive round" logic depends
# on the row order of this file -- confirm it is chronological per organisation.
FUNDING_ROUNDS_CSV = "../bulk_export_processed/cleaned_csv/funding_rounds_cleaned.csv"
# change the cols to keep
keep_col = ['investment_type','raised_amount_usd','org_uuid'] # cols for illustration purpose only. choose more for real model
# usecols ignores the order of its list, so re-select with keep_col to fix the column
# order; .copy() makes df an independent frame so later clean-up steps never act on a
# slice of another frame.
df = pd.read_csv(FUNDING_ROUNDS_CSV, usecols=keep_col)[keep_col].copy()

In [5]:
# discard rows with any missing value, then the rounds labelled 'series_unknown'
df = df.dropna()
df = df[df['investment_type'] != 'series_unknown']

In [6]:
# investment_types = unique_keyword_search(df['investment_type'], 10)
# investment_types

In [7]:
# Preview of the rounds ordered by raised amount. The sorted frame is only displayed,
# not assigned, so df keeps its row order (sort by ['org_uuid', 'raised_amount_usd']
# instead to see each organisation's rounds next to each other).
df.sort_values('raised_amount_usd')

Unnamed: 0,investment_type,raised_amount_usd,org_uuid
1730,undisclosed,1.000000e+03,3cfa161b-4631-806a-aaf5-8e5523aaf4b4
5292,seed,1.581000e+03,6ee821a7-86ea-cf8e-43e0-773f4bf47417
5665,seed,3.621000e+03,70152a85-1327-ac78-db60-9e3b87efcbd8
5169,seed,5.000000e+03,9eebfab0-a43a-f4fe-a1f0-1e71c5a4c024
4816,seed,5.001000e+03,863960cb-075e-1964-78cb-3e8d52c15a93
...,...,...,...
719,series_d,3.000000e+08,ba83ed1d-efd1-8e9c-910b-aca4c13f81b2
5515,private_equity,3.497873e+08,7c323af1-1f0e-5b14-523a-16197b759aa8
4863,private_equity,4.300000e+08,c7420cf5-8369-0c9f-1483-c111c04fa1c7
5755,debt_financing,4.650000e+08,a367b036-5952-5435-7541-ad7ee8869e24


In [8]:
# ordinal_investment_type = []

# # perform ordinal encoding: seed/angel = 0, series_a = 1, series_b = 2, series_c = 3
# for investment in df['investment_type']:
#     if investment == 'seed' or investment == 'angel':
#         ordinal_investment_type.append(0)
#     elif investment == 'series_a':
#         ordinal_investment_type.append(1)
#     elif investment == 'series_b':
#         ordinal_investment_type.append(2)
#     elif investment == 'series_c':
#         ordinal_investment_type.append(3)
#     else:
#         ordinal_investment_type.append(99)
        
# df['investment_type'] = ordinal_investment_type

In [9]:
# df.groupby(['investment_type'])

In [10]:
# Collect, for every organisation, the list of its investment types and the list of its
# raised amounts, then attach both lists to each of that organisation's rows.
# NOTE(review): the lists follow the current row order of df, not a date -- confirm the
# rows are chronological before reading consecutive entries as successive rounds.
df = df.merge(
    df.groupby('org_uuid', as_index=False).agg(
        investment_list=('investment_type', list),
        raised_amount_usd_list=('raised_amount_usd', list),
    ),
    on='org_uuid',  # the only column the two frames share
    how='inner',
)

In [11]:
# Reduce to one row per organisation: the per-round columns are dropped and only the
# first row of each org_uuid is kept (row labels are left as they were).
df = df.drop(columns=['investment_type', 'raised_amount_usd']).drop_duplicates(subset='org_uuid')

In [12]:
# Describe every pair of consecutive list entries as "<previous type> to <next type>".
# Zipping a list with itself shifted by one yields the consecutive pairs; an
# organisation with a single round has no pair and gets None instead of an empty list.
investment_list = [
    [prev_type + " to " + next_type for prev_type, next_type in zip(rounds, rounds[1:])] or None
    for rounds in df['investment_list']
]

df['investment_delta'] = investment_list

In [13]:
# Change between consecutive raised amounts, normalised by the mean of the two amounts:
# 2 * (next - prev) / (prev + next). For positive amounts the value lies strictly
# between -2 and 2. An organisation with a single round has no pair and gets None.
# NOTE(review): a pair of amounts summing to zero would raise ZeroDivisionError.
amount_list = [
    [2 * (nxt - prev) / (prev + nxt) for prev, nxt in zip(amounts, amounts[1:])] or None
    for amounts in df['raised_amount_usd_list']
]

df['norm_amount_delta'] = amount_list

In [14]:
# The raw per-organisation lists are dropped, and so are the rows that have no delta
# (None in the delta columns), which makes the displayed result easier to check.
df = df.drop(columns=['investment_list', 'raised_amount_usd_list']).dropna()
df

Unnamed: 0,org_uuid,investment_delta,norm_amount_delta
0,df662812-7f97-0b43-9d3e-12f64f504fbb,"[angel to series_a, series_a to series_b, seri...","[1.8484848484848484, 0.736318407960199, 1.5887..."
9,f53cb4de-236e-0b1b-dee8-7104a8b018f9,[series_b to series_a],[-1.1111111111111112]
11,4111dc8b-c0df-2d24-ed33-30cd137b3098,[series_b to series_c],[-0.6666666666666666]
14,56b5f0c1-855f-2ebb-083f-16641f1db2e1,[series_a to series_b],[0.03125]
16,21e77067-5537-408e-cad7-e5e72bb6ad86,"[seed to series_a, series_a to series_b]","[1.9573333333333334, 0.8324154209284028]"
...,...,...,...
4518,5f8959ff-f184-42d8-e6ce-14ad2694d82e,[series_b to series_a],[0.45901639344262296]
4526,1c60bc6a-b81b-2594-984b-d448b314d365,[debt_financing to series_c],[1.0]
4561,2a2427e7-2dbc-3559-8eb7-178237b9a24b,[seed to seed],[0.5457504589239416]
4574,f37b37f7-c9e2-858f-f9bc-49fb8f23cb88,[seed to series_a],[1.8836115742036599]


In [15]:
# largest normalised delta of each organisation
max_amount_delta_list = [np.asarray(deltas).max() for deltas in df['norm_amount_delta']]

df['max_norm_amount_delta'] = max_amount_delta_list

In [16]:
# average normalised delta of each organisation
mean_amount_delta_list = [np.asarray(deltas).mean() for deltas in df['norm_amount_delta']]

df['mean_norm_amount_delta'] = mean_amount_delta_list

In [17]:
# renumber the rows from 0; drop=True discards the old index instead of adding it as a column
df = df.reset_index(drop=True)
df

Unnamed: 0,org_uuid,investment_delta,norm_amount_delta,max_norm_amount_delta,mean_norm_amount_delta
0,df662812-7f97-0b43-9d3e-12f64f504fbb,"[angel to series_a, series_a to series_b, seri...","[1.8484848484848484, 0.736318407960199, 1.5887...",1.848485,0.478597
1,f53cb4de-236e-0b1b-dee8-7104a8b018f9,[series_b to series_a],[-1.1111111111111112],-1.111111,-1.111111
2,4111dc8b-c0df-2d24-ed33-30cd137b3098,[series_b to series_c],[-0.6666666666666666],-0.666667,-0.666667
3,56b5f0c1-855f-2ebb-083f-16641f1db2e1,[series_a to series_b],[0.03125],0.031250,0.031250
4,21e77067-5537-408e-cad7-e5e72bb6ad86,"[seed to series_a, series_a to series_b]","[1.9573333333333334, 0.8324154209284028]",1.957333,1.394874
...,...,...,...,...,...
976,5f8959ff-f184-42d8-e6ce-14ad2694d82e,[series_b to series_a],[0.45901639344262296],0.459016,0.459016
977,1c60bc6a-b81b-2594-984b-d448b314d365,[debt_financing to series_c],[1.0],1.000000,1.000000
978,2a2427e7-2dbc-3559-8eb7-178237b9a24b,[seed to seed],[0.5457504589239416],0.545750,0.545750
979,f37b37f7-c9e2-858f-f9bc-49fb8f23cb88,[seed to series_a],[1.8836115742036599],1.883612,1.883612
