This notebook prepares the data (in .csv) from bulk_export_processed by data cleansing and preprocessing.

In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as pltiiiiiiiiiiiiiiii
import math
from collections import Counter

pd.set_option('display.max_columns', None,'display.max_row', None)
# pd.set_option('display.max_rows', 1000)

default_figsize = (15,5)

In [2]:
# import category information
df_ctg = pd.read_csv("../bulk_export_processed/cleaned_csv/category_groups_cleaned.csv")
df_fd_rd = pd.read_csv("../bulk_export_processed/cleaned_csv/funding_rounds_cleaned.csv")
df_org = pd.read_csv("../bulk_export_processed/cleaned_csv/organizations_cleaned.csv")

In [3]:
# df_category_groups.head()
df_org.head()

Unnamed: 0,uuid,country_code,region,city,status,category_list,category_groups_list,num_funding_rounds,total_funding_usd,founded_on,last_funding_on,closed_on,employee_count,primary_role,num_exits
0,cf3a40e6-920e-edfd-569e-371f84e0a4e4,USA,California,San Francisco,operating,"Finance,Financial Services,Health Care,Venture...","Financial Services,Health Care,Lending and Inv...",,,2015-01-01,,,101-250,investor,24.0
1,fa65a572-1621-dd22-57a8-92bb49217ac5,GBR,England,London,operating,"Financial Services,FinTech,Venture Capital","Financial Services,Lending and Investments",,,2018-01-01,,,11-50,investor,10.0
2,74a20af3-f4dd-6188-de60-c4ee6cd0ca4a,CHN,Zhejiang,Hangzhou,operating,"Banking,E-Commerce,Financial Services,FinTech,...","Commerce and Shopping,Financial Services,Inter...",4.0,22000000000.0,2014-10-01,2018-06-08,,5001-10000,company,12.0
3,f33a3674-ec6b-14ca-16dc-437f280dc10b,USA,Virginia,Reston,operating,"E-Commerce,Financial Services,Information Tech...","Commerce and Shopping,Financial Services,Infor...",,,2015-01-01,,,1-10,investor,19.0
4,8fa7fd0d-d5cc-425d-52cc-a2019e7d42a3,USA,California,Santa Monica,operating,"Cyber Security,Developer APIs,FinTech,Software","Financial Services,Information Technology,Priv...",1.0,,2013-08-20,2014-01-23,,1-10,company,


In [4]:
# # count the values of categories
# org_series = df_org['category_list'].value_counts()
# print(org_series)

In [5]:
# Stardust ver unique keyword search
def unique_keyword_search(df_col: pd.Series, num_selected: int) -> list:
    '''
    Performs unique keyword search on a dataframe's column for its most common keywords

    :param pd.Series df_col: column of a pd.DataFrame (e.g. df['col'])
    :param int num_selected: number of keywords
    :return: list of keywords in decreasing occurrence
    :rtype: list
    '''
    keywords = []
    for entry in df_col:
        words = entry.split(',')
        [keywords.append(word) for word in words]
    sorted_keywords = Counter(keywords).most_common()
    output_keywords = []
    for i in range(num_selected):
        output_keywords.append(sorted_keywords[i][0])
    return output_keywords

In [6]:
# Stardust ver one-hot encoder V2
def onehot_encoder_v2(df: pd.DataFrame, col_name: str, list_selected: list) -> pd.DataFrame:
    '''
    Performs one-hot encoding on a dataframe's column for its values with most occurrences

    :param pd.DataFrame df: dataframe to be processed
    :param str col_name: name of the encoded column
    :param list list_selected: list of most common values
    :return: processed dataframe
    :rtype: pd.DataFrame
    '''
    for item in list_selected:
        df[item] = np.where(df[col_name] == item, 1, 0) # whenever df[col_name] == cat replace it with 1 else 0
    df.drop(col_name, axis=1, inplace=True)

    return df

In [7]:
fintech_keywords = unique_keyword_search(df_org['category_list'], 30)

In [8]:
onehot_encoder_v2(df_org, 'category_list', fintech_keywords)
df_org.head(50)

Unnamed: 0,uuid,country_code,region,city,status,category_groups_list,num_funding_rounds,total_funding_usd,founded_on,last_funding_on,closed_on,employee_count,primary_role,num_exits,Financial Services,FinTech,Finance,Blockchain,Information Technology,Software,Payments,Cryptocurrency,Venture Capital,Internet,Banking,Consulting,Mobile Payments,E-Commerce,Insurance,Bitcoin,Artificial Intelligence,Mobile,Crowdfunding,SaaS,Real Estate,Apps,Personal Finance,Accounting,Mobile Apps,Asset Management,Marketplace,Lending,Big Data,Machine Learning
0,cf3a40e6-920e-edfd-569e-371f84e0a4e4,USA,California,San Francisco,operating,"Financial Services,Health Care,Lending and Inv...",,,2015-01-01,,,101-250,investor,24.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,fa65a572-1621-dd22-57a8-92bb49217ac5,GBR,England,London,operating,"Financial Services,Lending and Investments",,,2018-01-01,,,11-50,investor,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,74a20af3-f4dd-6188-de60-c4ee6cd0ca4a,CHN,Zhejiang,Hangzhou,operating,"Commerce and Shopping,Financial Services,Inter...",4.0,22000000000.0,2014-10-01,2018-06-08,,5001-10000,company,12.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,f33a3674-ec6b-14ca-16dc-437f280dc10b,USA,Virginia,Reston,operating,"Commerce and Shopping,Financial Services,Infor...",,,2015-01-01,,,1-10,investor,19.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,8fa7fd0d-d5cc-425d-52cc-a2019e7d42a3,USA,California,Santa Monica,operating,"Financial Services,Information Technology,Priv...",1.0,,2013-08-20,2014-01-23,,1-10,company,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,cb941a47-84d3-9cde-a7bd-8c82ac03c027,USA,Washington,Seattle,operating,"Data and Analytics,Financial Services,Payments...",,,2019-06-01,,,101-250,company,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,c06ed179-3a7b-bae1-b22f-eede6865727a,GBR,England,London,operating,"Financial Services,Media and Entertainment",,,2012-04-01,,,1-10,company,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,673158e3-b671-4027-7d76-b3abdbc2fc18,GBR,England,London,operating,"Financial Services,Lending and Investments",,,2013-01-01,,,1-10,investor,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,7e909f26-0ed2-5cf0-f37c-f6bb6c2f9927,USA,California,San Francisco,operating,"Commerce and Shopping,Financial Services,Lendi...",,,2018-01-01,,,unknown,investor,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,df135cb8-ec66-aac7-7239-1d3634f9602f,AUS,South Australia,Adelaide,operating,"Financial Services,Lending and Investments",,,2012-01-01,,,1-10,investor,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# This cell is obsolete

# use keyword to search for relevant categories
# fintech_searchfor = ['Finance', 'Financial Services']
# tech_searchfor = fintech_searchfor + ['Software', 'Hardware', 'Internet Services', 'Information Technology',
# 'Science and Engineering', 'Sustainability', 'Energy', 'Privacy and Security', 'Data and Analytics',
# 'Messaging and Telecommunications', 'Artificial Intelligence', 'Mobile', 'Platform', 'Biotechnology',
# 'Apps', 'Navigation and Mapping']

# categories can be formed from one or more above elementary keywords
# df_fintech_category = df_category_groups[df_category_groups['category_groups_list'].str.contains('|'.join(fintech_searchfor))]
# df_tech_category = df_category_groups[df_category_groups['category_groups_list'].str.contains('|'.join(tech_searchfor))]

# tech_category = {'Software', 'Hardware', ...} # incomplete
# df_fintech_category = df_category_groups[df_category_groups['category_groups_list'].isin(fintech_category)]


# print(df_fintech_category)

In [None]:
# a primitive plot, will improve laterrr
fig, ax = plt.subplots()

df_org['category_list'].value_counts().plot(kind='bar', figsize=default_figsize)

ax.set_title('Number of companies in fintech categories')
plt.show()

In [None]:
fig, ax = plt.subplots()

investor_count = df_funding_rounds['investor_count'].value_counts().sort_index()
investor_count.plot.bar(figsize=default_figsize)

# ax.set_xlim(1, 10) # bugged, idk why
ax.set_title('Number of investors in funding rounds')
plt.show()

In [None]:
investment_type_series = df_funding_rounds['investment_type'].value_counts()
print(investment_type_series)

In [None]:
# to be improved: group insignificant investment types
fig, ax = plt.subplots()

# investor_count = df_funding_rounds['investor_count'].value_counts()
investment_type_series.plot.bar(figsize=default_figsize)

ax.set_title('Number of investors against investment type')
plt.show()