In [1]:
import pandas as pd
import numpy as np
import chardet

#Detect csv file encode
rawdata = open('crunchbase-investments.csv', 'rb').read()
result = chardet.detect(rawdata)

result

{'encoding': 'Windows-1254',
 'confidence': 0.5299577919124673,
 'language': 'Turkish'}

# Processing chunks and determine memory footprint

In [2]:
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding='iso-8859-1')

chunk_memory = []
chunk_usage = []

for chunk in chunk_iter:
    chunk_memory.append((chunk.memory_usage(deep=True,index=False) / (1024 ** 2)))
    chunk_usage.append(sum(chunk.memory_usage(deep=True,index=False) / (1024 ** 2))) 
total_chunk = pd.concat(chunk_memory)
total_chunk_vc = total_chunk.groupby(total_chunk.index).sum()
print('Memory Footprint by Column')    
print(total_chunk_vc.sort_values())
print("\n")
print("Total memory footprint of all of the chunks combined: {} Mb".format(round(total_chunk.sum(), 2)))
print('Chunk memory footprint range: {} Mb - {} Mb'.format(round(chunk_usage[-1], 2), round(chunk_usage[0], 2)))

Memory Footprint by Column
raised_amount_usd         0.403366
funded_year               0.403366
investor_category_code    0.593590
investor_state_code       2.361876
investor_country_code     2.524654
investor_city             2.751430
company_state_code        2.962161
company_country_code      3.025223
funded_quarter            3.226837
funded_month              3.226837
investor_region           3.238946
funding_round_type        3.252704
company_region            3.253522
company_category_code     3.262619
company_city              3.343493
funded_at                 3.378091
company_name              3.424955
investor_name             3.734270
company_permalink         3.869808
investor_permalink        4.749821
dtype: float64


Total memory footprint of all of the chunks combined: 56.99 Mb
Chunk memory footprint range: 2.66 Mb - 5.58 Mb


# Calculate missing values for each column

In [3]:
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding='iso-8859-1')

missing = []
total_row = 0

for chunk in chunk_iter:
    missing.append(chunk.isnull().sum())
    total_row += len(chunk)

total_missing = pd.concat(missing)
total_missing_val = total_missing.groupby(total_missing.index).sum()

total_missing_val = total_missing_val.to_frame()

total_missing_val.columns = ['Missing']

total_missing_val['pct_missing'] = round(total_missing_val['Missing'] * 100 / total_row, 2)

total_missing_val.sort_values('pct_missing', ascending=False)

Unnamed: 0,Missing,pct_missing
investor_category_code,50427,95.38
investor_state_code,16809,31.79
investor_city,12480,23.61
investor_country_code,12001,22.7
raised_amount_usd,3599,6.81
company_category_code,643,1.22
company_city,533,1.01
company_state_code,492,0.93
funded_at,3,0.01
funded_month,3,0.01


# Identify the types for each column

In [4]:
# Empty dict to store types for each columns

col_type = {}

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding='iso-8859-1')

for chunk in chunk_iter:
    for col in chunk.columns:
        if col not in col_type:
            col_type[col] = [str(chunk[col].dtypes)]
        else:
            break
            
col_type

{'company_permalink': ['object'],
 'company_name': ['object'],
 'company_category_code': ['object'],
 'company_country_code': ['object'],
 'company_state_code': ['object'],
 'company_region': ['object'],
 'company_city': ['object'],
 'investor_permalink': ['object'],
 'investor_name': ['object'],
 'investor_category_code': ['object'],
 'investor_country_code': ['object'],
 'investor_state_code': ['object'],
 'investor_region': ['object'],
 'investor_city': ['object'],
 'funding_round_type': ['object'],
 'funded_at': ['object'],
 'funded_month': ['object'],
 'funded_quarter': ['object'],
 'funded_year': ['int64'],
 'raised_amount_usd': ['float64']}

In [5]:
# Explore possible numeric columns

chunk.head(5)

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
50000,/company/nuorder,NuORDER,fashion,USA,CA,Los Angeles,West Hollywood,/person/mortimer-singer,Mortimer Singer,,,,unknown,,series-a,2012-10-01,2012-10,2012-Q4,2012,3060000.0
50001,/company/chacha,ChaCha,advertising,USA,IN,Indianapolis,Carmel,/person/morton-meyerson,Morton Meyerson,,,,unknown,,series-b,2007-10-01,2007-10,2007-Q4,2007,12000000.0
50002,/company/binfire,Binfire,software,USA,FL,Bocat Raton,Bocat Raton,/person/moshe-ariel,Moshe Ariel,,,,unknown,,angel,2008-04-18,2008-04,2008-Q2,2008,500000.0
50003,/company/binfire,Binfire,software,USA,FL,Bocat Raton,Bocat Raton,/person/moshe-ariel,Moshe Ariel,,,,unknown,,angel,2010-01-01,2010-01,2010-Q1,2010,750000.0
50004,/company/unified-color,Unified Color,software,USA,CA,SF Bay,South San Frnacisco,/person/mr-andrew-oung,Mr. Andrew Oung,,,,unknown,,angel,2010-01-01,2010-01,2010-Q1,2010,


# Identify columns to drop

In [6]:
# investor_category_code needs to be dropped due to high missing values percentage
# Remove columns with URl links ('company_permalink' and 'investor_permalink')
# Remove redundant columns like investor_region, funded_month and funded_year

drop_cols = ['investor_permalink', 'company_permalink', 'investor_category_code', 'funded_month', 'investor_region']
keep_cols = chunk.columns.drop(drop_cols)

col_type_refined = {}

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding='iso-8859-1',usecols=keep_cols, parse_dates=['funded_at'])

for chunk in chunk_iter:
    for col in chunk.columns:
        if col not in col_type_refined:
            col_type_refined[col] = [str(chunk[col].dtypes)]
        else:
            break
            
col_type_refined

{'company_name': ['object'],
 'company_category_code': ['object'],
 'company_country_code': ['object'],
 'company_state_code': ['object'],
 'company_region': ['object'],
 'company_city': ['object'],
 'investor_name': ['object'],
 'investor_country_code': ['object'],
 'investor_state_code': ['object'],
 'investor_city': ['object'],
 'funding_round_type': ['object'],
 'funded_at': ['datetime64[ns]'],
 'funded_quarter': ['object'],
 'funded_year': ['int64'],
 'raised_amount_usd': ['float64']}

In [7]:
chunk.head(5)

Unnamed: 0,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_name,investor_country_code,investor_state_code,investor_city,funding_round_type,funded_at,funded_quarter,funded_year,raised_amount_usd
50000,NuORDER,fashion,USA,CA,Los Angeles,West Hollywood,Mortimer Singer,,,,series-a,2012-10-01,2012-Q4,2012,3060000.0
50001,ChaCha,advertising,USA,IN,Indianapolis,Carmel,Morton Meyerson,,,,series-b,2007-10-01,2007-Q4,2007,12000000.0
50002,Binfire,software,USA,FL,Bocat Raton,Bocat Raton,Moshe Ariel,,,,angel,2008-04-18,2008-Q2,2008,500000.0
50003,Binfire,software,USA,FL,Bocat Raton,Bocat Raton,Moshe Ariel,,,,angel,2010-01-01,2010-Q1,2010,750000.0
50004,Unified Color,software,USA,CA,SF Bay,South San Frnacisco,Mr. Andrew Oung,,,,angel,2010-01-01,2010-Q1,2010,


# Percentage of unique values in each column

In [8]:
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding='iso-8859-1',usecols=keep_cols, parse_dates=['funded_at'])

unique_dct = {}

for chunk in chunk_iter:
    for col in chunk.columns:
        unique_count = chunk[col].value_counts()
        if col in unique_dct:
            unique_dct[col].append(unique_count)
        else:
            unique_dct[col] = [unique_count]

unique_count = {}
unique_values = {}
for col in unique_dct:
    combined_unique = pd.concat(unique_dct[col])
    final = combined_unique.groupby(combined_unique.index)
    unique_count[col] = len(final)
    unique_values[col] = combined_unique.groupby(combined_unique.index).sum()

unique_df = pd.DataFrame.from_dict(unique_count,orient='index')
unique_df.columns = ['Unique_count']
unique_df['Unique pct'] = round(unique_df['Unique_count'] * 100 / total_row, 2)

data_summary = unique_df.join(total_missing_val)

data_summary.sort_values(['Unique pct', 'pct_missing'])

Unnamed: 0,Unique_count,Unique pct,Missing,pct_missing
company_country_code,2,0.0,1,0.0
funding_round_type,9,0.02,3,0.01
funded_year,20,0.04,3,0.01
company_category_code,43,0.08,643,1.22
company_state_code,50,0.09,492,0.93
investor_state_code,50,0.09,16809,31.79
funded_quarter,72,0.14,3,0.01
investor_country_code,72,0.14,12001,22.7
company_region,546,1.03,1,0.0
investor_city,990,1.87,12480,23.61


In [9]:
# Check value of country and remove the erroneous data

unique_values['company_country_code']

2008-02        1
USA        52868
Name: company_country_code, dtype: int64

In [10]:
# There are 2 values for company_country_code columns: 2008-02 and USA
# We will remove 2008-02

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding='iso-8859-1',usecols=keep_cols, parse_dates=['funded_at'])

country_count = []
for chunk in chunk_iter:
    chunk = chunk[chunk['company_country_code'] != '2008-02']
    country_count.append(chunk['company_country_code'].value_counts())
    
USA_count = pd.concat(country_count).sum()
USA_count

52868

# Convert appropriate columns into category type

In [11]:
# Dict for col type

convert_col_dtypes = {
    "company_country_code": "category", "funding_round_type": "category", 
    "company_category_code": "category", "company_state_code": "category",
    "investor_state_code": "category", "investor_country_code": "category",
    "company_region": "category", "investor_region": "category",
    "company_city": "category", "investor_city": "category"
}

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding='iso-8859-1',
                          usecols=keep_cols, parse_dates=['funded_at'], dtype=convert_col_dtypes)

new_chunk_memory = []
new_chunk_usage = []
new_col_type = {}


for chunk in chunk_iter:
    # Process funded quarter to remove the year, then convert column to category type
    chunk['funded_quarter'] = chunk['funded_quarter'].str[-1:]
    chunk['funded_quarter'] = chunk['funded_quarter'].astype('category')
    
    # Minimize numeric columns memory footprint
    chunk['funded_year'] = pd.to_numeric(chunk['funded_year'], downcast='signed')
    chunk['raised_amount_usd'] = pd.to_numeric(chunk['raised_amount_usd'], downcast='float')
    
    # Calculate new memory footprint
    new_chunk_memory.append((chunk.memory_usage(deep=True,index=False) / (1024 ** 2)))
    new_chunk_usage.append(sum(chunk.memory_usage(deep=True,index=False) / (1024 ** 2))) 
    
    # Determine columns types
    for col in chunk.columns:
        if col not in new_col_type:
            new_col_type[col] = [str(chunk[col].dtypes)]
        else:
            break

col_type_df = pd.DataFrame.from_dict(new_col_type, orient='index')

new_total_chunk = pd.concat(new_chunk_memory)
new_total_chunk_vc = new_total_chunk.groupby(new_total_chunk.index).sum()

print('Memory Footprint by Column')    
print(new_total_chunk_vc.sort_values())
print("\n")
print("Total memory footprint of all of the chunks combined: {} Mb".format(round(new_total_chunk.sum(), 2)))
print('Chunk memory footprint range: {} Mb - {} Mb'.format(round(new_chunk_usage[-1], 2), round(new_chunk_usage[0], 2)))
print(col_type_df)

Memory Footprint by Column
company_country_code     0.051188
funded_quarter           0.054701
funding_round_type       0.059248
investor_country_code    0.079145
investor_state_code      0.079806
company_state_code       0.091649
company_category_code    0.091980
funded_year              0.129452
raised_amount_usd        0.201683
investor_city            0.300412
company_region           0.317361
funded_at                0.403366
company_city             0.624036
company_name             3.424955
investor_name            3.734270
dtype: float64


Total memory footprint of all of the chunks combined: 9.64 Mb
Chunk memory footprint range: 0.49 Mb - 0.92 Mb
                                    0
company_name                   object
company_category_code        category
company_country_code         category
company_state_code           category
company_region               category
company_city                 category
investor_name                  object
investor_country_code        cat

In [12]:
chunk.head(5)

Unnamed: 0,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_name,investor_country_code,investor_state_code,investor_city,funding_round_type,funded_at,funded_quarter,funded_year,raised_amount_usd
50000,NuORDER,fashion,USA,CA,Los Angeles,West Hollywood,Mortimer Singer,,,,series-a,2012-10-01,4,2012,3060000.0
50001,ChaCha,advertising,USA,IN,Indianapolis,Carmel,Morton Meyerson,,,,series-b,2007-10-01,4,2007,12000000.0
50002,Binfire,software,USA,FL,Bocat Raton,Bocat Raton,Moshe Ariel,,,,angel,2008-04-18,2,2008,500000.0
50003,Binfire,software,USA,FL,Bocat Raton,Bocat Raton,Moshe Ariel,,,,angel,2010-01-01,1,2010,750000.0
50004,Unified Color,software,USA,CA,SF Bay,South San Frnacisco,Mr. Andrew Oung,,,,angel,2010-01-01,1,2010,


# Load data into SQLite database

In [13]:
import sqlite3

conn = sqlite3.connect('crunchbase.db')
cur = conn.cursor()
cur.execute('DROP TABLE IF EXISTS crunchbase')

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding='iso-8859-1',
                          usecols=keep_cols, parse_dates=['funded_at'], dtype=convert_col_dtypes)

for chunk in chunk_iter:
    chunk.to_sql("crunchbase", conn, if_exists='append', index=False)
    
results_df = pd.read_sql('PRAGMA table_info(crunchbase);', conn)
print(results_df)

    cid                   name       type  notnull dflt_value  pk
0     0           company_name       TEXT        0       None   0
1     1  company_category_code       TEXT        0       None   0
2     2   company_country_code       TEXT        0       None   0
3     3     company_state_code       TEXT        0       None   0
4     4         company_region       TEXT        0       None   0
5     5           company_city       TEXT        0       None   0
6     6          investor_name       TEXT        0       None   0
7     7  investor_country_code       TEXT        0       None   0
8     8    investor_state_code       TEXT        0       None   0
9     9          investor_city       TEXT        0       None   0
10   10     funding_round_type       TEXT        0       None   0
11   11              funded_at  TIMESTAMP        0       None   0
12   12         funded_quarter       TEXT        0       None   0
13   13            funded_year    INTEGER        0       None   0
14   14   

In [14]:
!wc -c crunchbase.db

7000064 crunchbase.db


In [15]:
print('crunchbase.db uses {} Mb'.format(round(14000128 / (1025 ** 2),2)))

q = 'Select * from crunchbase'
test = pd.read_sql(q, conn)

print(len(test))

crunchbase.db uses 13.33 Mb
52870


# Data Analysis with SQLite3

In [16]:
# What proportion of the total amount of funds did the top 10% raise?

def pct_x(x):
    y = int(len(total_combined_q1) * x / 100 + (len(total_combined_q1) % (100/x) > 0))
    return y

q_1 = 'SELECT company_name, raised_amount_usd from crunchbase'

chunk_iter = pd.read_sql(q_1, conn, chunksize=5000)

q1_lst = []

for chunk in chunk_iter:
    chunk['raised_amount_usd'] = pd.to_numeric(chunk['raised_amount_usd'], downcast='float')
    chunk['raised_amount_usd'].fillna(0, inplace=True)
    q1_lst.append(chunk)
       
combined_q1 = pd.concat(q1_lst)
total_combined_q1 = combined_q1.groupby(combined_q1.company_name).sum().sort_values('raised_amount_usd', ascending=False)
total_combined_q1 = total_combined_q1.reset_index()

# Only analyze companies that raised money, else bottom 10% raised a total of $0
total_combined_q1 = total_combined_q1[total_combined_q1['raised_amount_usd'] > 0]

total_fund = sum(total_combined_q1['raised_amount_usd'])
# Top 10% funding
tier_10 = pct_x(10)
top_10_fund = sum(total_combined_q1['raised_amount_usd'][0:tier_10])

top_10_pct = round(top_10_fund * 100 / total_fund, 2)

# What proportion of the total amount of funds did the bottom 10% raise?

bottom_10_fund = sum(total_combined_q1['raised_amount_usd'][-tier_10:])
bottom_10_pct = round(bottom_10_fund * 100 / total_fund, 2)

print('Total funding: ${:,.2f}'.format(total_fund))
print('\n')
print('Top 10% total funding: ${:,.2f}'.format(top_10_fund))
print("The top 10% raises {}% of the total fund".format(top_10_pct))
print('\n')
print('Bottom 10% total funding: ${:,.2f}'.format(bottom_10_fund))
print("The bottom 10% raises {}% of the total fund".format(bottom_10_pct))

Total funding: $681,732,189,238.00


Top 10% total funding: $440,138,498,032.00
The top 10% raises 64.56% of the total fund


Bottom 10% total funding: $182,454,331.00
The bottom 10% raises 0.03% of the total fund


In [17]:
# What proportion of the total amount of funds did the top 1% raise?

tier_1 = pct_x(1)

top_1_fund = sum(total_combined_q1['raised_amount_usd'][0:tier_1])

top_1_pct = round(top_1_fund * 100 / total_fund, 2)

# What proportion of the total amount of funds did the bottom 10% raise?

bottom_1_fund = sum(total_combined_q1['raised_amount_usd'][-tier_1:])
bottom_1_pct = round(bottom_1_fund * 100 / total_fund, 4)

print('Total funding: ${:,.2f}'.format(total_fund))
print('\n')
print('Top 1% total funding: ${:,.2f}'.format(top_1_fund))
print("The top 1% raises {}% of the total fund".format(top_1_pct))
print('\n')
print('Bottom 1% total funding: ${:,.2f}'.format(bottom_1_fund))
print("The bottom 1% raises {}% of the total fund".format(bottom_1_pct))

# [:,] put a comma separator between thousands of number, while .2f indicates number of decimal

Total funding: $681,732,189,238.00


Top 1% total funding: $171,392,861,248.00
The top 1% raises 25.14% of the total fund


Bottom 1% total funding: $1,501,100.00
The bottom 1% raises 0.0002% of the total fund


In [18]:
# Which category of company attracted the most investments?

q_2 = 'SELECT company_category_code, raised_amount_usd from crunchbase'

chunk_iter = pd.read_sql(q_2, conn, chunksize=5000)

q2_lst = []

for chunk in chunk_iter:
    chunk['raised_amount_usd'] = pd.to_numeric(chunk['raised_amount_usd'], downcast='float')
    chunk['raised_amount_usd'].fillna(0, inplace=True)
    q2_lst.append(chunk)
       
combined_q2 = pd.concat(q2_lst)
total_combined_q2 = combined_q2.groupby(combined_q2.company_category_code).sum().sort_values('raised_amount_usd', ascending=False)

for i in range(0, 10):
    print('{:20s}    ${:>20,.2f}'.format(total_combined_q2.index[i], total_combined_q2['raised_amount_usd'][i]))
    
# [>20] for right-justified with 20 being number of space for data

biotech                 $  110,396,424,192.00
software                $   73,084,518,400.00
mobile                  $   64,777,379,840.00
cleantech               $   52,705,226,752.00
enterprise              $   45,860,925,440.00
web                     $   40,143,265,792.00
medical                 $   25,367,105,536.00
advertising             $   25,076,662,272.00
ecommerce               $   22,567,219,200.00
network_hosting         $   22,419,683,328.00


In [19]:
len(combined_q2)

52870

In [20]:
combined_q2.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52870 entries, 0 to 2869
Data columns (total 2 columns):
company_category_code    52227 non-null object
raised_amount_usd        52870 non-null float32
dtypes: float32(1), object(1)
memory usage: 3.9 MB


In [21]:
# Which investor contributed the most money (across all startups)?

q_3 = 'SELECT investor_name, raised_amount_usd from crunchbase'

chunk_iter = pd.read_sql(q_3, conn, chunksize=5000)

q3_lst = []

for chunk in chunk_iter:
    chunk['raised_amount_usd'] = pd.to_numeric(chunk['raised_amount_usd'], downcast='float')
    chunk['raised_amount_usd'].fillna(0, inplace=True)
    q3_lst.append(chunk)
       
combined_q3 = pd.concat(q3_lst)
total_combined_q3 = combined_q3.groupby(combined_q3.investor_name).sum().sort_values('raised_amount_usd', ascending=False)

for i in range(0, 10):
    print('{:35s}    ${:>20,.2f}'.format(total_combined_q3.index[i], total_combined_q3['raised_amount_usd'][i]))

Kleiner Perkins Caufield & Byers       $   11,217,826,816.00
New Enterprise Associates              $    9,692,541,952.00
Accel Partners                         $    6,472,125,952.00
Goldman Sachs                          $    6,375,458,816.00
Sequoia Capital                        $    6,039,402,496.00
Intel                                  $    5,969,200,128.00
Google                                 $    5,808,799,744.00
Time Warner                            $    5,729,999,872.00
Comcast                                $    5,669,000,192.00
Greylock Partners                      $    4,960,983,040.00


In [22]:
total_combined_q3 = total_combined_q3.reset_index()
total_combined_q3.head(5)

Unnamed: 0,investor_name,raised_amount_usd
0,Kleiner Perkins Caufield & Byers,11217830000.0
1,New Enterprise Associates,9692542000.0
2,Accel Partners,6472126000.0
3,Goldman Sachs,6375459000.0
4,Sequoia Capital,6039402000.0


In [23]:
total_combined_q3.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10465 entries, 0 to 10464
Data columns (total 2 columns):
investor_name        10465 non-null object
raised_amount_usd    10465 non-null float32
dtypes: float32(1), object(1)
memory usage: 788.2 KB


In [24]:
# Which investors contributed the most money per startup?

q_4 = 'SELECT investor_name, company_name from crunchbase'

chunk_iter = pd.read_sql(q_4, conn, chunksize=5000)

q4_lst = []

for chunk in chunk_iter:
    chunk = chunk.drop_duplicates('company_name')
    q4_lst.append(chunk['investor_name'].value_counts())
    
combined_q4 = pd.concat(q4_lst).to_frame().reset_index()

combined_q4.columns = ['investor_name', 'startup_count']

combined_investor = combined_q4.merge(total_combined_q3, on='investor_name').reset_index(drop=True)

combined_investor['amount_per_startup'] = round(combined_investor['raised_amount_usd'] / combined_investor['startup_count'], 2)
combined_investor = combined_investor.sort_values('amount_per_startup', ascending=False).reset_index(drop=True)

# Make a copy of dataframe to select only investors with 5 or more investment

combined_investor2 = combined_investor[combined_investor['startup_count'] >= 5].copy()
combined_investor2 = combined_investor2.sort_values('amount_per_startup', ascending=False).reset_index(drop=True)

In [25]:
# Consider all investors, which one made the most investment per startup?

combined_investor['amount_per_startup'] = combined_investor['amount_per_startup'].map('${:,.2f}'.format)
combined_investor['raised_amount_usd'] = combined_investor['raised_amount_usd'].map('${:,.2f}'.format)
combined_investor.head(10)

Unnamed: 0,investor_name,startup_count,raised_amount_usd,amount_per_startup
0,BrightHouse,1,"$4,700,000,256.00","$4,700,000,256.00"
1,Marlin Equity Partners,1,"$2,600,000,000.00","$2,600,000,000.00"
2,Comcast,3,"$5,669,000,192.00","$1,889,666,730.67"
3,Time Warner,4,"$5,729,999,872.00","$1,432,499,968.00"
4,GI Partners,1,"$1,050,000,000.00","$1,050,000,000.00"
5,Digital Sky Technologies,3,"$2,921,807,104.00","$973,935,701.33"
6,Azure Capital Partners,1,"$777,000,000.00","$777,000,000.00"
7,Siemens PLM Software,1,"$750,000,000.00","$750,000,000.00"
8,Intel,9,"$5,969,200,128.00","$663,244,458.67"
9,Eagle River Holdings,4,"$2,456,999,936.00","$614,249,984.00"


In [26]:
# Consider only investors with 5 or more startup investment, which one invests most per startup?

combined_investor2['amount_per_startup'] = combined_investor2['amount_per_startup'].map('${:,.2f}'.format)
combined_investor2['raised_amount_usd'] = combined_investor2['raised_amount_usd'].map('${:,.2f}'.format)
combined_investor2.head(10)

Unnamed: 0,investor_name,startup_count,raised_amount_usd,amount_per_startup
0,Intel,9,"$5,969,200,128.00","$663,244,458.67"
1,Google,18,"$5,808,799,744.00","$322,711,096.89"
2,T. Rowe Price,8,"$2,231,000,064.00","$278,875,008.00"
3,Silver Lake Partners,6,"$1,202,000,000.00","$200,333,333.33"
4,Morgan Stanley Venture Partners,6,"$1,187,900,032.00","$197,983,338.67"
5,Oak Hill Capital Partners,8,"$1,392,499,968.00","$174,062,496.00"
6,Bank of America,6,"$979,500,032.00","$163,250,005.33"
7,Glynn Capital Management,7,"$1,070,700,032.00","$152,957,147.43"
8,Microsoft,7,"$1,037,500,032.00","$148,214,290.29"
9,Technology Crossover Ventures,17,"$2,510,000,128.00","$147,647,066.35"


In [30]:
# Which funding round was the most popular?

q_5 = 'SELECT funding_round_type from crunchbase'

chunk_iter = pd.read_sql(q_5, conn, chunksize=5000)

q5_lst = []

for chunk in chunk_iter:
    q5_lst.append(chunk['funding_round_type'].value_counts())
    
combined_q5 = pd.concat(q5_lst).to_frame().reset_index()
combined_q5.columns = ['funding_round_type', 'count']
total_combined_q5 = combined_q5.groupby(combined_q5.funding_round_type).sum().sort_values('count', ascending=False)

total_combined_q5

Unnamed: 0_level_0,count
funding_round_type,Unnamed: 1_level_1
series-a,13938
series-c+,10870
angel,8989
venture,8917
series-b,8794
other,964
private-equity,357
post-ipo,33
crowdfunding,5
