In [1]:
### Load relevant packages
import pandas                  as pd
import numpy                   as np
import matplotlib.pyplot       as plt
import seaborn                 as sns
import statsmodels.formula.api as sm
import os
import random

%matplotlib inline
plt.style.use('ggplot')

In [2]:
## Creating Data Frame
df = pd.read_csv("data/US Crunchbase Companies (2).csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
## Dataframe of companies that are women-founded/led
df_women = df[df['Diversity Spotlight (US Only)'].astype(str).str.contains("Women")]
df_women

Unnamed: 0,Index,Organization Name,Organization Name URL,Industries,Headquarters Location,Description,CB Rank (Company),Total Funding Amount,Total Funding Amount Currency,Total Funding Amount Currency (in USD),...,Apptopia - Number of Apps,Apptopia - Downloads Last 30 Days,G2 Stack - Total Products Active,IPqwery - Patents Granted,IPqwery - Trademarks Registered,IPqwery - Most Popular Trademark Class,IPqwery - Most Popular Patent Class,Aberdeen - IT Spend,Aberdeen - IT Spend Currency,Aberdeen - IT Spend Currency (in USD)
63,64,A Bride By Samantha,https://www.crunchbase.com/organization/a-brid...,"Fashion, Wedding","Los Angeles, California, United States",A Bride By Samantha provides wedding dresses.,650937,,,,...,,,,,,,,,,
84,85,a COUPLE of GURUS,https://www.crunchbase.com/organization/a-coup...,"Consulting, Cyber Security, Information Techno...","Minneapolis, Minnesota, United States",a COUPLE of GURUS is a Managed IT Services Pro...,374005,,,,...,,,31.0,,,,,108021,USD,108021
87,88,A Curated World,https://www.crunchbase.com/organization/a-cura...,"E-Commerce, Home Decor, Interior Design","Providence, Rhode Island, United States",A Curated World offers online shopping for han...,135822,50000.0,USD,50000.0,...,,,18.0,,,,,,,
105,106,A Entertainment,https://www.crunchbase.com/organization/a-ente...,"Film, Film Distribution, Media and Entertainme...","Atlanta, Georgia, United States",Film,975375,,,,...,,,,,,,,,,
106,107,A Family First Community Services,https://www.crunchbase.com/organization/a-fami...,"Communities, Health Care, Service Industry","Atlanta, Georgia, United States",A Family First Community Services providesÂ ou...,139946,500000.0,USD,500000.0,...,,,3.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108896,108897,1226 Digital LLC,https://www.crunchbase.com/organization/1226-d...,"Advertising, E-Commerce, Mobile, Social Media,...","San Francisco, California, United States",Agency helping retailers and e-commerce brands...,617694,,,,...,,,5.0,,,,,,,
108924,108925,123,https://www.crunchbase.com/organization/new-pe...,,"Bomont, West Virginia, United States",123,670686,,,,...,,,,,,,,,,
108951,108952,100th Monkey Media,https://www.crunchbase.com/organization/100th-...,"Advertising, Consulting, Internet, Marketing, ...","Hillsboro, Missouri, United States",100th Monkey Media services clients nationwide.,720819,,,,...,,,18.0,,,,,,,
108962,108963,123 FRUTTY,https://www.crunchbase.com/organization/123-fr...,"Apps, Brand Marketing, Education","Orlando, Florida, United States","123 Frutty is a fashion brand for babies, todd...",746079,,,,...,,,9.0,,,,,,,


In [4]:
len(df_women)

6019

In [5]:
## Percentage of women-founded companies 
len(df_women)/len(df)

0.05519840796749906

5.52% of companies are women-founded or women-led

In [6]:
df_women.columns.values

array(['Index', 'Organization Name', 'Organization Name URL',
       'Industries', 'Headquarters Location', 'Description',
       'CB Rank (Company)', 'Total Funding Amount',
       'Total Funding Amount Currency',
       'Total Funding Amount Currency (in USD)', 'Founded Date',
       'Founded Date Precision', 'Headquarters Regions',
       'Diversity Spotlight (US Only)', 'Estimated Revenue Range',
       'Operating Status', 'Full Description', 'Hub Tags',
       'Number of Articles', 'Phone Number', 'Contact Email', 'LinkedIn',
       'Facebook', 'Twitter', 'Website', 'Company Type', 'Closed Date',
       'Closed Date Precision', 'Exit Date', 'Exit Date Precision',
       'Investor Type', 'Investment Stage',
       'Number of Portfolio Organizations', 'Number of Investments',
       'Number of Lead Investments', 'Number of Diversity Investments',
       'Number of Exits', 'Number of Exits (IPO)',
       'Accelerator Program Type', 'Accelerator Application Deadline',
       'Accelera

In [7]:
## Women businesses are only involved in 18% industries so it's not a good factor to take into account
## Qualitative interviews with founders and investors validated this assumption it was advised to use another variable
## for modeling purposes

len(df_women['Industry Groups'].unique())
len(df['Industry Groups'].unique())
len(df_women['Industry Groups'].unique())/len(df['Industry Groups'].unique())

0.18312597200622083

In [8]:
## Cleaning data frame to only include columns needed for the model
## Variables were determined by conducting qualitative interviews
df_women = df_women[['Total Funding Amount', 'Estimated Revenue Range', 'Number of Investments','Number of Exits (IPO)']]
df_women

Unnamed: 0,Total Funding Amount,Estimated Revenue Range,Number of Investments,Number of Exits (IPO)
63,,,,
84,,$1M to $10M,,
87,50000.0,Less than $1M,,
105,,,,
106,500000.0,$10M to $50M,,
...,...,...,...,...
108896,,Less than $1M,,
108924,,,,
108951,,Less than $1M,,
108962,,Less than $1M,,


In [9]:
df_women.dtypes

Total Funding Amount       float64
Estimated Revenue Range     object
Number of Investments       object
Number of Exits (IPO)      float64
dtype: object

In [10]:
df_women['Estimated Revenue Range'].unique()

array([nan, '$1M to $10M', 'Less than $1M', '$10M to $50M', '$1B to $10B',
       '$100M to $500M', '$50M to $100M', '$500M to $1B', '$10B+'],
      dtype=object)

In [11]:
## Changing column names to what needs to be determined
## The model will look at specific aspects of the columns

## Estimated Revenue will be treated as a binary variable where:
## 1 if revenue estimation >= 1
## 0 otherwise

## Number of Investments if greater than 1 means that the company most likely
## has a network built which gives them a greater chance of getting funded

## Number of IPO exits shows the founders have experience in reaching the IPO 
## stage which increases their chances of getting a funding because of both the 
## experience of getting their business to that level and of building a repo with investors

## Total Funding Amount is our target variable and in order for the decision tree classifier
## to be able to predict the result, this will not be included later in the df

In [12]:
## Renaming columns

df_women = df_women.rename(columns = {"Estimated Revenue Range": "Estimated Revenue >= 1M",
                                             "Number of Investments": "Number of Investments >= 1",
                                             "Number of Exits (IPO)": "Number of Exits (IPO) >= 1"})
df_women

Unnamed: 0,Total Funding Amount,Estimated Revenue >= 1M,Number of Investments >= 1,Number of Exits (IPO) >= 1
63,,,,
84,,$1M to $10M,,
87,50000.0,Less than $1M,,
105,,,,
106,500000.0,$10M to $50M,,
...,...,...,...,...
108896,,Less than $1M,,
108924,,,,
108951,,Less than $1M,,
108962,,Less than $1M,,


In [13]:
## Changing Estimated Revenue column to binary. 
## 1 is >=1, 0 otherwise

df_women.loc[(df_women['Estimated Revenue >= 1M'] == 'Less than $1M'),'Estimated Revenue >= 1M']=0
df_women.loc[~(df_women['Estimated Revenue >= 1M'] == 0),'Estimated Revenue >= 1M']=1
df_women

Unnamed: 0,Total Funding Amount,Estimated Revenue >= 1M,Number of Investments >= 1,Number of Exits (IPO) >= 1
63,,1,,
84,,1,,
87,50000.0,0,,
105,,1,,
106,500000.0,1,,
...,...,...,...,...
108896,,0,,
108924,,1,,
108951,,0,,
108962,,0,,


In [14]:
## We have a lot of missing values in our df and simply taking the average of the values
## would misinterpret the data

In [15]:
## assign Total Funding Amount as the target variable and divide the data set into
## target and input variables

target = df_women['Total Funding Amount']
df_women_inputs = df_women.drop('Total Funding Amount',axis='columns')

df_women_inputs

Unnamed: 0,Estimated Revenue >= 1M,Number of Investments >= 1,Number of Exits (IPO) >= 1
63,1,,
84,1,,
87,0,,
105,1,,
106,1,,
...,...,...,...
108896,0,,
108924,1,,
108951,0,,
108962,0,,


In [16]:
## Creating low,medium,high values for total funding amount

low_value = target.dropna().astype('float').quantile(0.25)
medium_value = target.dropna().astype('float').quantile(0.5)
high_value = target.dropna().astype('float').quantile(0.75)
print("The 25% quartile of non-empty total funding amount:", low_value)
print("The 50% quartile of non-empty total funding amount:", medium_value)
print("The 75% quartile of non-empty total funding amount:", high_value)

The 25% quartile of non-empty total funding amount: 400000.0
The 50% quartile of non-empty total funding amount: 2200000.0
The 75% quartile of non-empty total funding amount: 10500000.0


In [17]:
## need to convert everything to numbers for the model
df_women_inputs = df_women_inputs.astype(float)

print(df_women_inputs.dtypes)

Estimated Revenue >= 1M       float64
Number of Investments >= 1    float64
Number of Exits (IPO) >= 1    float64
dtype: object


In [18]:
## Filling in the missing values
df_women_inputs.isna().sum()

Estimated Revenue >= 1M          0
Number of Investments >= 1    5942
Number of Exits (IPO) >= 1    5992
dtype: int64

In [19]:
## KNN Imputation
## Split data into training and testing data
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

X = df_women_inputs
y = target

## Splitting into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)
knn = KNNImputer(n_neighbors=5,add_indicator=False)

## Imputing X train and test
X_train_imputed = pd.DataFrame(knn.fit_transform(X_train))
X_test_imputed = pd.DataFrame(knn.fit_transform(X_test))

In [20]:
## imputing y
## reshape since data has a single feature

y_train = np.array(y_train).reshape(-1,1)
y_train_imputed = pd.DataFrame(knn.fit_transform(y_train))

y_test = np.array(y_test).reshape(-1,1)
y_test_imputed = pd.DataFrame(knn.fit_transform(y_test))

In [21]:
#X_train_imputed
#X_test_imputed
#y_train_imputed
#y_test_imputed

In [22]:
y_train_imputed = y_train_imputed[0]
y_test_imputed = y_test_imputed[0]

In [23]:
## Low,medium,high for Funding Amount
## in training set
for i in range(len(y_train_imputed)):
  if(y_train_imputed[i] <= low_value):
    y_train_imputed[i] = "low"
  elif(y_train_imputed[i] <= medium_value):
    y_train_imputed[i] = "moderate"
  else:
    y_train_imputed[i] = "high"

In [24]:
for i in range(len(y_test_imputed)):
  if(y_test_imputed[i] <= low_value):
    y_test_imputed[i] = "low"
  elif(y_test_imputed[i] <= medium_value):
    y_test_imputed[i] = "moderate"
  else:
    y_test_imputed[i] = "high"

In [25]:
pd.Series(y_train_imputed).value_counts()

high        3393
moderate     412
low          408
Name: 0, dtype: int64

In [29]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth = 2)
rf = rf.fit(X_train_imputed, y_train_imputed)
score = rf.score(X_test_imputed, y_test_imputed)
print(score)

0.7967884828349945
