In [1]:
### Load relevant packages
import pandas                  as pd
import numpy                   as np
import matplotlib.pyplot       as plt
import seaborn                 as sns
import statsmodels.formula.api as sm
import os
import random

%matplotlib inline
plt.style.use('ggplot')

In [None]:
## Load the raw Crunchbase export into a data frame
df = pd.read_csv(filepath_or_buffer="data/US Crunchbase Companies (2).csv")

In [None]:
## Keep only the companies whose diversity spotlight mentions "Women"
## (the column is cast to str first so that missing entries never match)
women_mask = df['Diversity Spotlight (US Only)'].astype(str).str.contains("Women")
df_women = df[women_mask]
df_women

In [None]:
## Number of women-founded/led companies (row count)
df_women.shape[0]

In [None]:
## Share of all companies that are women-founded/led (a fraction, not a percent)
df_women.shape[0] / df.shape[0]

5.52% of the companies in the dataset are women-founded or women-led.

In [None]:
## Column labels available on the women-founded/led subset
df_women.columns.to_numpy()

In [None]:
## Women-founded/led companies appear in only about 18% of the distinct
## 'Industry Groups' values, so industry is not a good factor to take into account.
## Qualitative interviews with founders and investors validated this assumption, and
## it was advised to use another variable for modeling purposes.
## NOTE(review): .unique() counts NaN as its own value -- confirm that is intended.

women_industry_groups = len(df_women['Industry Groups'].unique())
all_industry_groups = len(df['Industry Groups'].unique())

## Print both counts explicitly: a cell only displays its final expression
print(f"Industry groups with women-founded/led companies: {women_industry_groups}")
print(f"Industry groups overall: {all_industry_groups}")
women_industry_groups / all_industry_groups

In [None]:
## Restrict the data frame to the columns the model needs.
## The variables were chosen based on qualitative interviews.
model_columns = [
    'Total Funding Amount',
    'Estimated Revenue Range',
    'Number of Investments',
    'Number of Exits (IPO)',
]
df_women = df_women.loc[:, model_columns]
df_women

In [None]:
## Inspect the dtype of each remaining column
df_women.dtypes

In [None]:
## Distinct values present in the revenue-range column
pd.unique(df_women['Estimated Revenue Range'])

In [None]:
## Changing column names to what needs to be determined
## The model will look at specific aspects of the columns

## Estimated Revenue will be treated as a binary variable where:
## 1 if the estimated revenue is >= $1M
## 0 otherwise

## A Number of Investments of at least 1 means that the company most likely
## has built a network, which gives it a greater chance of getting funded

## Number of IPO exits shows the founders have experience in reaching the IPO
## stage, which increases their chances of getting funding because of both the
## experience of getting their business to that level and of building a rapport with investors

## Total Funding Amount is our target variable; so that the decision tree classifier
## has to predict it, it will be removed from the input df later

In [None]:
## Rename the feature columns to the binary questions they will encode

binary_feature_names = {
    "Estimated Revenue Range": "Estimated Revenue >= 1M",
    "Number of Investments": "Number of Investments >= 1",
    "Number of Exits (IPO)": "Number of Exits (IPO) >= 1",
}
df_women = df_women.rename(columns=binary_feature_names)
df_women

In [None]:
## Convert the Estimated Revenue column to a binary indicator:
##   0   -> 'Less than $1M'
##   1   -> any other reported revenue range
##   NaN -> revenue range missing (kept missing so it can be imputed later
##          instead of being silently counted as >= $1M)

revenue_col = 'Estimated Revenue >= 1M'
revenue = df_women[revenue_col]

## Also match 0 so that re-running this cell leaves already-converted values unchanged
below_1m = (revenue == 'Less than $1M') | (revenue == 0)
revenue_flag = pd.Series(np.where(below_1m, 0.0, 1.0), index=df_women.index).where(revenue.notna())

## assign() builds a new frame rather than writing into the existing one in place
df_women = df_women.assign(**{revenue_col: revenue_flag})
df_women

In [None]:
## We have a lot of missing values in our df, and simply filling them with the
## column average would misrepresent the data

In [None]:
## Separate the data set into the target variable (Total Funding Amount)
## and the remaining input variables

target_column = 'Total Funding Amount'
target = df_women[target_column]
df_women_inputs = df_women.drop(columns=[target_column])

df_women_inputs

In [None]:
## dtype of the target series
target.dtype

In [None]:
## The model needs numeric inputs, so cast every input column to float
df_women_inputs = df_women_inputs.astype("float64")

print(df_women_inputs.dtypes)

In [None]:
## Count the missing values per input column before filling them in
df_women_inputs.isnull().sum()

In [None]:
## KNN Imputation
## Split the data into training and testing sets, then fit a KNN imputer on the
## training split only.
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

## Full feature matrix and target; train_test_split needs both names defined
X = df_women_inputs
y = target

## Fixed random_state so the 70/30 split is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## add_indicator=True appends missing-indicator columns to the transformed output
knn = KNNImputer(n_neighbors=5, add_indicator=True)
knn.fit(X_train)

## NOTE(review): wrapping the transformed array this way yields default integer
## column labels and a fresh RangeIndex -- confirm downstream cells expect that.
X_train_imputed = pd.DataFrame(knn.transform(X_train))