In [1]:
import Preprocessing as pp
import pickle
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVC
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import SelectPercentile, SelectFromModel
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import seaborn as sns

## Preprocessing and saving of preprocessed datasets

The preprocessing details are in the separate ``preprocessing.py`` script. To run, this notebook requires ``company_data.json`` and ``jobs_data.json`` to be present in a folder named ``data``.

In [8]:
# Build the full preprocessed dataset via the project's Preprocessing module.
# NOTE(review): presumably this reads the JSON files from the `data` folder — confirm in Preprocessing.
dataset = pp.get_final_dataset()
# NOTE(review): presumably writes the `dataset_train` / `dataset_test` pickles that the
# following cells load — confirm in Preprocessing.
pp.split_and_save(dataset)

In [9]:
# Load the pickled training split.
# Only unpickle files produced by this project: pickle.load can execute arbitrary code.
with open("dataset_train", "rb") as train_file:
    train_data = pickle.load(train_file)

In [10]:
# Load the pickled held-out test split.
# Only unpickle files produced by this project: pickle.load can execute arbitrary code.
with open("dataset_test", "rb") as test_file:
    test_data = pickle.load(test_file)

## Splitting for different prediction tasks

This section separates the dataset into the X (features) and Y (target) sets required by each of the attempted prediction tasks.

In [11]:
def split_X_and_Y(dataset, prediction_task='last_funding_regression'):
    """
    Build the feature matrix X and the target Y for one of the prediction tasks.

    Every column whose name starts with 'last_funding' or 'growth_stage' is a
    potential target, so all of them are removed from X regardless of the task
    that is requested.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Preprocessed dataset containing both feature and target columns.
    prediction_task : str
        'last_funding_regression' -- regression task that predicts how much funding
            a startup will get; Y is the 'last_funding' column.
        'funding_stage_classification' -- predicts the type of funding round the
            startup will get (seed, series A, etc.); Y is every column starting
            with 'last_funding_round_round'.
        'growth_stage_classification' -- predicts what growth stage the startup is
            in; Y is every column starting with 'growth_stage'.

    Returns
    -------
    tuple
        (X, Y): X is a DataFrame without any target column; Y is a Series for the
        regression task and a DataFrame for the two classification tasks.

    Raises
    ------
    ValueError
        If `prediction_task` is not one of the three supported task names.
    """
    supported_tasks = (
        'last_funding_regression',
        'funding_stage_classification',
        'growth_stage_classification',
    )
    # Fail early with a clear message; otherwise an unknown task would only
    # surface as an unbound `Y` at the return statement.
    if prediction_task not in supported_tasks:
        raise ValueError(
            "Unknown prediction_task %r; expected one of %s"
            % (prediction_task, ', '.join(repr(task) for task in supported_tasks))
        )

    def columns_with_prefix(prefix):
        # Column names of `dataset` that start with `prefix`, in their original order.
        return [col for col in dataset.columns if col.startswith(prefix)]

    last_funding_features = columns_with_prefix('last_funding')
    growth_stage_features = columns_with_prefix('growth_stage')

    # Drop every target-like column so that no task can leak another task's label into X.
    X = dataset.drop(last_funding_features + growth_stage_features, axis=1)

    if prediction_task == 'last_funding_regression':
        Y = dataset['last_funding']
    elif prediction_task == 'funding_stage_classification':
        Y = dataset[columns_with_prefix('last_funding_round_round')]
    else:  # 'growth_stage_classification'
        Y = dataset[growth_stage_features]

    return X, Y

In [12]:
# Features and one-hot funding-round labels for the classification task.
X_train, Y_train = split_X_and_Y(train_data, prediction_task='funding_stage_classification')

In [13]:
# Features and the `last_funding` target for the regression task.
X_train_regression, Y_train_regression = split_X_and_Y(
    train_data, prediction_task='last_funding_regression'
)

In [14]:
# Keep only the label columns whose column sum exceeds 100 in the training
# split, i.e. drop the uncommon classes.
class_counts = Y_train.sum()
common_classes = Y_train.columns[class_counts > 100]
Y_train = Y_train[common_classes]

## Removing outliers and feature selection

For the classification task, uncommon classes were removed. 
For the regression task, companies with over €10 million in funding were removed.

Features were selected from a combined analysis of the performances of the two predictive models and saved to a separate file imported below.

In [15]:
# Keep only the rows that carry exactly one of the remaining class labels.
# The mask is computed once, from the labels, and applied to both frames so they stay aligned.
has_single_label = Y_train.sum(axis=1) == 1
X_train = X_train[has_single_label]
Y_train = Y_train[has_single_label]

In [16]:
# Outlier removal: keep only rows whose funding target is below 10.
# Rows with a missing target are dropped as well, because NaN < 10 evaluates to False.
below_funding_cap = Y_train_regression < 10
X_train_regression = X_train_regression[below_funding_cap]
Y_train_regression = Y_train_regression[below_funding_cap]

In [17]:
# Load the previously selected features from the `features` pickle.
# Only unpickle files produced by this project: pickle.load can execute arbitrary code.
with open('features', 'rb') as features_file:
    features = pickle.load(features_file)

In [18]:
# Restrict both training feature matrices to the selected features.
# NOTE(review): assumes `features` is a list of column labels present in both frames — confirm
# against whatever wrote the `features` pickle.
x = X_train[features]
x_regression = X_train_regression[features]

## Defining the models

In both cases, Random Forests turned out to give the best predictions. Each model is wrapped in a grid search to find the optimal hyperparameters.

In [19]:
# Fixed seed so that the forests, and therefore the cross-validation and test
# scores reported in this notebook, are reproducible on a fresh run.
RANDOM_STATE = 42

# Missing values are filled in by SimpleImputer with its default settings.
imputer = SimpleImputer()
randomforest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=RANDOM_STATE)
randomforestregression = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=RANDOM_STATE)

# Both pipelines are: impute -> random forest. Despite its name, the
# classification pipeline contains no feature-selection step.
impute_select_randomforest_pipe = make_pipeline(imputer, randomforest)
impute_randomforest_regression_pipe = make_pipeline(imputer, randomforestregression)

In [20]:
# Hyperparameter grids. Parameter keys use the step names that make_pipeline
# derives from the estimator class names (lower-cased).
gridsearch = GridSearchCV(impute_select_randomforest_pipe, 
                          {'randomforestclassifier__min_samples_split':[2,3,4,5],
                           'randomforestclassifier__max_features': [None],
                          }, cv=3)

# 'auto' was dropped from the max_features candidates: for RandomForestRegressor
# it meant "use all features", which is exactly what None does, so it only
# duplicated work — and scikit-learn >= 1.3 rejects the value outright.
gridsearch_regression = GridSearchCV(impute_randomforest_regression_pipe, 
                                     {'randomforestregressor__min_samples_split':[2,3,4,5],
                                      'randomforestregressor__max_features': [None, 0.5]
                                     }, cv=3)

In [None]:
# Run both grid searches (3-fold cross-validation over the grids defined for each search).
gridsearch.fit(X=x, y=Y_train)
gridsearch_regression.fit(X=x_regression, y=Y_train_regression)

In [22]:
# Best cross-validated score of each search: classification first, then regression.
print(gridsearch.best_score_, gridsearch_regression.best_score_, sep="\n")

0.6630067128063694
0.8824849452804691


## Evaluating models on test set

In [23]:
# Build the test-set features/targets for both tasks, mirroring the training splits.
X_test, Y_test = split_X_and_Y(test_data, prediction_task='funding_stage_classification')
X_test_regression, Y_test_regression = split_X_and_Y(
    test_data, prediction_task='last_funding_regression'
)

In [24]:
# Restrict the test labels to the classes kept during training, then keep only
# the rows that carry exactly one of those labels.
Y_test = Y_test[Y_train.columns]
has_single_label = Y_test.sum(axis=1) == 1
X_test = X_test[has_single_label]
Y_test = Y_test[has_single_label]

In [25]:
# Apply the same filtering as for the training set: drop rows without a funding
# target, then drop rows whose target is not below 10.
has_target = Y_test_regression.notna()
below_funding_cap = Y_test_regression < 10
keep_rows = has_target & below_funding_cap

X_test_regression = X_test_regression[keep_rows]
Y_test_regression = Y_test_regression[keep_rows]

In [26]:
# Score the refit best classification pipeline on the held-out test rows,
# using the same selected features as in training.
x_test = X_test[features]
gridsearch.score(x_test, Y_test)

0.6609745939192003

In [27]:
# Score the refit best regression pipeline on the held-out test rows,
# using the same selected features as in training.
x_test_regression = X_test_regression[features]
gridsearch_regression.score(x_test_regression, Y_test_regression)

0.8846728461676138

## Saving models for use in demo

``demo.py`` implements the live demo; it loads the fitted models from the ``models`` file written by the cell below.

In [28]:
# Persist both fitted grid searches together as one (classification, regression) tuple.
fitted_models = (gridsearch, gridsearch_regression)
with open("models", "wb") as models_file:
    pickle.dump(fitted_models, models_file)