In [27]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # Gaussian Naive Bayes
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier #stochastic gradient descent
from sklearn.tree import DecisionTreeClassifier

In [28]:
# Load the raw input table; the file is read as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    'input_data.csv',
    encoding="ISO-8859-1",
)

In [29]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,Full Name,Primary Company,Founding Time,Previous startups?,Consulting before start-up,Undergraduate College,Standardized University,Education (Type of degree/major),Standardized Major,Degree Type,...,Standardized Graduate Institution,Graduate Studies,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,# Years of employment (non-founder),Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?
0,Chad Hurley,YouTube,2005.0,0.0,0.0,Indiana University of Pennsylvania,Indiana University of Pennsylvania,Fine Art,Design,,...,,,,,0.0,4.0,1.0,1.0,0.0,0.0
1,Gwyneth Paltrow,Goop Inc.,2008.0,0.0,0.0,,,,,,...,,,,,0.0,12.0,1.0,0.0,0.0,0.0
2,Jason Calacanis,Inside.com,2007.0,3.0,0.0,Fordham University,Fordham University,Psychology,Psychology,BA,...,,,,,0.0,4.0,1.0,0.0,0.0,0.0
3,Tony Fadell,Nest Labs,2010.0,1.0,0.0,University of Michigan,University of Michigan,Computer Engineering,Computer Science,BS,...,,,,,0.0,18.0,1.0,0.0,0.0,0.0
4,Matt Mullenweg,Automattic,2005.0,1.0,0.0,University of Houston,University of Houston,,,,...,,,,,0.0,15.0,1.0,0.0,0.0,0.0


In [30]:
# List every column label of the loaded table (the head() preview elides the middle columns).
df.columns

Index(['Full Name', 'Primary Company', 'Founding Time', 'Previous startups?',
       'Consulting before start-up', 'Undergraduate College',
       'Standardized University', 'Education (Type of degree/major)',
       'Standardized Major', 'Degree Type', 'Graduate Degree Time',
       'Graduate Institution', 'Standardized Graduate Institution',
       'Graduate Studies', 'Standardized Graduate Studies', 'Graduate Diploma',
       'Ever served as TA/Teacher/Professor/Mentor?',
       '# Years of employment (non-founder)',
       'Worked as product manager/director/head/VP?', 'Worked at Google?',
       'Worked at Microsoft?', 'Worked in Sales?'],
      dtype='object')

In [31]:
# create Ivy League column
def ivy_league(row):
    """Return 1 if the row's undergraduate school is an Ivy League member, else 0.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide a ``'Standardized University'`` entry.

    Returns
    -------
    int
        1 when ``row['Standardized University']`` exactly equals one of the
        eight Ivy League names below, 0 otherwise (including missing values).
    """
    # All eight Ivy League members. 'University of Pennsylvania' was absent
    # from the original list, so Penn graduates were silently flagged 0.
    # NOTE(review): assumes the standardized spelling is exactly
    # 'University of Pennsylvania' -- confirm against the data.
    ivies = (
        'Harvard University',
        'Yale University',
        'Princeton University',
        'Columbia University',
        'Cornell University',
        'Dartmouth College',
        'Brown University',
        'University of Pennsylvania',
    )
    # Exact-match membership test: 'Indiana University of Pennsylvania'
    # does not match and NaN never compares equal, so both yield 0.
    if row['Standardized University'] in ivies:
        return 1
    return 0

In [32]:
def stanford_berkeley(row):
    """Return 1 if the row's 'Standardized University' is Stanford or UC Berkeley, else 0.

    The comparison is an exact string match against the two names below, so a
    missing value or any other spelling yields 0.
    """
    school = row['Standardized University']
    is_match = school in ['Stanford University', 'University of California Berkeley']
    return 1 if is_match else 0

In [33]:
# Inspect the row labelled 675. The next cell discards everything from
# position 676 onward, so (with read_csv's default 0..n-1 index) this is the
# last row that is kept.
df.loc[675]

Full Name                                                                       Craig H. Misrach
Primary Company                                                                 Freedom Meditech
Founding Time                                                                               2006
Previous startups?                                                                             0
Consulting before start-up                                                                     1
Undergraduate College                                             Indiana University Bloomington
Standardized University                                           Indiana University Bloomington
Education (Type of degree/major)                                            Accounting & Finance
Standardized Major                                                                      Business
Degree Type                                                                                   BS
Graduate Degree Time          

In [34]:
# Keep only the first 676 rows (positions 0-675); every later row is dropped.
# NOTE(review): 676 is a hard-coded cutoff -- presumably the rows after it are
# not real records; confirm against input_data.csv.
df = df.drop(index=df.index[676:])

In [35]:
# Check the last five rows to confirm the table now ends at row 675.
df.tail(n=5)

Unnamed: 0,Full Name,Primary Company,Founding Time,Previous startups?,Consulting before start-up,Undergraduate College,Standardized University,Education (Type of degree/major),Standardized Major,Degree Type,...,Standardized Graduate Institution,Graduate Studies,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,# Years of employment (non-founder),Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?
671,Prat Moghe,Cazena,2013.0,2.0,0.0,"University of California, Los Angeles",University of California Los Angeles,Electrical Engineering,Electrical Engineering,PhD,...,,,,,0.0,7.0,1.0,0.0,0.0,0.0
672,Brian Spector,CertiVox,2009.0,0.0,0.0,Arizona State University,Arizona State University,Political Science and Government,Political Science,BS,...,,,,,0.0,14.0,0.0,0.0,0.0,1.0
673,Nish Kukadia,SecretSales,2007.0,0.0,0.0,The University of Manchester,University of Manchester,Management & Marketing,Business,BS,...,,Finance,Business,,0.0,3.0,0.0,0.0,0.0,0.0
674,Jesse Rothstein,ExtraHop Networks,2007.0,0.0,0.0,Rice University,Rice University,,,,...,,,,,0.0,10.0,0.0,0.0,0.0,0.0
675,Craig H. Misrach,Freedom Meditech,2006.0,0.0,1.0,Indiana University Bloomington,Indiana University Bloomington,Accounting & Finance,Business,BS,...,,Business,Business,MBA,0.0,5.0,0.0,0.0,0.0,0.0


In [36]:
# Add a 0/1 'Stanford or Berkeley' column by running stanford_berkeley on each row.
df['Stanford or Berkeley'] = df.apply(stanford_berkeley, axis='columns')
df.head(n=5)

Unnamed: 0,Full Name,Primary Company,Founding Time,Previous startups?,Consulting before start-up,Undergraduate College,Standardized University,Education (Type of degree/major),Standardized Major,Degree Type,...,Graduate Studies,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,# Years of employment (non-founder),Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley
0,Chad Hurley,YouTube,2005.0,0.0,0.0,Indiana University of Pennsylvania,Indiana University of Pennsylvania,Fine Art,Design,,...,,,,0.0,4.0,1.0,1.0,0.0,0.0,0
1,Gwyneth Paltrow,Goop Inc.,2008.0,0.0,0.0,,,,,,...,,,,0.0,12.0,1.0,0.0,0.0,0.0,0
2,Jason Calacanis,Inside.com,2007.0,3.0,0.0,Fordham University,Fordham University,Psychology,Psychology,BA,...,,,,0.0,4.0,1.0,0.0,0.0,0.0,0
3,Tony Fadell,Nest Labs,2010.0,1.0,0.0,University of Michigan,University of Michigan,Computer Engineering,Computer Science,BS,...,,,,0.0,18.0,1.0,0.0,0.0,0.0,0
4,Matt Mullenweg,Automattic,2005.0,1.0,0.0,University of Houston,University of Houston,,,,...,,,,0.0,15.0,1.0,0.0,0.0,0.0,0


In [37]:
# Add a 0/1 'Ivy League' column by running ivy_league on each row.
df['Ivy League'] = df.apply(ivy_league, axis='columns')
df.head(n=5)

Unnamed: 0,Full Name,Primary Company,Founding Time,Previous startups?,Consulting before start-up,Undergraduate College,Standardized University,Education (Type of degree/major),Standardized Major,Degree Type,...,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,# Years of employment (non-founder),Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League
0,Chad Hurley,YouTube,2005.0,0.0,0.0,Indiana University of Pennsylvania,Indiana University of Pennsylvania,Fine Art,Design,,...,,,0.0,4.0,1.0,1.0,0.0,0.0,0,0
1,Gwyneth Paltrow,Goop Inc.,2008.0,0.0,0.0,,,,,,...,,,0.0,12.0,1.0,0.0,0.0,0.0,0,0
2,Jason Calacanis,Inside.com,2007.0,3.0,0.0,Fordham University,Fordham University,Psychology,Psychology,BA,...,,,0.0,4.0,1.0,0.0,0.0,0.0,0,0
3,Tony Fadell,Nest Labs,2010.0,1.0,0.0,University of Michigan,University of Michigan,Computer Engineering,Computer Science,BS,...,,,0.0,18.0,1.0,0.0,0.0,0.0,0,0
4,Matt Mullenweg,Automattic,2005.0,1.0,0.0,University of Houston,University of Houston,,,,...,,,0.0,15.0,1.0,0.0,0.0,0.0,0,0


In [38]:
# Preview the table with both flag columns present.
# NOTE(review): df is unchanged since the previous cell displayed it, so this
# output repeats that one.
df.head(n=5)

Unnamed: 0,Full Name,Primary Company,Founding Time,Previous startups?,Consulting before start-up,Undergraduate College,Standardized University,Education (Type of degree/major),Standardized Major,Degree Type,...,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,# Years of employment (non-founder),Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League
0,Chad Hurley,YouTube,2005.0,0.0,0.0,Indiana University of Pennsylvania,Indiana University of Pennsylvania,Fine Art,Design,,...,,,0.0,4.0,1.0,1.0,0.0,0.0,0,0
1,Gwyneth Paltrow,Goop Inc.,2008.0,0.0,0.0,,,,,,...,,,0.0,12.0,1.0,0.0,0.0,0.0,0,0
2,Jason Calacanis,Inside.com,2007.0,3.0,0.0,Fordham University,Fordham University,Psychology,Psychology,BA,...,,,0.0,4.0,1.0,0.0,0.0,0.0,0,0
3,Tony Fadell,Nest Labs,2010.0,1.0,0.0,University of Michigan,University of Michigan,Computer Engineering,Computer Science,BS,...,,,0.0,18.0,1.0,0.0,0.0,0.0,0,0
4,Matt Mullenweg,Automattic,2005.0,1.0,0.0,University of Houston,University of Houston,,,,...,,,0.0,15.0,1.0,0.0,0.0,0.0,0,0


In [39]:
# Remove the founding/graduation date columns and the raw education columns.
# The four raw education columns each have a 'Standardized ...' counterpart
# that stays in the table.
# Rebinds df instead of using inplace=True, and names the axis via `columns=`.
df = df.drop(columns=[
    "Founding Time",
    "Undergraduate College",
    "Education (Type of degree/major)",
    "Graduate Institution",
    "Graduate Studies",
    "Graduate Degree Time",
])

In [40]:
# Preview the table after the column drop.
df.head(n=5)

Unnamed: 0,Full Name,Primary Company,Previous startups?,Consulting before start-up,Standardized University,Standardized Major,Degree Type,Standardized Graduate Institution,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,# Years of employment (non-founder),Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League
0,Chad Hurley,YouTube,0.0,0.0,Indiana University of Pennsylvania,Design,,,,,0.0,4.0,1.0,1.0,0.0,0.0,0,0
1,Gwyneth Paltrow,Goop Inc.,0.0,0.0,,,,,,,0.0,12.0,1.0,0.0,0.0,0.0,0,0
2,Jason Calacanis,Inside.com,3.0,0.0,Fordham University,Psychology,BA,,,,0.0,4.0,1.0,0.0,0.0,0.0,0,0
3,Tony Fadell,Nest Labs,1.0,0.0,University of Michigan,Computer Science,BS,,,,0.0,18.0,1.0,0.0,0.0,0.0,0,0
4,Matt Mullenweg,Automattic,1.0,0.0,University of Houston,,,,,,0.0,15.0,1.0,0.0,0.0,0.0,0,0


In [41]:
# Attach the columns from links.csv (the Crunchbase and LinkedIn URLs shown in
# the output) to each row, matching on 'Full Name'.
# NOTE(review): this is pandas' default inner join, so rows whose name has no
# match in links.csv are dropped and duplicated names multiply rows.
links = pd.read_csv('links.csv', encoding="ISO-8859-1")
df = df.merge(links, on='Full Name')
df.head(n=5)

Unnamed: 0,Full Name,Primary Company,Previous startups?,Consulting before start-up,Standardized University,Standardized Major,Degree Type,Standardized Graduate Institution,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,# Years of employment (non-founder),Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League,Crunchbase,LinkedIn
0,Chad Hurley,YouTube,0.0,0.0,Indiana University of Pennsylvania,Design,,,,,0.0,4.0,1.0,1.0,0.0,0.0,0,0,https://www.crunchbase.com/person/chad-hurley,http://www.linkedin.com/profile/view?id=5711
1,Gwyneth Paltrow,Goop Inc.,0.0,0.0,,,,,,,0.0,12.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/gwyneth-paltrow,https://www.linkedin.com/in/gwyneth-paltrow-48...
2,Jason Calacanis,Inside.com,3.0,0.0,Fordham University,Psychology,BA,,,,0.0,4.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/jason-calacanis,http://www.linkedin.com/in/jasoncalacanis
3,Tony Fadell,Nest Labs,1.0,0.0,University of Michigan,Computer Science,BS,,,,0.0,18.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/tony-fadell,http://www.linkedin.com/pub/tony-fadell/0/1/380
4,Matt Mullenweg,Automattic,1.0,0.0,University of Houston,,,,,,0.0,15.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/matt-mullenweg,http://www.linkedin.com/in/mattm


In [42]:
# Give the employment-years column a shorter name, then discard every row
# where that value is missing.
df = (
    df
    .rename(columns={'# Years of employment (non-founder)': 'Years of Employment'})
    .dropna(subset=['Years of Employment'])
)

In [43]:
# Replace missing values with "" in the non-numeric (text) columns only.
# A blanket df.fillna("") also wrote empty strings into numeric columns that
# contained NaN, turning them into mixed str/number object columns.
# Numeric NaNs are deliberately left as NaN here; to_csv still writes them as
# empty fields.
# NOTE(review): decide on an explicit numeric fill value if the model needs one.
text_columns = df.select_dtypes(exclude='number').columns
df = df.fillna({column: "" for column in text_columns})

In [44]:
# Preview the first five rows after missing values were filled.
df.head(n=5)

Unnamed: 0,Full Name,Primary Company,Previous startups?,Consulting before start-up,Standardized University,Standardized Major,Degree Type,Standardized Graduate Institution,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,Years of Employment,Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League,Crunchbase,LinkedIn
0,Chad Hurley,YouTube,0.0,0.0,Indiana University of Pennsylvania,Design,,,,,0,4.0,1.0,1.0,0,0,0,0,https://www.crunchbase.com/person/chad-hurley,http://www.linkedin.com/profile/view?id=5711
1,Gwyneth Paltrow,Goop Inc.,0.0,0.0,,,,,,,0,12.0,1.0,0.0,0,0,0,0,https://www.crunchbase.com/person/gwyneth-paltrow,https://www.linkedin.com/in/gwyneth-paltrow-48...
2,Jason Calacanis,Inside.com,3.0,0.0,Fordham University,Psychology,BA,,,,0,4.0,1.0,0.0,0,0,0,0,https://www.crunchbase.com/person/jason-calacanis,http://www.linkedin.com/in/jasoncalacanis
3,Tony Fadell,Nest Labs,1.0,0.0,University of Michigan,Computer Science,BS,,,,0,18.0,1.0,0.0,0,0,0,0,https://www.crunchbase.com/person/tony-fadell,http://www.linkedin.com/pub/tony-fadell/0/1/380
4,Matt Mullenweg,Automattic,1.0,0.0,University of Houston,,,,,,0,15.0,1.0,0.0,0,0,0,0,https://www.crunchbase.com/person/matt-mullenweg,http://www.linkedin.com/in/mattm


In [45]:
# Preview the last five rows of the cleaned table.
df.tail(n=5)

Unnamed: 0,Full Name,Primary Company,Previous startups?,Consulting before start-up,Standardized University,Standardized Major,Degree Type,Standardized Graduate Institution,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,Years of Employment,Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League,Crunchbase,LinkedIn
599,Cliff Moon,Opsee,1.0,0.0,University of Delaware,Computer Science,BS,,,,0,6.0,0.0,0.0,0,0,0,0,https://www.crunchbase.com/person/cliff-moon,http://www.linkedin.com/pub/cliff-moon/4/44b/283
600,Hesaam Esfandyarpour,Genapsys,1.0,0.0,Sharif University of Technology,Electrical Engineering,BS,Stanford University,Medicine,MS,1,6.0,0.0,0.0,0,0,0,0,https://www.crunchbase.com/person/hesaam-esfan...,https://www.linkedin.com/pub/hesaam-esfandyarp...
601,Marc Ruxin,TastemakerX,0.0,0.0,Hamilton College,History,BA,,Business,MBA,0,15.0,0.0,0.0,0,1,0,0,https://www.crunchbase.com/person/marc-ruxin,http://www.linkedin.com/in/marcruxin
602,Radu B. Rusu,Fyusion,0.0,0.0,,,,,Computer Science,PhD,1,10.0,0.0,0.0,0,0,0,0,https://www.crunchbase.com/person/radu-b--rusu,https://www.linkedin.com/in/radubogdanrusu
603,Ronni Zehavi,Hibob,2.0,0.0,Tel Aviv University,History,BA,,,,0,11.0,0.0,0.0,0,1,0,0,https://www.crunchbase.com/person/ronni-zehavi-2,https://www.linkedin.com/in/ronnizehavi


In [46]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
df.to_csv(
    "cleaned_input_data.csv",
    index=False,
)

In [47]:
# Load the raw output table (valuation columns), read as ISO-8859-1 (Latin-1).
output = pd.read_csv(
    'output_data.csv',
    encoding="ISO-8859-1",
)

In [48]:
# Preview the first five rows of the raw output table.
output.head(n=5)

Unnamed: 0,Full Name,Primary Company,Seed Valuation,A Valuation,B Valuation,Valuation Increase,Unnamed: 6,Series A Month,Series A Day,Series A Year,Unnamed: 10,Unnamed: 11,Unnamed: 12,CEO at A,CEO at B
0,Chad Hurley,YouTube,,14000000.0,88400000.0,6.314285714,,,,,,,,1.0,1.0
1,Gwyneth Paltrow,Goop Inc.,,40000000.0,65000000.0,1.625,,,,,,,,1.0,1.0
2,Jason Calacanis,Inside.com,"$7,000,000.00",11250000.0,,#VALUE!,,,,,,,,1.0,
3,Tony Fadell,Nest Labs,"$2,700,000.00",49210000.0,288120000.0,5.854907539,,,,,,,,1.0,1.0
4,Matt Mullenweg,Automattic,,8650000.0,238590000.0,27.58265896,,,,,,,,1.0,1.0


In [49]:
# Keep only the first six columns ('Full Name' through 'Valuation Increase')
# and drop everything after them, i.e. the 'Unnamed: N', 'Series A ...' and
# 'CEO at ...' columns shown in the preview above.
# The former bare `output.columns[6:]` line was removed: as a non-final
# expression in the cell it was evaluated and discarded without any effect.
trailing_columns = output.columns[6:]
output = output.drop(columns=trailing_columns)
output.head()

Unnamed: 0,Full Name,Primary Company,Seed Valuation,A Valuation,B Valuation,Valuation Increase
0,Chad Hurley,YouTube,,14000000.0,88400000.0,6.314285714
1,Gwyneth Paltrow,Goop Inc.,,40000000.0,65000000.0,1.625
2,Jason Calacanis,Inside.com,"$7,000,000.00",11250000.0,,#VALUE!
3,Tony Fadell,Nest Labs,"$2,700,000.00",49210000.0,288120000.0,5.854907539
4,Matt Mullenweg,Automattic,,8650000.0,238590000.0,27.58265896


In [50]:
# Replace every missing value in the output table with 0.
# NOTE(review): after this a missing valuation cannot be told apart from a
# valuation of 0. 'Seed Valuation' still holds currency strings such as
# "$7,000,000.00" and 'Valuation Increase' still holds "#VALUE!" entries, so
# those columns are not purely numeric.
output = output.fillna(value=0)

In [51]:
# Preview the first five rows after missing values were replaced with 0.
output.head(n=5)

Unnamed: 0,Full Name,Primary Company,Seed Valuation,A Valuation,B Valuation,Valuation Increase
0,Chad Hurley,YouTube,0,14000000.0,88400000.0,6.314285714
1,Gwyneth Paltrow,Goop Inc.,0,40000000.0,65000000.0,1.625
2,Jason Calacanis,Inside.com,"$7,000,000.00",11250000.0,0.0,#VALUE!
3,Tony Fadell,Nest Labs,"$2,700,000.00",49210000.0,288120000.0,5.854907539
4,Matt Mullenweg,Automattic,0,8650000.0,238590000.0,27.58265896


In [52]:
# Write the trimmed output table to disk; index=False leaves the row index out of the file.
output.to_csv(
    "cleaned_output_data.csv",
    index=False,
)