# IEOR 135: Applied Data Science for Venture Applications
## The Holy Grail of Venture Capital

Project Team: Julian Chan, Thomas Ferry, Mudit Goyal, Nitin Sampath, Yuan Zhou

IPython Notebook: Julian Chan

In [94]:
import numpy as np
import pandas as pd
import json

#### Load in founder data file that includes information about them such as company name, years of employment, university attended, etc.

In [95]:
founders = pd.read_csv("cleaned_input_data.csv", encoding="ISO-8859-1")
founders.fillna("")
founders.head()

Unnamed: 0,Full Name,Primary Company,Previous startups?,Consulting before start-up,Standardized University,Standardized Major,Degree Type,Standardized Graduate Institution,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,Years of Employment,Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League,Crunchbase,LinkedIn
0,Chad Hurley,YouTube,0,0,Indiana University of Pennsylvania,Design,,,,,0.0,4.0,1,1,0.0,0.0,0,0,https://www.crunchbase.com/person/chad-hurley,http://www.linkedin.com/profile/view?id=5711
1,Gwyneth Paltrow,Goop Inc.,0,0,,,,,,,0.0,12.0,1,0,0.0,0.0,0,0,https://www.crunchbase.com/person/gwyneth-paltrow,https://www.linkedin.com/in/gwyneth-paltrow-48...
2,Jason Calacanis,Inside.com,3,0,Fordham University,Psychology,BA,,,,0.0,4.0,1,0,0.0,0.0,0,0,https://www.crunchbase.com/person/jason-calacanis,http://www.linkedin.com/in/jasoncalacanis
3,Tony Fadell,Nest Labs,1,0,University of Michigan,Computer Science,BS,,,,0.0,18.0,1,0,0.0,0.0,0,0,https://www.crunchbase.com/person/tony-fadell,http://www.linkedin.com/pub/tony-fadell/0/1/380
4,Matt Mullenweg,Automattic,1,0,University of Houston,,,,,,0.0,15.0,1,0,0.0,0.0,0,0,https://www.crunchbase.com/person/matt-mullenweg,http://www.linkedin.com/in/mattm


#### Load in Seed and Series Valuation data that includes information about how much their startup was valuated at in different stages.

In [96]:
output = pd.read_csv("cleaned_output_data.csv", encoding="ISO-8859-1")
output.head()

Unnamed: 0,Full Name,Primary Company,Seed Valuation,A Valuation,B Valuation,Valuation Increase
0,Chad Hurley,YouTube,0,14000000.0,88400000.0,6.314285714
1,Gwyneth Paltrow,Goop Inc.,0,40000000.0,65000000.0,1.625
2,Jason Calacanis,Inside.com,"$7,000,000.00",11250000.0,0.0,#VALUE!
3,Tony Fadell,Nest Labs,"$2,700,000.00",49210000.0,288120000.0,5.854907539
4,Matt Mullenweg,Automattic,0,8650000.0,238590000.0,27.58265896


#### Load in the weights for each feature generated from a Random Forest Regressor. The weights are determined using importance values from predicting series valuation from information about the founder. The weights are the importance values of each feature determined by the Random Forest Regressor.

In [97]:
# Load in weights for each feature (generated using RandomForestRegressor)
weight_df = pd.read_csv("weights.csv", encoding="ISO-8859-1")
weight_df.head()

Unnamed: 0,Feature,Importance,Feature Index
0,Previous startups?,0.060639,1
1,Consulting before start-up,0.036717,2
2,Standardized University,0.231523,3
3,Standardized Major,0.198227,4
4,Degree Type,0.087502,5


In [98]:
weights = weight_df["Importance"]

#### Create a similarity calculator object. See below for class API:
* (PRIVATE METHOD) _weightedContributionToSimilarity: given 2 feature values and the weight of that feature, compute the weighted contribution of this feature to the similarity score
    * If the feature is a string, we check for equality and assign a 0 (not equal) or 1 (equal).
    * If the feature is a numeric value,
        * If absolute difference between the 2 feature values is 0 or 1, assign 1
        * If absolute difference between the 2 feature values is > 1, assign $\frac{1}{difference}$
        
* (PRIVATE METHOD) _computeWeightedSimilarity: given information about a founder (in a numpy array) not in our database and the index of a founder in our database, compute the weighted similarity between the two founders
    * This score is essentially a linear combination of the contributions computed in the function above using the appropriate weights determined by our Random Forest Classifier.


* findKClosestFounders: given k (the number of similar founders desired) and information about a founder (in a numpy array) not in our database, find the k closest founders using their similarity score (the greater their similarity score, the closer they are).

In [99]:
class FounderSimilarityCalculator:
    def __init__(self, data, weights):
        """
        Input:
            data: pandas dataframe of feature values (assumes founder name is first column, company name is second column)
            weights: array of weights for each feature to be used in determining "distance" between founders
        """
        self.founders = data.iloc[:,0] # the first column is assumed to be founder name
        self.companies = data.iloc[:,1] # the second column is assumed to be company name
        self.crunchbase = data.iloc[:,18] # the 19th column is assumed to be the founder's CrunchBase link
        self.linkedin = data.iloc[:,19] # the 20th column is assumed to be the founder's LinkedIn link
        
        self.features = data.iloc[:,2:18]
        self.weights = weights
        self.sum_of_weights = np.sum(weights) # compute the sum of the weights as a normalizing factor for similarity
        
        assert self.features.shape[1] == len(self.weights) # ensure that the # of weights corresponds to # of features
    
    def _weightedContributionToSimilarity(self, feature1, feature2, weight):
        """
        Computes the weighted contribution of the current feature to the similarity measure.
        
        If the two values are the same, then we add 1*weight to the similarity score.
        
        For features whose values are continuous, the probability of 2 values being equal is very small, so we don't want
        to penalize them for being different. We can use a smoother penalty based on how different they are. So, if the 
        two values are different,
            1. Compute the absolute difference between the two values
            2. Add 1/difference * weight to the similarity score
        
        Input:
            feature1: feature of founder
            feature2: corresponding feature of different founder
            weight: weight on the feature
        """
        if isinstance(feature1, str) and isinstance(feature2, str):
            if feature1 == feature2:
                return weight
            else:
                return 0
        elif isinstance(feature1, float) and isinstance(feature2, float):
            if not (np.isnan(feature1) or np.isnan(feature2)):
                diff = np.abs(feature1 - feature2)
                if diff <= 1:
                    return weight
                else:
                    return 1/diff * weight
            else:
                return 0
        return 0
    
    def _computeWeightedSimilarity(self, founder1_data, founder2_index):
        """
        Computes the weighted similarity between 2 founders.
        
        Input:
            founder1_data: information (feature values) of founder 1
            founder2_index: integer index of founder 2
        """
        features1 = founder1_data
        features2 = self.features.iloc[founder2_index,:]
        
        similarity = 0
        for i in range(features1.shape[0]):
            similarity += self._weightedContributionToSimilarity(features1[i], features2[i], self.weights[i])
        
        return similarity/self.sum_of_weights
    
    def findKClosestFounders(self, k, founder_data):
        """
        Finds the k closest founders in terms of similarity to the founder corresponding to founder_index.
        
        Input:
            k: # of most similar founders
            founder_data: information (feature values) of founder of whom we wish to find similar founders
        """
        assert k < self.founders.shape[0]
        
        similarity = np.zeros(self.founders.shape[0])
        
        for i in range(self.founders.shape[0]):
            similarity[i] = self._computeWeightedSimilarity(founder_data, i)
            
        min_indices = similarity.argsort()[::-1][:k]
        
        closest_founders = []
        closest_companies = []
        similarity_score = []
        linkedin_links = []
        
        for i in min_indices:
            closest_founders.append(self.founders.iloc[i])
            closest_companies.append(self.companies.iloc[i])
            similarity_score.append(similarity[i])
            linkedin_links.append(self.linkedin.iloc[i])
        
        return closest_founders, closest_companies, similarity_score, linkedin_links

#### Create a FounderSimilarityCalculator instance using the founder data that we read in and the weights we computed using the Random Forest Regressor. Input the data of a founder (who is NOT in our database) that we are interested in. Then find the k closest founders in our database to the external founder.

In [113]:
fsc = FounderSimilarityCalculator(founders, weights)

# Find the 5 closest founders
founder_data = {"Full Name": "Stephen Torres",
               "Primary Company": "PV Solar Report",
               "Previous startups?": 1,
               "Consulting before start-up": 0,
               "Standardized University": "University of California Berkeley",
               "Standardized Major": "Business",
               "Degree Type": "BA",
               "Standardized Graduate Institution": "Cornell University",
               "Standradized Graduate Studies": "Business",
               "Graduate Diploma": "MBA",
               "Ever served as TA/Teacher/Professor/Mentor?": 1,
               "Years of Employment": 9,
               "Worked as product manager/director/head/VP?": 0,
               "Worked at Google?": 0,
               "Worked at Microsoft": 0,
               "Worked in Sales?": 1,
               "Stanford or Berkeley": 1,
               "Ivy League": 1,
               "Crunchbase": "",
               "LinkedIn": "https://www.linkedin.com/in/stephendtorres/"}

k = 5
closest_founders, closest_companies, similarity_score, linkedin = fsc.findKClosestFounders(k, 
                                                                            np.array(list(founder_data.values())[2:18]))

closest = list(zip(closest_founders, closest_companies, similarity_score, linkedin))

#### Format the results to write it to a JSON file to be fed into the front-end for display to the end user.

In [111]:
data = {}
data["name"] = founder_data["Full Name"]
data["company"] = founder_data["Primary Company"]
data["size"] = 3000
data["link"] = founder_data["LinkedIn"]

data["children"] = []
for tup in closest:
    name = tup[0]
    company = tup[1]
    similarity = round(tup[2], 4)
    linkedin = tup[3] if isinstance(tup[3], str) else ""
    size = tup[2]*(10**4)
    
    new_dict = {"name": name,
               "company": company,
               "similar": similarity,
               "link": linkedin,
               "size": size,}
    data["children"].append(new_dict)

with open('data.json', 'w') as outfile:  
    json.dump(data, outfile)

In [114]:
with open('data.json') as infile:
    data = json.load(infile)
    
    for key, val in data.items():
        if key != "children":
            print(key, ":", val)
        else:
            children = data["children"]
            print("\n***************Most similar founders:***************\n")
            for founder_info in children:
                for key, val in founder_info.items():
                    print(key, ":", val)
                print()

name : Stephen Torres
company : PV Solar Report
size : 3000
link : https://www.linkedin.com/in/stephendtorres/

***************Most similar founders:***************

name : Assaf Resnick
company : BigPanda
similar : 0.4297
link : https://www.linkedin.com/in/assafresnick
size : 4297.4994157025

name : Steven Lam
company : GOGOVAN
similar : 0.4297
link : https://hk.linkedin.com/in/stevenhylam
size : 4297.4994157025

name : David Niu
company : TINYhr
similar : 0.3663
link : http://www.linkedin.com/in/davidniu
size : 3662.5613763374377

name : David Yeom
company : Hollar
similar : 0.333
link : 
size : 3329.5958166704045

name : Tariq Hilaly
company : Lumity, Inc.
similar : 0.333
link : https://www.linkedin.com/in/tariqhilaly
size : 3329.5958166704045

