# IEOR 135: Applied Data Science for Venture Applications
## The Holy Grail of Venture Capital

Project Team: Julian Chan, Thomas Ferry, Mudit Goyal, Nitin Sampath, Yuan Zhou

IPython Notebook: Julian Chan

In [6]:
import numpy as np
import pandas as pd
import json

In [19]:
# Load the cleaned founder feature table and the companion output table.
CSV_ENCODING = "ISO-8859-1"
founders = pd.read_csv("cleaned_data_v2.csv", encoding=CSV_ENCODING)
output = pd.read_csv("output_data_v2.csv", encoding=CSV_ENCODING)

# Draw one random weight in [0, 1) per feature column. The first 2 columns are
# assumed to be founder name and company name, so they get no weight.
num_feature_columns = founders.shape[1] - 2
weights = np.random.uniform(0, 1, size=num_feature_columns)

In [20]:
class FounderSimilarityCalculator:
    """
    Ranks founders by a weighted, per-feature similarity score.

    Every column from the third one onward is treated as a feature and is paired
    positionally with one entry of `weights`. Note that this includes the link
    columns whenever they sit at position 2 or later.
    """

    # Scalar types compared by plain equality.
    _BOOL_TYPES = (bool, np.bool_)
    # Scalar types compared by absolute difference. Integers are included so that
    # integer-valued columns are not silently ignored.
    _NUMERIC_TYPES = (int, float, np.integer, np.floating)

    def __init__(self, data, weights, crunchbase_col=20, linkedin_col=21):
        """
        Input:
            data: pandas dataframe of feature values (assumes founder name is first column, company name is second column)
            weights: array of weights for each feature to be used in determining "distance" between founders
            crunchbase_col: positional index of the column holding the founder's CrunchBase link
                (default 20, i.e. the 21st column)
            linkedin_col: positional index of the column holding the founder's LinkedIn link
                (default 21, i.e. the 22nd column)

        Raises:
            AssertionError: if the number of weights differs from the number of feature columns
        """
        self.founders = data.iloc[:, 0]  # the first column is assumed to be founder name
        self.companies = data.iloc[:, 1]  # the second column is assumed to be company name
        self.crunchbase = data.iloc[:, crunchbase_col]
        self.linkedin = data.iloc[:, linkedin_col]

        self.features = data.iloc[:, 2:]
        self.weights = weights
        self.sum_of_weights = np.sum(weights)  # normalizing factor for similarity

        # ensure that the # of weights corresponds to # of features
        assert self.features.shape[1] == len(self.weights), \
            "expected one weight per feature column (every column after the first two)"

    def _weightedContributionToSimilarity(self, feature1, feature2, weight):
        """
        Computes the weighted contribution of the current feature to the similarity measure.

        Rules, in order:
            1. Two strings: `weight` if they are equal, otherwise 0.
            2. Two booleans: `weight` if they are equal, otherwise 0.
            3. Two numbers (int or float, Python or numpy): 0 if either is NaN. Otherwise
               compute the absolute difference; a difference of at most 1 counts as a full
               match (`weight`), and a larger difference contributes 1/difference * weight,
               so the penalty grows smoothly instead of requiring exact equality.
            4. Anything else (mismatched types, a boolean paired with a number,
               unsupported types): 0.

        Input:
            feature1: feature of founder
            feature2: corresponding feature of different founder
            weight: weight on the feature

        Returns:
            the contribution of this feature, between 0 and `weight`
        """
        if isinstance(feature1, str) and isinstance(feature2, str):
            return weight if feature1 == feature2 else 0

        is_bool1 = isinstance(feature1, self._BOOL_TYPES)
        is_bool2 = isinstance(feature2, self._BOOL_TYPES)
        if is_bool1 or is_bool2:
            # bool is a subclass of int, so it must be handled before the numeric rule.
            return weight if (is_bool1 and is_bool2 and feature1 == feature2) else 0

        if isinstance(feature1, self._NUMERIC_TYPES) and isinstance(feature2, self._NUMERIC_TYPES):
            if np.isnan(feature1) or np.isnan(feature2):
                return 0
            diff = np.abs(feature1 - feature2)
            # NOTE(review): because of this clamp, a numeric 0/1 indicator column
            # always counts as a match -- confirm that this is intended.
            if diff <= 1:
                return weight
            return 1/diff * weight

        return 0

    def _computeWeightedSimilarity(self, founder1_index, founder2_index):
        """
        Computes the weighted similarity between 2 founders.

        Input:
            founder1_index: integer index of founder 1
            founder2_index: integer index of founder 2

        Returns:
            sum of the per-feature contributions divided by the sum of the weights,
            or 0.0 when the weights sum to zero
        """
        if self.sum_of_weights == 0:
            # Every contribution is scaled by a weight; avoid returning 0/0 = NaN.
            return 0.0

        # Pull the raw values so the features are walked strictly by position.
        # Indexing a label-indexed Series with an integer key is deprecated.
        features1 = self.features.iloc[founder1_index, :].to_numpy()
        features2 = self.features.iloc[founder2_index, :].to_numpy()

        similarity = 0
        for value1, value2, weight in zip(features1, features2, self.weights):
            similarity += self._weightedContributionToSimilarity(value1, value2, weight)

        return similarity/self.sum_of_weights

    def findKClosestFounders(self, k, founder_index):
        """
        Finds the k closest founders in terms of similarity to the founder corresponding to founder_index.
        The founder at founder_index is never part of the result.

        Input:
            k: # of most similar founders
            founder_index: integer index of founder of whom we wish to find similar founders

        Returns:
            five parallel lists ordered from most to least similar: founder names,
            company names, similarity scores, CrunchBase links, LinkedIn links

        Raises:
            AssertionError: if k is negative or not smaller than the number of founders
        """
        num_founders = self.founders.shape[0]
        assert 0 <= k < num_founders, "k must be non-negative and smaller than the number of founders"

        similarity = np.zeros(num_founders)
        for i in range(num_founders):
            if i != founder_index:
                similarity[i] = self._computeWeightedSimilarity(founder_index, i)

        # Push the queried founder to the very bottom of the ranking. Leaving it at 0
        # would let it tie with (and be returned among) founders that score 0.
        similarity[founder_index] = -np.inf

        top_indices = similarity.argsort()[::-1][:k]  # highest scores first

        closest_founders = [self.founders.iloc[i] for i in top_indices]
        closest_companies = [self.companies.iloc[i] for i in top_indices]
        similarity_score = [similarity[i] for i in top_indices]
        crunchbase_links = [self.crunchbase.iloc[i] for i in top_indices]
        linkedin_links = [self.linkedin.iloc[i] for i in top_indices]

        return closest_founders, closest_companies, similarity_score, crunchbase_links, linkedin_links

In [26]:
fsc = FounderSimilarityCalculator(founders, weights)

# Look up the 5 founders most similar to the founder stored at row 15
desired_founder_index = 15
(closest_founders,
 closest_companies,
 similarity_score,
 crunchbase,
 linkedin) = fsc.findKClosestFounders(5, desired_founder_index)

# Regroup the parallel result lists into one
# (founder, company, score, crunchbase, linkedin) tuple per match
result_columns = (closest_founders, closest_companies, similarity_score, crunchbase, linkedin)
closest = list(zip(*result_columns))

In [32]:
# Root node of the tree: the founder that was queried. Its link fields are
# fixed placeholder site URLs rather than per-founder links.
data = {
    "name": fsc.founders[desired_founder_index],
    "company": fsc.companies[desired_founder_index],
    "size": 200000,
    "Crunchbase": "www.crunchbase.com",
    "LinkedIn": "www.linkedin.com",
    "children": [],
}

# One child node per similar founder
for tup in closest:
    name, company, similarity, crunchbase, linkedin = tup

    # Anything that is not a string (e.g. a missing value) becomes an empty link
    if not isinstance(crunchbase, str):
        crunchbase = ""
    if not isinstance(linkedin, str):
        linkedin = ""

    # Node size scales with the similarity score
    size = similarity*10000

    new_dict = {
        "name": name,
        "company": company,
        "similarity": similarity,
        "Crunchbase": crunchbase,
        "LinkedIn": linkedin,
        "size": size,
    }
    data["children"].append(new_dict)

with open('data.json', 'w') as outfile:
    outfile.write(json.dumps(data))

In [33]:
# Read the file back to check what was written
with open('data.json') as infile:
    raw_text = infile.read()
    data = json.loads(raw_text)
    print(data)

{'name': 'Brendan Iribe', 'company': 'Oculus', 'size': 200000, 'Crunchbase': 'www.crunchbase.com', 'LinkedIn': 'www.linkedin.com', 'children': [{'name': 'James Foster', 'company': 'ZeroFOX', 'similarity': 0.4859428116776635, 'Crunchbase': 'https://www.crunchbase.com/person/james-foster', 'LinkedIn': 'http://www.linkedin.com/in/jamescfoster', 'size': 4859.428116776635}, {'name': 'Anthony Casalena', 'company': 'Squarespace', 'similarity': 0.47838419255923587, 'Crunchbase': 'https://www.crunchbase.com/person/anthony-casalena', 'LinkedIn': 'http://www.linkedin.com/in/acasalena', 'size': 4783.841925592359}, {'name': 'Brett Schulman', 'company': 'Cava Grill', 'similarity': 0.4149902626625205, 'Crunchbase': 'https://www.crunchbase.com/person/brett-schulman', 'LinkedIn': '', 'size': 4149.902626625205}, {'name': 'Kevin Busque', 'company': 'Guideline Technologies Inc.', 'similarity': 0.4035416762814158, 'Crunchbase': 'https://www.crunchbase.com/person/kevin-busque', 'LinkedIn': 'https://www.link