# IEOR 135: Applied Data Science for Venture Applications
## The Holy Grail of Venture Capital

Project Team: Julian Chan, Thomas Ferry, Mudit Goyal, Nitin Sampath, Yuan Zhou

IPython Notebook: Julian Chan

In [47]:
import numpy as np
import pandas as pd

In [70]:
founders = pd.read_excel("data/Independent Variable Input.xlsx")

# Remove the founders for whom we only have their name and company name
founders.dropna(axis=0, thresh=9, inplace=True)

# Remove the features for which we have less than 25% of the data populated
cols_to_remove = []
for col in founders.columns:
    if founders[col].isnull().sum()/founders[col].shape[0] > 0.75:
        cols_to_remove.append(col)
founders.drop(cols_to_remove, axis=1, inplace=True)

# Initialize weights for each feature; assume 2 columns are founder name and company name
weights = np.array([np.random.uniform(0, 1) for _ in range(founders.shape[1] - 2)])

In [71]:
class FounderSimilarityCalculator:
    def __init__(self, data, weights):
        """
        Input:
            data: pandas dataframe of feature values (assumes founder name is first column, company name is second column)
            weights: array of weights for each feature to be used in determining "distance" between founders
        """
        self.founders = data.iloc[:,0] # the first column is assumed to be founder name
        self.companies = data.iloc[:,1] # the second column is assumed to be company name
        self.features = data.iloc[:,2:]
        self.weights = weights
        assert self.features.shape[1] == len(self.weights) # ensure that the # of weights corresponds to # of features
    
    def _computeSimpleDistance(self, founder1_index, founder2_index):
        """
        Computes the "distance" between two founders using the Euclidean norm.
        distance(A, B) = (A0 - B0)^2 + ... (An - Bn)^2
        
        Input:
            founder1_index: integer index of founder 1
            founder2_index: integer index of founder 2
        """
        founder1 = self.founders.iloc[founder1_index]
        founder2 = self.founders.iloc[founder2_index]
        
        features1 = self.features.iloc[founder1_index,:]
        features2 = self.features.iloc[founder2_index,:]
        
        simple_distance = 0
        for i in range(features1.shape[0]):
            if isinstance(features1[i], str) and isinstance(features2[i], str):
                if features1[i] == features2[i]:
                    simple_distance += 1
            elif isinstance(features1[i], float) and isinstance(features2[i], float):
                if not (np.isnan(features1[i]) and np.isnan(features2[i])):
                    if features1[i] == features2[i]:
                        simple_distance += 1
                        
        return simple_distance/features1.shape[0]
    
    def _computeWeightedDistance(self, founder1_index, founder2_index):
        """
        Computes the weighted "distance" between two founders using the Euclidean norm.
        distance(A, B) = w0*(A0 - B0)^2 + ... wn*(An - Bn)^2
        
        Input:
            founder1_index: integer index of founder 1
            founder2_index: integer index of founder 2
        """
        founder1 = self.founders.iloc[founder1_index]
        founder2 = self.founders.iloc[founder2_index]
        
        features1 = self.features.iloc[founder1_index,:]
        features2 = self.features.iloc[founder2_index,:]
        
        weighted_distance = 0
        for i in range(features1.shape[0]):
            if isinstance(features1[i], str) and isinstance(features2[i], str):
                if features1[i] == features2[i]:
                        weighted_distance += self.weights[i]
            elif isinstance(features1[i], float) and isinstance(features2[i], float):
                if not (np.isnan(features1[i]) and np.isnan(features2[i])):
                    if features1[i] == features2[i]:
                        weighted_distance += self.weights[i]
                        
        return weighted_distance/features1.shape[0]
    
    def computeDistance(self, founder1_index, founder2_index, metric="simple"):
        if metric == "simple":
            return self._computeSimpleDistance(founder1_index, founder2_index)
        elif metric == "weighted":
            return self._computeWeightedDistance(founder1_index, founder2_index)
    
    def findKClosestFounders(self, k, founder_index, metric="simple"):
        """
        Finds the k closest founders in terms of distance (using the metric specified) to the founder corresponding to
        founder_index.
        
        Input:
            k: # of most similar founders
            founder_index: integer index of founder of whom we wish to find similar founders
        """
        assert k < self.founders.shape[0]
        
        dist = np.empty(self.founders.shape[0])
        dist.fill(float('inf'))
        
        for i in range(self.founders.shape[0]):
            if i == founder_index:
                continue
            dist[i] = self.computeDistance(founder_index, i, metric)
            
        min_indices = dist.argsort()[:k]
        
        closest_founders = []
        closest_companies = []
        for i in min_indices:
            closest_founders.append(self.founders.iloc[i])
            closest_companies.append(self.companies.iloc[i])
        
        return closest_founders, closest_companies

In [76]:
fsc = FounderSimilarityCalculator(founders, weights)

closest_founders_simple, closest_companies_simple = fsc.findKClosestFounders(10, 0, "simple")
closest_founders_weighted, closest_companies_weighted = fsc.findKClosestFounders(10, 0, "weighted")

simple = list(zip(closest_founders_simple, closest_companies_simple))
weighted = list(zip(closest_founders_weighted, closest_companies_weighted))

print("Simple | Weighted")
for i in range(len(simple)):
    print(simple[i], "|", weighted[i])

Simple | Weighted
('Osman Khan', 'Paddle8') | ('Osman Khan', 'Paddle8')
('Rachel Carlson', 'Guild Education') | ('Rachel Carlson', 'Guild Education')
('Xander Oxman', 'Winc') | ('Joshua Feast', 'Cogito Corporation')
('Bhavin Shah', 'Refresh.io') | ('Alfred Chuang', 'Magnet Systems Inc.')
('Alfred Chuang', 'Magnet Systems Inc.') | ('James Gwertzman', 'PlayFab, Inc.')
('James Gwertzman', 'PlayFab, Inc.') | ('Rashid Mansoor', 'Hadean')
('Jim Kean', 'WellnessFX') | ('Xander Oxman', 'Winc')
('Rashid Mansoor', 'Hadean') | ('Jim Kean', 'WellnessFX')
('Prem Uppaluru', 'Transera Communications') | ('Prem Uppaluru', 'Transera Communications')
('Rob Principe', 'Scratch Music Group') | ('Bhavin Shah', 'Refresh.io')
