In [15]:
import requests
import pandas as pd
import numpy as np
import heapq
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

In [16]:
pd.options.mode.chained_assignment = None

key_mattermark = "93284adac11d34633e1781d5fb0129f579741eeaa41daa2e1e7f553b60d16020"
key_crunchbase = "6da4206c39f52b4f6bfd8ce2e83af25e"
API = 0
null=None

In [17]:
def matterMarkData(key_mattermark):
    '''
    Mattermark is a website which provides REST API based funding event information.
    However, it only provides 100 free API calls to access the data and after that
    it the key will expire.However, we have downloaded the data and stored it in csv.

    If API == 1, the data will be taken by REST API calls. It will generate the error
    when free API calls are over.
    If API == 0 : the data is already downloaded by website using the code "API == 1"
    should be used,
    '''
    if API == 1:
        mattermark_data = []
        total = 50
        for i in range(1,total):
            response = requests.get("https://api.mattermark.com/fundings/?key=93284adac11d34633e1781d5fb0129f579741eeaa41daa2e1e7f553b60d16020&page="+str(i)+"&per_page=10000")
                    
            if response.status_code < 400:
                fundingData = response.json()
                for item in fundingData["fundings"]:
                    temp = {}
                    temp["company_name"] = item.get("company_name", None)
                    temp["company_category_code"] = item.get("industry", None)
                    temp["company_country_code"] = item.get("country", None)
                    temp["company_state_code"] = item.get("state", None)	
                    temp["company_region"] = item.get("region",None)
                    temp["company_city"] = item.get("city", None)
            
                    #get funding round type and convert it to similar type as crunchbase data
                    funding = item.get("series", None)
                    if funding == 'a':
                        temp["funding_round_type"] = "series-a"
                    elif funding == 'b':
                        temp["funding_round_type"] = "series-b"
                    elif funding == 'c' or funding == 'd':
                        temp["funding_round_type"] = "series-c+"
                    elif funding == "unknown":
                        temp["funding_round_type"] = "other"
                    else:
                        temp["funding_round_type"] = "series-c+"

                    #get funding dates and convert it to similar type as crunchbase data
                    temp["funded_at"] = item.get("rounds_funding_date", None)
                    x = item.get("rounds_funding_date", None)
                    if x == None:
                        temp["funded_year"] = None
                    else:
                        temp["funded_year"] = x.split('-')[0]
                    temp["raised_amount_usd"] = item.get("amount", None)

                    #get investors data and split if there are multiple investors
                    investor = item.get("investors", None).split(",")
                    for item in investor:
                        temp["investor_name"] = investor

                        mattermark_data.append(temp)
                mattermark_dataset = pd.DataFrame(mattermark_data)
                mattermark_dataset.to_csv("C:/Akshata/Courses/summer19/large_scale_analytics/group_project/mattermark.csv")
            else:
                print("Error Invalid response: ", response.status_code)
                mattermark_dataset = pd.DataFrame(mattermark_data)
                mattermark_dataset.to_csv("C:/Akshata/Courses/summer19/large_scale_analytics/group_project/mattermark.csv")
                break

    else:
        mattermark_dataset = pd.read_csv("C:/Akshata/Courses/summer19/large_scale_analytics/group_project/mattermark.csv", encoding='unicode_escape', dtype=object)
    mattermark_dataset.drop(mattermark_dataset.columns[0],axis =1, inplace=True)
    return mattermark_dataset

In [18]:
def crunchBaseData(key_crunchbase):
    #The key in this code is already expired. The data is already saved as a csv file locally and will be used.
    '''
    response = requests.get('https://api.crunchbase.com/bulk/v4/bulk_export.tar.gz?user_key=6da4206c39f52b4f6bfd8ce2e83af25e')
    if response.status_code < 400:
        crunchbase_dataset = pd.read_csv('bulk_export.tar.gz', compression='gzip', header=0, sep=' ', quotechar='"', error_bad_lines=False)
    else:
        print("Error Invalid response:", response.status_code)
    '''
    #get crunchbase dataset
    crunchbase_dataset = pd.read_csv("C:/Akshata/Courses/summer19/large_scale_analytics/group_project/crunchbase-investments.csv", encoding='unicode_escape', dtype=object)

    crunchbase_dataset=crunchbase_dataset.drop(['company_permalink', 'investor_permalink', 'investor_category_code', 'investor_country_code', 'investor_state_code','investor_region', 'investor_city','funded_month', 'funded_quarter'],axis=1)

    crunchbase_dataset.duplicated(subset=None, keep='first')
    return crunchbase_dataset


def dataPreprocessing(crunchbase_dataset, mattermark_dataset):
    #join datasets
    total_dataset = pd.concat([crunchbase_dataset, mattermark_dataset], sort=True)

    #preprocess dataset
    total_dataset.dropna(axis=0, subset=['company_name','investor_name', 'raised_amount_usd',])
    total_dataset.dropna(inplace=True)

    #divide test data and training data
    msk = np.random.rand(len(total_dataset)) < 0.8
    df_train = total_dataset[msk]
    df_test = total_dataset[~msk]
    
    #label encoding
    le = preprocessing.LabelEncoder()
    le_ = preprocessing.LabelEncoder()

    #label encoding for training data
    df_train.company_category_code = le.fit_transform(df_train.company_category_code)
    df_train.company_country_code = le.fit_transform(df_train.company_country_code)
    df_train.company_state_code = le.fit_transform(df_train.company_state_code)
    df_train.funding_round_type = le.fit_transform(df_train.funding_round_type)
    df_train.raised_amount_usd = le.fit_transform(df_train.raised_amount_usd)

    #label encoding for test data
    df_test.company_category_code = le_.fit_transform(df_test.company_category_code)
    df_test.company_country_code = le_.fit_transform(df_test.company_country_code)
    df_test.company_state_code = le_.fit_transform(df_test.company_state_code)
    df_test.funding_round_type = le_.fit_transform(df_test.funding_round_type)
    df_test.raised_amount_usd = le_.fit_transform(df_test.raised_amount_usd)

    return df_train, df_test

    
def collaborativeFiltering(df_train, df_test):
    '''
    collaborative filtering is performed on investor basis.    
    '''
    #get unique investors in the dataset
    i=0
    similarity=0
    sim=0
    test_data=0
    train_data=0
    recommender = {}
    for test_name in df_test.investor_name.unique():
        df_test_current = df_test[df_test['investor_name']==test_name]
        for name in df_train.investor_name.unique():
            df_train_current = df_train[df_train['investor_name'] == name]
            for test_index, test_item in df_test_current.iterrows():
                for train_index, train_item in df_train_current.iterrows():
                    x = [train_item['company_category_code'],train_item['company_country_code'], train_item['company_state_code'], train_item['funding_round_type'], train_item['raised_amount_usd']]
                    x = np.array(x).reshape(1,-1)
                    y = [test_item['company_category_code'],test_item['company_country_code'], test_item['company_state_code'], test_item['funding_round_type'], test_item['raised_amount_usd']]
                    y = np.array(y).reshape(1,-1)
                    matrix = cosine_similarity(x,y)
                    sim+=matrix[0][0]
                #Moving average calculation
                sim/=(len(df_train_current))
                similarity+=sim
                sim=0
            #moving average calculation
            similarity/=(len(df_test_current))
            #Add final simialrity of test investor with train investor to the final dictionary            
            if recommender.get(test_name, None):
                recommender[test_name].update({name:similarity})
            else: recommender[test_name] = {name:similarity}
            similarity=0
        match = heapq.nlargest(10, recommender[test_name].items(), key=lambda i:i[1])

        #provide recommendation to the investor
        i=0
        print("Recommendation for ",test_name," based on Collaborative Filtering:- ")
        companies = []
        for item in match:
            if i < 10:
                x = df_train.loc[df_train['investor_name'] == item[0]]['company_name'].values[0]
                if x in companies:
                    pass
                else:
                    companies.append(x)
                    i+=1
            else: break
        for x in companies:
            print("    ", x)

        #Break statement is inserted here to come out of the loop after providing recommendation only for one test investor.
        #Remove break if recommendation is required for more than one test investor
        break
    return 0

    
def contentBasedFiltering(df_train, df_test):
    '''
    content based filtering is performed on per company basis.    
    '''
    
    #get unique investors in the dataset
    i=0
    similarity=0
    sim=0
    test_data=0
    train_data=0
    recommender = {}
    for test_name in df_test.company_name.unique():
        df_test_current = df_test[df_test['company_name']==test_name]
        for name in df_train.company_name.unique():
            df_train_current = df_train[df_train['company_name'] == name]
            for test_index, test_item in df_test_current.iterrows():
                for train_index, train_item in df_train_current.iterrows():
                    x = [train_item['company_category_code'],train_item['company_country_code'], train_item['company_state_code'], train_item['funding_round_type'], train_item['raised_amount_usd']]
                    x = np.array(x).reshape(1,-1)
                    y = [test_item['company_category_code'],test_item['company_country_code'], test_item['company_state_code'], test_item['funding_round_type'], test_item['raised_amount_usd']]
                    y = np.array(y).reshape(1,-1)
                    matrix = cosine_similarity(x,y)
                    sim+=matrix[0][0]
                #Moving average calculation
                sim/=(len(df_train_current))
                similarity+=sim
                sim=0
            #moving average calculation
            similarity/=(len(df_test_current))
            #Add final simialrity of test investor with train investor to the final dictionary            
            if recommender.get(test_name, None):
                recommender[test_name].update({name:similarity})
            else: recommender[test_name] = {name:similarity}
            similarity=0
        match = heapq.nlargest(10, recommender[test_name].items(), key=lambda i:i[1])
        #provide recommendation to the investor
        i=0
        print("Recommendation for ",df_test_current['investor_name'].item()," based on Content Based Filtering:-")
        for item in match:
            print("    ", item[0])
        #Break statement is inserted here to come out of the loop after providing recommendation only using one test company.
        #Remove break if recommendation is required using more than one test company.
        break
    return 0


In [19]:
mattermark_dataset = matterMarkData(key_mattermark)

In [20]:
crunchbase_dataset = crunchBaseData(key_crunchbase)

In [21]:
df_train, df_test = dataPreprocessing(crunchbase_dataset, mattermark_dataset)

In [22]:
contentBasedFiltering(df_train, df_test)

Recommendation for  10Xelerator  based on Content Based Filtering:-
     Oncofactor Corporation
     General Dynamics
     TappIn
     SnowShoe Stamp
     SplitSecnd
     CoverWallet
     AppTrigger
     ID.me
     Aclaris Therapeutics
     TriActive


0

In [23]:
collaborativeFiltering(df_train, df_test)

Recommendation for  10Xelerator  based on Collaborative Filtering:- 
     Moda Operandi
     Oncofactor Corporation
     Geospock
     CoverWallet
     ID.me
     Swept
     Plextronics
     MetaSolv
     Mental Canvas
     Palyon Medical


0