# Imports & Notes to Improve Workflow

In [1]:
# Setup matplotlib to plot inline (within the notebook)
%matplotlib inline
# Import the pyplot module of Matplotlib as plt
import matplotlib.pyplot as plt
# Import pandas under the abbreviation 'pd'
import pandas as pd
# Import NumPy under the abbreviation 'np'
import numpy as np
# Libraries to aid in web scraping
import requests
import json
import re
from bs4 import BeautifulSoup
from datetime import datetime
import seaborn as sns;
#Notes for efficiency:
#shift + tab when cursor is in arguments to bring up its documentation
#ex: pd.DataFrame(Shift+tab here)
#recall: Json == Dictionary
#to know functions of pandas:
#dir(pd)

# Reading The Data

In [None]:
# using pandas and read_csv()
acquis = pd.read_csv("datasets/startups/acquisitions.csv")
additions = pd.read_csv("datasets/startups/additions.csv")
companies = pd.read_csv("datasets/startups/companies.csv")
invests = pd.read_csv("datasets/startups/investments.csv")
rounds = pd.read_csv("datasets/startups/rounds.csv")
dfAll = [acquis, additions, companies, invests, rounds]

In [None]:
#shapes (rows x cols) of each dataset
for ele in dfAll:
    print(ele.shape)

# Samples of the datasets

In [None]:
acquis.sample(5).transpose()

In [None]:
additions.sample(5).transpose()

In [None]:
companies.sample(5).transpose()

In [None]:
invests.sample(5).transpose()

In [None]:
rounds.sample(5).transpose()

# Seeing the columns in the datasets

In [None]:
def datasetsCols(dfList, datasetNames, pad=""):
    """
    Build a side-by-side overview of the column names of several dataframes.

    Each dataframe in "dfList" contributes one column to the result, labelled
    with the matching entry of "datasetNames" and holding that dataframe's
    column names. Shorter lists are filled up with "pad" so that every
    column of the result has the same length.

    Example
    pad = "XX", datasetNames = ["cars", "planes"]
    dfList = [carsDF, planesDF]
    returned data frame:
            cars            planes
    0       numOfWheels     numOfWings
    1       manufacturer    manufacturer
    2       make            XX
    """
    # One fresh list of column names per dataframe
    nameLists = [frame.columns.tolist() for frame in dfList]
    longest = max((len(names) for names in nameLists), default=0)

    overview = {}
    for pos, names in enumerate(nameLists):
        filler = [pad] * (longest - len(names))  # nothing is added to the longest list
        overview[datasetNames[pos]] = names + filler

    return pd.DataFrame(overview)

In [None]:
lstCsvNames = ["acquisitions", "additions", "companies", "investments", "rounds"]

dfAllCols = datasetsCols(dfAll, lstCsvNames)
dfAllCols

# Data Cleaning

First, let's rename the columns in the `companies` dataset <br>
so that they match the naming used in the other datasets: <br>
(we're doing this so that, if we merge the datasets later, the shared columns have the same names)

In [None]:
companies.rename(columns = {'permalink' : 'company_permalink', 
                            'name' : 'company_name',
                            'category_list' : 'company_category_list',
                            'country_code' : 'company_country_code',
                            'state_code' : 'company_state_code',
                            'region' : 'company_region',
                            'city' : 'company_city'}, inplace = True)

In [None]:
dfAll[2] = companies
dfAllCols = datasetsCols(dfAll, lstCsvNames)
dfAllCols

Let's start by analyzing `additions` dataset, as it has unusual format of columns

## Removing "additions" Dataset

In [None]:
additions.sample(5)

In [None]:
additions["content"].unique()

In [None]:
dfAllCols

In [None]:
additions.shape

In [None]:
companies.shape, rounds.shape

So apparently there are no columns that could be used to join with the other datasets. <br>
Furthermore, the "value" column is too vague to be useful. <br>
Therefore, the "additions" dataset will be discarded.

In [None]:
dfAll.pop(1)
lstCsvNames.pop(1)
len(dfAll), lstCsvNames

## `rounds` Dataset

In [None]:
dfAllCols.iloc[:, -2:]

In [None]:
invests.shape, rounds.shape

In [None]:
invests.head()

In [None]:
rounds.head()

In [None]:
len(invests["company_name"].unique()), len(rounds["company_name"].unique())

Since there are companies in `rounds` dataset that are not in `investments` dataset, <br>
therefore we should keep `rounds` dataset for further analysis.

# Data Cleaning (Cont.)

## Checking All Missing Values

In [None]:
def datasetsNulls(dfList, datasetNames, nullCol="nulls_", pad=""):
    """
    Build a side-by-side overview of the null counts of several dataframes.

    For the i-th dataframe the result gets two adjacent columns: one labelled
    datasetNames[i] with the dataframe's column names, and one labelled
    nullCol + str(i+1) with the number of nulls in each of those columns.
    Shorter name lists are filled with "pad" and shorter count lists with
    None so that all columns of the result have the same length.
    Returns a dataframe
    """
    nameLists = [frame.columns.tolist() for frame in dfList]
    countLists = [frame.isnull().sum().tolist() for frame in dfList]
    longest = max((len(names) for names in nameLists), default=0)

    overview = {}
    for pos, (names, counts) in enumerate(zip(nameLists, countLists)):
        shortfall = longest - len(names)  # names and counts have the same length
        overview[datasetNames[pos]] = names + [pad] * shortfall
        overview[nullCol + str(pos + 1)] = counts + [None] * shortfall

    return pd.DataFrame(overview)

In [None]:
dfAllNulls = datasetsNulls(dfAll, lstCsvNames)
dfAllNulls

### `dropna()` for columns with few missing values

Notice that there is one row that doesn't contain a company name, <br>
so let's validate that it is the same company across all datasets, <br>
so we can remove it:

In [None]:
datasetsCols(dfAll, lstCsvNames)

In [None]:
#company_permalink of each row that doesn't have a company name:
for df in dfAll:
    print(df[df['company_name'].isnull()]['company_permalink'].values)

So apparently, the `acquisitions` dataset has a different permalink that has no company name, <br>
so we can remove these two companies, as the total number of companies is large enough to get the insights that we want.

`dfAllNulls` dataframe also shows that there are missing values for `acquirer_name` and `investor_name` <br>
which can't be imputed, so we'll also remove them.

Finally, we'll remove the missing values for columns that have less than 100 `NaN`, <br>
as they're small in comparison to their respective datasets:

In [None]:
# This syntax won't work, as it turns out, df[0] is a copy of a dataframe (eg: a copy of acquisitions, so they don't share the same reference)
#for df in dfAll:
#    nulls = df.isnull().sum()
#    cols = nulls[(nulls <= 100) & (nulls != 0)].index.tolist()
#    df.dropna(subset=cols, inplace=True)
#dfAllNulls = datasetsNulls(dfAll, lstCsvNames)
#dfAllNulls

In [None]:
# For each dataset: find the columns that have between 1 and 100 missing
# values and drop the rows where any of those columns is NaN.
# `nulls` and `cols` are recomputed from scratch for every dataset.
nulls = acquis.isnull().sum()
cols = nulls[(nulls <= 100) & (nulls != 0)].index.tolist()
acquis.dropna(subset=cols, inplace=True)

nulls = companies.isnull().sum()
cols = nulls[(nulls <= 100) & (nulls != 0)].index.tolist()
companies.dropna(subset=cols, inplace=True)

nulls = invests.isnull().sum()
cols = nulls[(nulls <= 100) & (nulls != 0)].index.tolist()
invests.dropna(subset=cols, inplace=True)

nulls = rounds.isnull().sum()
cols = nulls[(nulls <= 100) & (nulls != 0)].index.tolist()
rounds.dropna(subset=cols, inplace=True)

In [None]:
dfAll = [acquis, companies, invests, rounds]
lstCsvNames = ["acquisitions", "companies", "investments", "rounds"]
dfAllNulls = datasetsNulls(dfAll, lstCsvNames)
dfAllNulls

## Converting & Imputing Dates

In [None]:
#Farah: plz impute (not drop) the columns' missing values related to dates in each dataset which are:
#founded_at.. and that's it :] 
# Suggestion: convert the dates of all the columns that have date values from "object" to datetime
#       Helpful link: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#converting-to-timestamps
# Then, make the founded_at a couple of days before first_funding_at

#Sincerely No. -Farah :)

We are currently interested in 'founded_at', 'first_funding_at' and 'last_funding_at' columns. Before doing operations on dates, they need to be converted first from 'Object' to 'DateTime'.

In [None]:
# Parse the three date columns of `companies`; values that cannot be parsed become NaT.
# (3 rows of 'first_funding_at' will not be converted successfully and will become "NaN", shown later)
for dateCol in ('founded_at', 'first_funding_at', 'last_funding_at'):
    companies[dateCol] = pd.to_datetime(companies[dateCol], errors = 'coerce')
companies.dtypes

After converting, we need to impute the missing dates on which those companies were founded. We do this by first creating a new column that holds how long it took each company to get its first funding, computed by subtracting the founding date from the date of the first funding. Note that some values are negative, which indicates that those companies received funding before they were officially founded.


In [None]:
companies['time_before_first_fund'] = companies['first_funding_at'] - companies['founded_at']
companies['time_before_first_fund']

Now we need to calculate the mean of the time before the first funding and impute the missing values with this average. Notice that the average is 1370 days which is around 3.7 years which means there must be some outliers that messed up the average.

In [None]:
companies['time_before_first_fund'].mean()

Therefore, we will need to visualize those outliers and maybe if they are causing inaccuracy in the data, we can drop them.

In [None]:
plt.scatter(companies['first_funding_at'], companies['founded_at'], s = 20, alpha = 0.5)
plt.xlabel('first_funding_at')
plt.ylabel('founded_at')

We noticed that there are some outliers that can be removed from the dataset, so we will create a temporary dataframe that contains the outliers so that we can drop them later on. The first outliers that we want to identify are the ones to the left of the center.

In [None]:
from datetime import datetime
tempdf = companies.copy()
tempdf = tempdf[(tempdf.first_funding_at <= datetime.strptime("01/01/1983", "%d/%m/%Y"))& (tempdf.founded_at <= datetime.strptime("01/01/2000", "%d/%m/%Y"))]
dropseries = tempdf.copy()
dropseries

Plotting the outliers on a separate graph:

In [None]:
plt.scatter(tempdf['first_funding_at'], tempdf['founded_at'], s = 20)
plt.xlabel('first_funding_at')
plt.ylabel('founded_at')

Now we will determine the outliers that are the bottom of the center from the original graph.

In [None]:
tempdf = companies.copy()
tempdf = tempdf[(tempdf.first_funding_at < datetime.strptime("01/01/2005", "%d/%m/%Y"))& (tempdf.founded_at <= datetime.strptime("01/01/1900", "%d/%m/%Y"))]
tempdf

Plotting the outliers on a separate graph:

In [None]:
plt.scatter(tempdf['first_funding_at'], tempdf['founded_at'], s = 20)
plt.xlabel('first_funding_at')
plt.ylabel('founded_at')

Now we will concatenate the newly discovered outliers to the ones from before

In [None]:
dropseries = pd.concat([dropseries, tempdf])
dropseries

Finally, we will identify the outliers that are in the top right corner

In [None]:
tempdf = companies.copy()
tempdf = tempdf[(tempdf.first_funding_at > datetime.strptime("01/01/2010", "%d/%m/%Y"))& (tempdf.founded_at >= datetime.strptime("01/01/2030", "%d/%m/%Y"))]
tempdf

Visualizing the outliers:

In [None]:
plt.scatter(tempdf['first_funding_at'], tempdf['founded_at'], s = 20)
plt.xlabel('first_funding_at')
plt.ylabel('founded_at')

Concatenating these outliers with the rest:

In [None]:
dropseries = pd.concat([dropseries, tempdf])
dropseries

Now, we can drop all of the outliers that have been detected.


In [None]:
# Remove every row that was collected in `dropseries`.
# The index is stored under its own name so that the builtin `list` is not
# shadowed for the rest of the kernel session.
# errors='ignore' makes the cell safe to re-run after the rows are already gone.
outlierIdx = dropseries.index
companies = companies.drop(outlierIdx, errors='ignore')
#companies['time_before_first_fund'].sort_values(ascending=False)

Plotting the graph once more after cleaning:

In [None]:
plt.scatter(companies['first_funding_at'], companies['founded_at'], s = 20, alpha=0.5)
plt.xlabel('first_funding_at')
plt.ylabel('founded_at')

Now, we can calculate the mean. Notice that there was not much of a difference, but removing the outliers helped keep the data realistic, because some dates were 2090, 2100, etc., which is unrealistic.

In [None]:
companies['time_before_first_fund'].mean()

Filling the missing values with the new mean:

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained form is deprecated and does not update the
# parent frame when pandas Copy-on-Write is active.
companies['time_before_first_fund'] = companies['time_before_first_fund'].fillna(
    companies['time_before_first_fund'].mean()
)
companies['time_before_first_fund']

After calculating the mean and filling the nulls with it, now we can impute the missing dates in the 'founded_at' column by subtracting the time before first funding from the first funding date.

In [None]:
# Estimated founding date = first funding date minus the (imputed) time before first funding.
# Assigned back explicitly; fillna(inplace=True) on a column selection is a
# chained in-place update that pandas Copy-on-Write does not propagate.
companies['founded_at'] = companies['founded_at'].fillna(
    companies['first_funding_at'] - companies['time_before_first_fund']
)
companies['founded_at']

Converting 'acquired_at' of the acquisitions dataset from Object to date:

In [None]:
acquis['acquired_at'] =pd.to_datetime(acquis['acquired_at'], errors = 'coerce')
acquis.dtypes

Converting 'funded_at' of the investments dataset from Object to date:

In [None]:
invests['funded_at'] =pd.to_datetime(invests['funded_at'], errors = 'coerce')
invests.dtypes

Converting 'funded_at' of the rounds dataset from Object to date:

In [None]:
rounds['funded_at'] =pd.to_datetime(rounds['funded_at'], errors = 'coerce')
rounds.dtypes

## Imputing Categories

Regarding the category list in the five datasets, since we don't have any indicator on deducing the null values in this column, we will fill the nulls with "missing". With samples, it's provided that the same rows (companies) that their category is missing in one dataset, are the same rows in the others so even merging the datasets will not be helpful.


In [None]:
# Mark unknown categories with the sentinel 'missing'.
# The filled columns are assigned back to their frame; calling
# fillna(inplace=True) on a column selection is a chained in-place update
# that is deprecated and ignored under pandas Copy-on-Write.
for frame, catCols in (
    (companies, ['company_category_list']),
    (acquis, ['company_category_list', 'acquirer_category_list']),
    (invests, ['company_category_list']),
    (rounds, ['company_category_list']),
):
    frame[catCols] = frame[catCols].fillna('missing')

## Imputing Raised Amount

There is a lot of missing data in the raised amount column of the dataset. One strategy is to calculate the mean of all the funds a given company has received and impute its nulls with that average. First we calculate the mean raised amount of each company and add it as a new column; then, for each row, if the raised amount is null we fill it with the company's average, otherwise we leave it as it is. Any amounts that are still missing afterwards are filled with the overall mean of the column.

In [None]:
# Per-company mean of the raised amount, aligned row-by-row with `invests`.
# (The column keeps its existing name 'total_raised' although it holds a mean.)
invests['total_raised'] = invests.groupby(['company_name'])['raised_amount_usd'].transform('mean')

# 1) Fill a missing amount with the mean of the same company.
invests['raised_amount_usd'] = invests['raised_amount_usd'].fillna(invests['total_raised'])
# 2) Companies with no known amount at all still have NaN here: fall back to the overall mean.
#    Assigned back explicitly because fillna(inplace=True) on a column selection
#    is a chained in-place update that pandas Copy-on-Write does not propagate.
invests['raised_amount_usd'] = invests['raised_amount_usd'].fillna(invests['raised_amount_usd'].mean())
invests.head()

In [None]:
#farah stop point here

## Cleaning & Imputing Geographical Locations

### Removing `company_state_code` and `company_region`

We don't need `company_state_code` in our analysis, <br>
As country code and city are sufficient to know <br>
an approximation of the geographical location of the company. <br>
Therefore, let's remove it:

In [None]:
companies[['company_country_code', 'company_state_code', 'company_city']].head(10)

In [None]:
companies.drop('company_state_code', axis=1, inplace=True)

`company_region` is also redundant, as it is usually the same as `company_city`.<br>
Let's validate this:

In [None]:
# Note: same logic applies for the permalinks
acquis.dropna(subset=['acquirer_permalink', 'acquirer_name'], inplace=True)
invests.dropna(subset=['investor_permalink', 'investor_name'], inplace=True)

Also, we'll remove the missing values for columns that have less than 100 `NaN`, <br>
as they're small in comparison to their respective datasets:

In [None]:
companies[companies.company_region == companies.company_city][['company_region', 'company_city']]

Let's see the mean of similarity between the strings using builtin library `difflib.SequenceMatcher`:

In [None]:
from difflib import SequenceMatcher
similarities = []
rgs = companies.company_region.values.tolist()
cts = companies.company_city.values.tolist()
for i in range(min(len(rgs), len(cts))):
    if not (rgs[i] != rgs[i] or cts[i] != cts[i]): # If either of the elements is NaN, then don't find the similarity
        similarities.append(SequenceMatcher(None, rgs[i], cts[i]).ratio()) # Gets the similarity between each two strings
avgSim = sum(similarities) / len(similarities)
avgSim

However, `SequenceMatcher` only finds the longest contiguous matching subsequence, <br>
and all other characters will drastically decrease the similarity, which isn't very accurate, <br>
Therefore, let's try to see how many times a string is a substring of another:

In [None]:
subsCount = 0
for i in range(min(len(rgs), len(cts))):
    if not (rgs[i] != rgs[i] or cts[i] != cts[i]):
        subsCount += (rgs[i] in cts[i] or cts[i] in rgs[i])
subsCount

So to summarize: almost half of the companies have the same data for `region` and `city` (25360) <br>
There are around 5000 companies that have the same semantic meaning between the data <br>
(e.g: 'New York City' is the same as 'NYC, New York City') <br>
Therefore, `company_region` can be removed without losing any possible future insights:

In [None]:
companies.drop('company_region', axis=1, inplace=True)

acquis.drop('company_region', axis=1, inplace=True)
acquis.drop('acquirer_region', axis=1, inplace=True)

invests.drop('company_region', axis=1, inplace=True)
invests.drop('investor_region', axis=1, inplace=True)

rounds.drop('company_region', axis=1, inplace=True)

### Imputing `company_country_code` and `company_city`

#### Failed attempt using `company_permalink`:
1. check the company's permalink
2. find HTML that contains the headquarters location (country and city)
3. scrape country and convert it to country code using `pycountry.countries`
4. scrape city and impute it in `company_city`

Step 1: appending "https://www.crunchbase.com" to each permalink <br>
to be able to access the company's webpage on crunchbase. <br>
However, since the new permalink will be much longer (thus slower for accessing as a dataset's primary index) <br>
we'll create a function that appends the string as a prefix and that will be accessed throughout the notebook

Step 2: Finding location of country and city on the webpage:
let's check company [004](https://www.crunchbase.com/organization/004)'s webpage for example: <Br>
<img src="Phase 1/countryAndCityInCrunchbase.png" width=400 height=300 />

By inspecting the HTML, we notice 4 instances of the class <br>
`component--field-formatter field-type-identifier-multi` (in a `span` element),<br>and the data that we want
is always the first instance: <br>
<img src="Phase 1/countryAndCityLocationInHTML.png" width=700 height=500 />

Step 3: scrape that info using `requests` and `bs4` libraries: <br>
(Note, we're using `threading.Thread` here to increase performance by scraping in multithreads)

However, this is a deadend, as crunchbase is powered by [distill network](https://www.google.com/search?q=distil+network&sxsrf=APq-WBt2mubTHz1xGfmKgpzlWtHah5qAaA%3A1648578960023&ei=kFFDYsx646qVB_nKh7gK&ved=0ahUKEwjMgq-a--v2AhVjVeUKHXnlAacQ4dUDCA4&uact=5&oq=distil+network&gs_lcp=Cgdnd3Mtd2l6EAMyBggAEAcQHjIGCAAQBxAeMgYIABAHEB4yBggAEAcQHjIGCAAQBxAeMgYIABAHEB4yBggAEAcQHjIGCAAQBxAeMgYIABAHEB4yBggAEAcQHjoHCCMQsAMQJzoHCAAQRxCwA0oECEEYAEoECEYYAFCyCliyCmCLD2gBcAF4AIABowGIAaMBkgEDMC4xmAEAoAEByAEKwAEB&sclient=gws-wiz#:~:text=distil%20networks%20uses%20machine%20learning%20to%20identify%20and%20mitigate%20potential%20bad%20bots%2C%20fingerprinting%20them%20so%20that%20they%20can%20still%20be%20tracked%20if%20they%20reconnect%20from%20a%20different%20ip%20address) <br>
So when we tried to scrape it, the following HTML was always displayed: <br><br>
<img src="Phase 1/crunchbaseAccessDenied.png">

#### Failed attempt using [linkedin](https://www.linkedin.com/in/ashrafharess/)

It failed because after a while, linkedin detects that you are a bot: <br><br>
<img src='Phase 1/linkedinSecurityCheck.png' width=500 height=300>

However, the steps are displayed below to show how this was initially done using multithreads:

Step 1: Login into linkedin using `selenium.webdriver`:

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import os
import time
import warnings
from getpass import getpass

warnings.filterwarnings('ignore') # to suppress warnings about internal code deprecations
options=webdriver.ChromeOptions()
options.add_argument('--incognito')
options.add_argument('--headless')
driver=webdriver.Chrome(options=options)
driver.get('https://www.linkedin.com/uas/login')

# SECURITY: credentials must never be written into the notebook. They are read
# from the environment (LINKEDIN_USERNAME / LINKEDIN_PASSWORD) and, when those
# are not set, asked for interactively. Any password that was previously
# committed in this cell should be considered leaked and be changed.
linkedinUser = os.environ.get('LINKEDIN_USERNAME') or input('LinkedIn e-mail: ')
linkedinPass = os.environ.get('LINKEDIN_PASSWORD') or getpass('LinkedIn password: ')

# find_element(By.<KIND>, value) replaces the find_element_by_* helpers,
# which were removed in Selenium 4.
username = driver.find_element(By.ID, 'username')
username.send_keys(linkedinUser)
password = driver.find_element(By.ID, 'password')
password.send_keys(linkedinPass)
log_in_button = driver.find_element(By.CLASS_NAME, 'from__button--floating')
log_in_button.click()

Step 2: Visualize the html of the data you need to scrape: <br><br>
<img src='Phase 1/linkedinAboutLocations.png' width=800 height=500>

In [None]:
driver.get('https://www.linkedin.com/company/absolvent/about/')
time.sleep(3) # sleeping to render javascript code before parsing to BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [None]:
locCard = soup.find('div', 'org-location-card pv2')
locCard.text

In [None]:
import re
re.findall(r'Primary[\n\s]+(\w+)', locCard.text)

Step 3: Visualize another part of the html in case step 2 doesn't work: <br><br>
<img src='Phase 1/linkedinAboutHeadquarters.png' width=700 height=500>

In [None]:
driver.get('https://www.linkedin.com/company/association-for-computing-machinery/about/')
time.sleep(3) # sleeping to render javascript code before parsing to BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [None]:
dl = soup.find('div', 'mb6').find('dl')
dl

In [None]:
dl = soup.find('div', 'mb6').find('dl')
nms = []
for tag in dl:
    txt = tag.text
    nms.append(txt)
nms

In [None]:
# getting the values in key-value pairs, then we will extract the country from key "Headquarters"
dl = soup.find('div', 'mb6').find('dl')
keys = []
values = []
for tag in dl.find_all('dt'):
    txt = tag.text.strip()
    if (txt != ''):
        keys.append(txt)
for tag in dl.find_all('dd'):
    txt = tag.text.strip()
    if (txt != ''):
        values.append(txt)


In [None]:
keys, values

In [None]:
values = [txt for txt in values if "Includes" not in txt] # a message which contains the word "Includes" sometime appears and is not needed, thus we don't include it
values

In [None]:
# putting the key-value pairs in a dictionary
linkedinDict = dict(zip(keys, values))
linkedinDict

In [None]:
linkedinDict["Headquarters"]

Step 4: getting the country code of the companies and their corresponding indices and putting them in a list of tuples:

In [None]:
import threading # using multithreads to speed things up a little
lock = threading.Lock()
import pycountry
from geopy.geocoders import Nominatim # using this library to convert cities to countries
import time
import re

geolocator = Nominatim(user_agent="ashraf196280@bue.edu.eg")

idxToNan = []
def fetchCountry(idx, link):
    """
    Scrape one LinkedIn "about" page and record a country code for it.

    Loads `link` with the shared selenium `driver`, reads the city from the
    "Headquarters" entry of the page's definition list (or, if that entry is
    absent, from the "Primary" location card), geocodes the city with
    `geolocator` and appends the tuple (idx, code) to the global list
    `idxToNan`.

    `code` is the alpha-3 code pycountry knows for the geocoded country,
    the first three letters of the country name in upper case if pycountry
    has no match, or "missing" if any step of the lookup fails.

    Parameters
    idx  : index label of the row this lookup belongs to
    link : URL of the company's LinkedIn "about" page

    Returns None; the result is only delivered through `idxToNan`.
    """
    try:
        # The driver is shared by all threads, so page load + read must not interleave.
        # `with lock` releases the lock even when driver.get() raises; a manual
        # acquire()/release() pair would leave it held and deadlock the handler below.
        with lock:
            driver.get(link) # same driver used from step 1
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, "html.parser")
        dl = soup.find('div', 'mb6').find('dl')
        keys = []
        values = []
        for tag in dl.find_all('dt'):
            txt = tag.text.strip()
            if (txt != ''):
                keys.append(txt)
        for tag in dl.find_all('dd'):
            txt = tag.text.strip()
            if (txt != ''):
                values.append(txt)
        # a message containing "Includes" sometimes appears among the values and would shift the pairing
        values = [txt for txt in values if "Includes" not in txt]
        linkedinDict = dict(zip(keys, values))
        if ('Headquarters' in linkedinDict):
            city = linkedinDict["Headquarters"]
            city = city.split(',')[0] # eg: "Cupertino, California" will be ["Cupertino", "California"], so "Cupertino" will be returned
        else:
            locCard = soup.find('div', 'org-location-card pv2')
            city = re.findall(r'Primary[\n\s]+(\w+)', locCard.text)[0]
        with lock:
            location = geolocator.geocode(city, language="en")
        if location is None:
            # explicit exception instead of a bare `raise` with no active exception
            raise LookupError("geocoder returned no result for " + repr(city))
    except Exception:
        # Any failure (page not found, unexpected layout, geocoder error) means
        # the company could not be resolved, so declare it as missing.
        with lock:
            idxToNan.append((idx, "missing"))
        return

    country = location.address.split(', ')[-1]
    try:
        alpha3Code = pycountry.countries.get(name=country).alpha_3
    except Exception:
        alpha3Code = country[0:3].upper() #if not found in pycountry, assume that it is the first 3 letters of the country name
    with lock:
        idxToNan.append((idx, alpha3Code))


In [None]:
from threading import Thread
def imputeFromLinks(df, colToSuffixIntoURL, colToImpute, start, end, linkPrefix = ""):
    """
    Run fetchCountry in one thread per row whose "colToImpute" value is null.

    Parameters
    df                 : dataframe to read from (it is not modified here)
    colToSuffixIntoURL : column whose value is appended to "linkPrefix" to form the URL
    colToImpute        : column whose null rows are selected
    start, end         : positional slice [start:end] applied to the selected rows
    linkPrefix         : text put in front of every value to build the URL

    Every URL is linkPrefix + value (spaces removed) + '/about'.
    Returns None after all threads have finished.
    """
    # .loc with a single column label always yields a Series, also when exactly
    # one row matches (DataFrame.squeeze() would collapse that case to a scalar).
    links = df.loc[df[colToImpute].isnull(), colToSuffixIntoURL].iloc[start:end]
    threads = []
    for idx, name in links.items(): # Series.items() replaces iteritems(), removed in pandas 2.0
        if not isinstance(name, str):
            continue # a missing (NaN) value cannot be turned into a URL
        th = Thread(target=fetchCountry, args=(idx, linkPrefix + name.replace(" ", "") + '/about')) #making sure company name doesn't have spaces when suffixed into the url
        threads.append(th)
        th.start()
    for th in threads:
        th.join() # to wait until all multithreads finish to properly display the dataframe

In [None]:
# single threaded version (just in case multithreads gets blocked by linkedin)
from threading import Thread
def imputeFromLinksSingleThreads(df, colToSuffixIntoURL, colToImpute, linkPrefix = ""):
    """
    Sequential variant of the lookup: call fetchCountry once, one row after
    the other, for every row whose "colToImpute" value is null.

    Every URL is linkPrefix + value of "colToSuffixIntoURL" (spaces removed) + '/about'.
    Returns None.
    """
    # .loc with a single column label always yields a Series, also when exactly
    # one row matches (DataFrame.squeeze() would collapse that case to a scalar).
    links = df.loc[df[colToImpute].isnull(), colToSuffixIntoURL]
    for idx, name in links.items(): # Series.items() replaces iteritems(), removed in pandas 2.0
        if not isinstance(name, str):
            continue # a missing (NaN) value cannot be turned into a URL
        fetchCountry(idx, linkPrefix + name.replace(" ", "") + '/about') #making sure company name doesn't have spaces when suffixed into the url
        

In [None]:
links = companies[companies['company_country_code'].isnull()][['company_name']].squeeze()
len(links)

In [None]:
#imputeFromLinks(companies, "company_name", "company_country_code", 0, 10, "https://www.linkedin.com/company/")

In [None]:
#idxToNan

In [None]:
# this was to save the output to a csv file in case the pc shut off after the above cell executed
#dfLinkedin = pd.DataFrame(idxToNan, columns=['index', 'company_country_code'])
#dfLinkedin.set_index('index')
#dfLinkedin.to_csv('Phase 1/countryCodesLinkedIn.csv', index=False)

#### Successful attempt using TLDs of `homepage_url`

Let's check the top level domain (TLD) of each link using regex and see if we can use that <br>
to impute the country code:

In [None]:
# As you can see, there are only approximately 1500 rows that contain neither country_code nor homepage_url
hUrls = companies[companies['company_country_code'].isnull()][['homepage_url']].squeeze()
hUrls.dropna(inplace=True)
len(hUrls), len(companies[companies['company_country_code'].isnull()])

In [None]:
hUrls.head(3)

In [None]:
hUrls.str.findall(r'(\.[^.]*)$').head(3) # use '\.([^.]*)$' if you don't want the '.', but we do, so leave it

In [None]:
hUrls = hUrls.str.findall(r'(\.[^.]*)$').apply(lambda x : ''.join(x).split('/')[0]) # apply() will convert the list of strings to a string and remove '/' at the end of the string
hUrls

In [None]:

unwantedVals = hUrls.str.contains(pat='^$|com') # "^$" means empty string, "|" means "or"
hUrls = hUrls[~unwantedVals] # "~" is equivalent to "unwantedVals == False"
hUrls

Create a dictionary where key = TLD, value = country <br>
e.g --> '.jp' : 'Japan'

In [None]:
#csv obtained from https://gist.github.com/derlin/421d2bb55018a1538271227ff6b1299d#file-country-codes-tlds-csv
tldsToCountries = pd.read_csv('Phase 1/country-codes-tlds.csv')
tldsToCountries.drop_duplicates(subset='tld', keep='first', inplace=True)
tldsToCountries['tld'] = tldsToCountries['tld'].apply(lambda x: x.replace(' ', ''))
tldsToCountries.to_csv(r'Phase 1/newCountryCodesTlds.csv', index = None, header=True)
tldsToCountries = tldsToCountries.set_index('tld').squeeze()
tldsToCountries = tldsToCountries.to_dict()
tldsToCountries['.jp']

Use that dictionary to convert TLDs to their respective countries:

In [None]:
hCompanyLocs = hUrls.copy()
hCompanyLocs = hCompanyLocs.apply(lambda x : tldsToCountries[x] if (x in tldsToCountries) else 'none')
hCompanyLocs = hCompanyLocs[hCompanyLocs != 'none']
hCompanyLocs

In [None]:
# 1213 countries could be imputed
len(hCompanyLocs)

Convert those countries to country codes:

In [None]:
import pycountry
def toAlphaCode3(country):
    """
    Look up a country name in pycountry and return its alpha-3 code.

    Returns the string "none" when pycountry reports no country with that name.
    """
    match = pycountry.countries.get(name=country)
    return "none" if match is None else match.alpha_3

hCompanyLocs = hCompanyLocs.apply(lambda x : toAlphaCode3(x))
hCompanyLocs = hCompanyLocs[hCompanyLocs != 'none']
hCompanyLocs

In [None]:
# only 900 countries have country codes present, so impute those only
# the following is just to illustrate the for loop in the next cell:
hCompanyLocs.index[0], hCompanyLocs.iloc[0]

Impute into the `company_country_code` column of the `companies` dataset:

In [None]:
# Write each recovered country code into the row of `companies` with the same index label.
for idx, code in zip(hCompanyLocs.index, hCompanyLocs.values):
    companies.at[idx, 'company_country_code'] = code

#### Imputing rest of cities and country codes

After the failed attempt to scrape the country and then get the country code of each company we will impute the values with "missing" for now.

In [None]:
# Fill every remaining location gap with the sentinel "missing".
# The filled columns are assigned back to their frame. The earlier
# per-column calls relied on fillna(inplace=True) on a column selection
# (ignored under pandas Copy-on-Write), and the call for
# acquis['company_city'] had neither inplace=True nor an assignment, so that
# column was never filled.
for frame, locCols in (
    (companies, ['company_country_code', 'company_city']),
    (acquis, ['company_country_code', 'company_state_code', 'company_city',
              'acquirer_country_code', 'acquirer_state_code', 'acquirer_city']),
    (invests, ['company_country_code', 'company_state_code', 'company_city',
               'investor_city', 'investor_country_code', 'investor_state_code']),
    (rounds, ['company_country_code']),
):
    frame[locCols] = frame[locCols].fillna("missing")


## Removing duplicates in `acquisitions` dataset

In [None]:
# To show that there really are duplicates in acquisitions dataset
acquis[acquis.duplicated(keep = False) == True]

In [None]:
# drop_duplicates() returns a new frame, so calling it without using the
# result removes nothing. inplace=True is used (on the frame itself, not on a
# selection) so that other references to this same object, such as the entry
# in `dfAll`, see the de-duplicated data as well.
acquis.drop_duplicates(inplace=True)
acquis

## Checking and Converting `dtypes`

In [None]:
def datasetsDtypes(dfList, datasetNames, dtypesCol="dtypes_", pad=""):
    """
    Builds a side-by-side overview of the column names and dtypes of several
    dataframes.

    For the i-th dataframe the result has two columns: one named
    datasetNames[i] holding its column names, and one named
    dtypesCol + str(i+1) holding the matching dtypes. Shorter column-name
    lists are padded with "pad" and shorter dtype lists with None so that
    every column of the result has the same length.
    """
    columnLists = [frame.columns.tolist() for frame in dfList]
    dtypeLists = [frame.dtypes.tolist() for frame in dfList]
    # Length every list must be padded out to (0 when no frames are given).
    longest = max((len(names) for names in columnLists), default=0)

    table = {}
    for position, (names, kinds) in enumerate(zip(columnLists, dtypeLists)):
        shortfall = longest - len(names)
        table[datasetNames[position]] = names + [pad] * shortfall
        table[dtypesCol + str(position + 1)] = kinds + [None] * shortfall

    return pd.DataFrame(table)

In [None]:
datasetsDtypes(dfAll, lstCsvNames)

From the above dataframe, convert `funding_total_usd` <br>
in the `companies` dataset to float, then fill the nulls in it and in <br>
`raised_amount_usd` of the `rounds` dataset with zeros.

In [None]:
# Coerce the column to numbers (entries that cannot be parsed become NaN),
# treat missing funding as 0 and store the result as float. The whole chain is
# assigned back in one step instead of filling a column selection with
# inplace=True, which pandas does not guarantee to update the parent frame.
companies['funding_total_usd'] = (
    pd.to_numeric(companies['funding_total_usd'], errors='coerce')
    .fillna(0)
    .astype(float)
)

In [None]:
# Missing raised amounts count as 0. Assign the filled column back rather than
# calling fillna(inplace=True) on a column selection, which is not guaranteed
# to update `rounds` itself.
rounds['raised_amount_usd'] = rounds['raised_amount_usd'].fillna(0)

In [None]:
# Rebuild the list of frames and their display names before re-inspecting the
# dtypes. The list stores references, so a frame name that has been re-bound to
# a new object since the list was first built would otherwise be stale here.
# Note that `additions` is no longer part of the list.
dfAll = [acquis, companies, invests, rounds]
lstCsvNames = ["acquisitions", "companies", "investments", "rounds"]
datasetsDtypes(dfAll, lstCsvNames)

## Checking Nulls

Now that we imputed all missing data, we will check and see that there are no more null values in any data set. Notice that the other nulls are in columns that are dropped already in the dataframes.

In [None]:
# Rebuild the frame list and its display names so the list holds the current
# frame objects, then summarise them with datasetsNulls (defined in an earlier
# cell; presumably a per-column null overview).
dfAll = [acquis, companies, invests, rounds]
lstCsvNames = ["acquisitions", "companies", "investments", "rounds"]
dfAllNulls = datasetsNulls(dfAll, lstCsvNames)
dfAllNulls

In [None]:
# "founded_at" and "first_funding_at" picked up new NaNs because the conversion
# to datetime was not successful in 3 rows; remove every row that is missing
# either of the two values.
companies.dropna(subset=['founded_at', 'first_funding_at'], inplace=True)

In [None]:
# Renumber the rows 0..n-1 after the drops above. drop=True discards the old
# index directly instead of first inserting it as an 'index' column and then
# dropping that column again.
companies.reset_index(drop=True, inplace=True)

In [None]:
# Remove the companies that have no homepage URL, then show how many nulls
# remain in each column.
companies.dropna(axis=0, subset=['homepage_url'], inplace=True)
companies.isna().sum()

# Saving Cleaned Datasets

In [None]:
#prefix = 'datasets/startupsCleaned/'
#suffix = 'Cleaned.csv'
#acquis.to_csv(prefix+'acquisitions'+suffix, index=False)
#companies.to_csv(prefix+'companies'+suffix, index=False)
#invests.to_csv(prefix+'investments'+suffix, index=False)
#rounds.to_csv(prefix+'rounds'+suffix, index=False)

# Answering Questions From The Given Datasets

## Question 1 (Ashraf & Farah)

-	Can the factors that affect a start-up’s growth be determined?
    -	For this: We are initially interested in <br> `funding_total_usd`, `status`, and `funding_rounds`


This question can be approached in two ways:
- Divide the companies into categories based on their status (closed, operating, or acquired), look at the average funding each category receives, and then determine whether the failure of the closed companies correlates with low funding.
- The second approach is to find how frequently a company receives funding, regardless of how big or small each funding is. Funding frequency matters because it keeps money flowing into the company consistently, so it is a good indicator.

First, we will join the two datasets to answer our question which are companies and rounds


In [None]:
# Order both frames by company name, then attach the funding rounds to the
# company rows by matching on company_name. Columns that exist in both frames
# are told apart by the "_left" (companies) / "_right" (rounds) suffixes.
companies.sort_values('company_name', inplace=True)
rounds.sort_values('company_name', inplace=True)
comp_rounds = companies.join(
    rounds.set_index('company_name'),
    on='company_name',
    lsuffix='_left',
    rsuffix='_right',
)
comp_rounds.columns

We will slice our new joined dataset into three categories based on the company's status:

In [None]:
# Operating companies only, narrowed to the columns used in the funding
# comparison.
operatingcomp_rounds = comp_rounds.loc[
    comp_rounds['status'].eq('operating'),
    ['company_name', 'funding_rounds', 'raised_amount_usd', 'funded_at'],
]
operatingcomp_rounds

In [None]:
# Acquired companies only, narrowed to the columns used in the funding
# comparison.
acquiredcomp_rounds = comp_rounds.loc[
    comp_rounds['status'].eq('acquired'),
    ['company_name', 'funding_rounds', 'raised_amount_usd', 'funded_at'],
]
acquiredcomp_rounds

In [None]:
# Closed companies only, narrowed to the columns used in the funding
# comparison.
closedcomp_rounds = comp_rounds.loc[
    comp_rounds['status'].eq('closed'),
    ['company_name', 'funding_rounds', 'raised_amount_usd', 'funded_at'],
]
closedcomp_rounds

Now we group the operating, acquired, and closed companies by name and compute the total funding each company raised across its rounds. We will see that some of the closed companies did not receive any funding, which is likely one of the main reasons for their failure.

In [None]:
# Total amount raised per operating company, as a one-column frame.
operatingtotal = (
    operatingcomp_rounds
    .groupby('company_name')['raised_amount_usd']
    .sum()
    .to_frame()
)
operatingtotal

In [None]:
# Total amount raised per acquired company, as a one-column frame.
acquiredtotal = (
    acquiredcomp_rounds
    .groupby('company_name')['raised_amount_usd']
    .sum()
    .to_frame()
)
acquiredtotal

In [None]:
# Total amount raised per closed company, as a one-column frame.
closedtotal = (
    closedcomp_rounds
    .groupby('company_name')['raised_amount_usd']
    .sum()
    .to_frame()
)
closedtotal

Comparing the two, the amounts raised by operating companies seem to be higher on average than those raised by closed companies. Now let's calculate the mean raised amount for each status to see how much, on average, a company needs in order to keep operating.

In [None]:
# Mean of the per-company funding totals for each status group. The label
# 'acquired' was previously misspelled ('aquired') and showed up that way on
# the bar chart built from this frame.
d = {
    'status': ['operating', 'acquired', 'closed'],
    'mean': [
        operatingtotal['raised_amount_usd'].mean(),
        acquiredtotal['raised_amount_usd'].mean(),
        closedtotal['raised_amount_usd'].mean(),
    ],
}
statuses = pd.DataFrame(d)
statuses

We can see in the bar chart below that the closed companies received the lowest amount of funding, which supports our initial hypothesis. The operating companies received more funding than the closed companies, while the acquired companies received much more. This makes sense, since the acquired companies mostly kept receiving funding (including debt) until they got acquired, so they received much more than either the closed or the operating companies.

In [None]:
# Bar chart of the mean amount raised per company for each status, with a
# title and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.bar(statuses['status'], statuses['mean'])
ax.set_xlabel('company status')
ax.set_ylabel('mean total raised (USD)')
ax.set_title('Mean funding raised per company, by status');

In Conclusion, not getting enough funding can critically affect the company's chance of success. This indicates that the companies must focus more on finding fundings and investments into their startups.

Regarding the second approach, we will now look at the closed companies that received more than one funding round:

Now we will look at the time that passed before the first funding for the closed companies. We will ignore negative signs and use absolute values: a difference can be negative when one date was recorded before the other, and it can be 0 when more than one funding was received on the same day. To solve the negatives problem we will apply the absolute-value function.

In [None]:
# Companies indexed by name, with the founding-to-first-funding gap turned into
# its absolute value and the largest gaps listed first.
comp_rounds = (
    companies
    .set_index('company_name')
    .assign(time_before_first_fund=lambda frame: frame['time_before_first_fund'].abs())
    .sort_values('time_before_first_fund', ascending=False)
)


In [None]:
# Closed companies, still ordered by the funding gap computed above.
closedcomp_rounds = comp_rounds.loc[comp_rounds['status'].eq('closed')]
closedcomp_rounds

As seen in the dataframe above and in the one below, the biggest gap between a company's founding and its funding was 38653 days, which is a significant delay in getting funded.

In [None]:
# Operating companies, still ordered by the funding gap computed above.
operatingcomp_rounds = comp_rounds.loc[comp_rounds['status'].eq('operating')]
operatingcomp_rounds

Comparing the gap between funding and establishment for the operating companies and the closed ones, the operating companies appear to have had the bigger gaps. So it seems that the time between the foundation and the first funding isn't an effective factor.

In conclusion, the time taken to get funding can be considered a non-critical factor in the success of a company. It means that once the company receives funding, no matter when, as long as it is sufficient (based on the last observation), it will keep the company operating.

## Question 2 (Ashraf & Farah)

Regarding the Second Question:  <br>
- 	Which regions (countries) are most probable to have the most failed startups?
    -	For this: We are initially interested in <br> `country_code`, `status`, and `investor_name`


First we need to join the companies dataset with the investments dataset

In [None]:
# Order both frames by company name, then attach the investment records to the
# company rows by matching on company_name. Columns that exist in both frames
# are told apart by the "_left" (companies) / "_right" (investments) suffixes.
companies.sort_values('company_name', inplace=True)
invests.sort_values('company_name', inplace=True)
comp_invests = companies.join(
    invests.set_index('company_name'),
    on='company_name',
    lsuffix='_left',
    rsuffix='_right',
)
comp_invests

In [None]:
comp_invests.T

Now, we will categorize the companies according to their status (operating, closed, acquired)

In [None]:
# Rows of the companies/investments join that belong to closed companies.
closedcomp_invests = comp_invests.loc[comp_invests['status'].eq('closed')]
closedcomp_invests

We will group by the country to see the number of companies (of each status) in each country to gain insight if there's a massive difference between the number of operating companies and closed ones in a single country.

In [None]:
# Number of closed-company rows per country, without the "missing" placeholder
# country, largest first.
# NOTE(review): the joined frame presumably has one row per investment record,
# so this counts rows rather than distinct companies — confirm that is intended.
closedcomptotal = (
    closedcomp_invests
    .groupby('company_country_code_left')['company_name']
    .count()
    .to_frame()
    .drop('missing')
    .sort_values('company_name', ascending=False)
)
closedcomptotal

Here's a heatmap that illustrates the previous dataframe. It shows clearly that the US has the most closed companies while other countries have fewer. We will take only the first 10 countries and treat the rest as outliers (as their values are 1), because they would be hard to visualize and their values won't affect the analysis much.

In [None]:
# Heatmap of the 10 countries with the highest closed-company counts.
closedcomptotalsample = closedcomptotal.iloc[:10]
sns.heatmap(data=closedcomptotalsample, vmin=100, vmax=7000);

Now we categorize the operating companies

In [None]:
# Rows of the companies/investments join that belong to operating companies.
operatingcomp_invests = comp_invests.loc[comp_invests['status'].eq('operating')]
operatingcomp_invests

We will group by the country and see how many operating companies in each country.

In [None]:
# Number of operating-company rows per country, without the "missing"
# placeholder country, largest first.
# NOTE(review): the joined frame presumably has one row per investment record,
# so this counts rows rather than distinct companies — confirm that is intended.
operatingtotal = (
    operatingcomp_invests
    .groupby('company_country_code_left')['company_name']
    .count()
    .to_frame()
    .drop('missing')
    .sort_values('company_name', ascending=False)
)
operatingtotal

The heatmap yet again shows that the US has the most operating companies, which is noticeable because it also had the most closed companies. We will take only the first 10 countries and treat the rest as outliers (as their values are 1), because they would be hard to visualize and their values won't affect the analysis much.

In [None]:
# Heatmap of the 10 countries with the highest operating-company counts.
operatingtotalsample = operatingtotal.iloc[:10]
sns.heatmap(data=operatingtotalsample, vmin=1000, vmax=10000)

After looking at the closed and operating companies, we still can't tell whether a country has more operating companies than closed ones, so let's take one country and compare. For example, let's look at the USA, since it has the highest number of both operating and closed companies.

In [None]:
# Rows for US companies, then the number of rows per company status.
df = comp_invests[comp_invests['company_country_code_left'].eq('USA')]
df = df.groupby('status')['status'].count().to_frame()
df

The dataframe above shows that although the US has a strong economy and a massive number of investors, as seen earlier, it has a large number of closed companies; compared to the operating ones, however, that is only about 0.086 (7956/92228). So this suggests that the US must have facilities and enough funding for startups.

If we quickly look at another example, the country after the US, which is the United Kingdom (GBR), it had 7846 operating companies and 567 closed companies. Of course, since the US is much bigger, its numbers are bigger, but looking at the ratio of closed to operating companies we find it to be 0.07. This is close to the ratio of the US, so we can consider that both countries have similar facilities, and both show that a lack of investors can affect companies.

In conclusion, as the ratio of closed to operating companies increases, the more difficult it is for a company to start up. This will lead us to look closely into the factors that make those countries a difficult place to start a company.

## Question 3 (Farah)

### How the number of investors in a country can affect the number of successful companies in this country?

First, let's narrow down to the investors that invested in the companies in the same country because there's a posibility that an investor invested in a company in a different country.

In [None]:
# Investment rows where the company and the investor share a country code.
comp_investors = invests.loc[
    invests['company_country_code'].eq(invests['investor_country_code'])
]
comp_investors

Now, let's merge the new dataset with companies so we can get the status.

In [None]:
# Attach the company attributes (including status) to each same-country
# investment row by matching on company_name; columns present in both frames
# get the "_left" (investments) / "_right" (companies) suffix.
comp_investors = comp_investors.join(
    companies.set_index('company_name'),
    on='company_name',
    lsuffix='_left',
    rsuffix='_right',
)
comp_investors

Then we will categorize our data according to its status.

In [None]:
# Same-country investment rows whose company is still operating.
operatingcomp_invests = comp_investors.loc[comp_investors['status'].eq('operating')]
operatingcomp_invests

In [None]:
# Same-country investment rows whose company has closed.
closedcomp_invests = comp_investors.loc[comp_investors['status'].eq('closed')]
closedcomp_invests

Next, we will group by the investor's country to see how many investors in each country invested in the operating companies.

In [None]:
# Number of investor entries per investor country for operating companies,
# without the "missing" placeholder country, largest first.
operatingcomp_investstotal = (
    operatingcomp_invests
    .groupby('investor_country_code')['investor_name']
    .count()
    .to_frame()
    .drop('missing')
    .sort_values('investor_name', ascending=False)
)
operatingcomp_investstotal

Let's visualize our findings through a sample of the dataframe

In [None]:
# Keep only the 10 countries with the most investor entries and draw them.
operatingcomp_investstotal = operatingcomp_investstotal.iloc[:10]
sns.heatmap(data=operatingcomp_investstotal, vmin=100, vmax=10000)

We will do the same for the closed companies

In [None]:
# Number of investor entries per investor country for closed companies,
# without the "missing" placeholder country, largest first.
closedcomp_investstotal = (
    closedcomp_invests
    .groupby('investor_country_code')['investor_name']
    .count()
    .to_frame()
    .drop('missing')
    .sort_values('investor_name', ascending=False)
)
closedcomp_investstotal

In [None]:
# Keep only the 10 countries with the most investor entries and draw them.
closedcomp_investstotal = closedcomp_investstotal.iloc[:10]
sns.heatmap(data=closedcomp_investstotal, vmin=10, vmax=1000)

We can see from the data obtained that the number of investors in the closed companies is, in general, significantly lower than the number who invested in the operating ones. This can be seen by taking the ratio of the investors in closed companies to the investors in operating companies.

In conclusion, investing plays an important part in keeping the company operating. As the number of investors increases, the chance of success increases.

## Rest of Questions (Aisha)

Heads-up:
1) Here I didn't use the cleaned version of the datasets, as we can infer some insights from the null values.
2) Some of the data would have been better represented in charts, but due to their data types that wasn't really possible; some of the others, with favorable data types, had labels overlapping one another, making the charts impossible to read.

### Question 4 : Analyzing Funding Rounds

We'll check how often each number of funding rounds occurs overall, irrespective of the difference in company status, and then we will look at the company statuses.

#### Companies data set

In [None]:
# Reload the raw (uncleaned) companies CSV for this section. pandas is already
# imported as `pd` in the notebook's first cell, so it is not imported again.
df=pd.read_csv('datasets/startups/companies.csv')
df.head()


In [None]:
# Discard the two location columns that are not needed here and label the
# missing first-funding dates explicitly.
df.drop(columns=['state_code', 'region'], inplace=True)
df['first_funding_at'] = df['first_funding_at'].fillna("not available")
df.head()

In [None]:
# Histogram of the company statuses. matplotlib.pyplot is already imported as
# `plt` in the notebook's first cell, so it is not imported again; axis labels
# are added so the figure can be read on its own.
df.status.hist()
plt.title('status difference')
plt.xlabel('company status')
plt.ylabel('number of companies');

As the histogram above shows, we have more companies operating as compared to acquired ones which would imply that they received far more funding and produced more than those which were acquired.

#### IPO companies?

It's a situation in which one starts a business by soliciting capital from family, friends, and investors in exchange for a portion of the company. We go to an investor bank and inform them about the idea or product to raise money for this company. They will then introduce us to institutional investors / people who are interested in this idea or product, and they will sell some of the business's shares to assist in bringing the company to the public market. This helps raise a lot of money, sell shares at a reasonable price that suits the shareholder, raise more money through additional rounds of investment, and promote the brand because IPOs deliver a good credibility boost.
The disadvantages include:
When it comes to matters like openness, the SEC (Securities and Exchange Commission is the US federal agency in charge of regulating and managing financial markets) and exchanges have stringent criteria. As a result, public firms are required to reveal a great deal of business-related information, which may result in competitors obtaining access to material that is the company's trade secret.
Less control as shareholders will now affect decision making.


We'll now separate companies in to 3 status categories; operating, acquired, and ipo. Then we'll compare the funding rounds based on these statuses.

In [None]:
# One frame per status of interest.
operating = df[df['status'].eq('operating')]
acquired = df[df['status'].eq('acquired')]
ipo = df[df['status'].eq('ipo')]

#### 1-Operating companies

In [None]:

# Bar chart of how many operating companies had each number of funding rounds.
# `plt` is already imported in the notebook's first cell; the title names the
# group so this chart can be told apart from the acquired and IPO ones.
operating.funding_rounds.value_counts().plot(kind="bar",alpha=0.5)
plt.title('Funding rounds for operating companies')
plt.xlabel('funding')
plt.ylabel('funding frequency')
plt.show()

The bar chart above can be matched to the values below, from which we can deduce that more operating companies had a single funding round than had two.

In [None]:
operating['funding_rounds'].value_counts()

Now we will see how this differs from acquired companies.

#### 2-Acquired companies 

In [None]:
#acquired companies
# Bar chart of how many acquired companies had each number of funding rounds.
# The title is set once, after plotting: previously a descriptive title was set
# before the plot call and then superseded by a generic 'Funding rounds' title,
# so it did not end up on the chart.
acquired.funding_rounds.value_counts().plot(kind="bar",alpha=0.5)
plt.title('Funding rounds for acquired companies')
plt.xlabel('funding')
plt.ylabel('funding frequency')
plt.show()

This can be confirmed and further understood with the values below.

In [None]:
acquired['funding_rounds'].value_counts()

As we can see, in both the operating and the acquired groups more companies had a single funding round than had two, with operating companies having by far the largest numbers. The IPO companies are illustrated below.

#### 3-IPO companies

In [None]:
#ipo companies
# Bar chart of how many IPO companies had each number of funding rounds.
# `plt` is already imported in the notebook's first cell. The title is set once,
# after plotting: previously a descriptive title was set before the plot call
# and then superseded by a generic 'Funding rounds' title.
ipo.funding_rounds.value_counts().plot(kind="bar",alpha=0.5)
plt.title('Funding rounds for IPO companies')
plt.xlabel('funding')
plt.ylabel('funding frequency')
plt.show()

In [None]:
ipo['funding_rounds'].value_counts()

We can conclude that operating companies lead in the number of funding rounds, but in all three groups a single funding round is the most common.

#### Investments data set part

In [None]:
# Reload the raw (uncleaned) investments CSV for this section.
# Note: `d` was previously bound to the status/mean dict used for the bar
# chart; from here on it refers to the investments frame.
d=pd.read_csv('datasets/startups/investments.csv')
d.head()


In [None]:
# Discard the state/region columns, then replace the nulls of selected columns
# with an explicit placeholder text.
d.drop(
    columns=['company_state_code', 'company_region', 'investor_state_code', 'investor_region'],
    inplace=True,
)
for columnName, placeholder in {
    'raised_amount_usd': "not available",
    'funding_round_code': "not available",
    'company_country_code': "not provided",
    'investor_country_code': "not provided",
}.items():
    d[columnName] = d[columnName].fillna(placeholder)
d.head()

In [None]:
# No de-duplication on the country-code columns is done here. The earlier
# drop_duplicates(subset=[...]) calls never assigned their result, so they had
# no effect; applying them would keep a single row per country code, while the
# per-country counts below rely on every investment row being present.
d.head()

#### Question 5: How many investors do we have in different countries and which country dominates?

We will do this by checking the amount of times country codes repeat in the investor tables and the company country location names which could tell a little more about the country's financial and business status.

In [None]:
d.investor_country_code.value_counts()

In [None]:
d.company_country_code.value_counts()

The above data shows that we have more investors in the USA and companies receiving investments as compared to other countries.
We can deduce that this country has a better financial and business status as compared to the rest making it appear like pool of gain for investors.

#### Which investor dominates the market?

In [None]:
d['investor_name'].value_counts()

As we can see, Sequoia Capital dominates the market. Wayra and 500 Startups seem to be quite close in count, from which we could deduce that they are competitors, just like the rest of the companies in the list; the fact that their counts are only 4 apart could mean that they follow one another closely.

#### Question 6: Which company is most famous amongst investors?

In [None]:
d['company_name'].value_counts()

The above data shows that Uber dominates the market, with DocuSign and Fab having the same count, which may imply that one is as trustworthy as the other. The fact that Uber is at the top of the list also tells us that this company has a lot of credibility in the eyes of investors.

#### Question 7: How do these companies' specializations compare with investment? Which category (field) attracts more funding?

 We will check the category list for both companies and investments dataframes.

In [None]:
#companies
# Drop the rows that have no category. Missing values loaded from the CSV are
# real NaN values, not the text 'NaN', so the previous `!= 'NaN'` comparison
# was true for every row and filtered nothing.
df = df[df.category_list.notna()]
df.category_list.value_counts()

In [None]:
#investments
# Drop the rows that have no company category. Missing values loaded from the
# CSV are real NaN values, not the text 'NaN', so the previous `!= 'NaN'`
# comparison was true for every row and filtered nothing.
d = d[d.company_category_list.notna()]
d.company_category_list.value_counts()

As we can see, although biotechnology is of more interest to investors, companies seem to have more interest in software, which could be due to the expensive and delicate nature of biotechnology.

#### Question 8: Are most of the investors also acquirers?

In [None]:
p=pd.read_csv("datasets/startups/acquisitions.csv")
p.head()

In [None]:
# Discard the state/region columns, then label the missing company category
# and country code explicitly.
p.drop(
    columns=['company_state_code', 'company_region', 'acquirer_state_code', 'acquirer_region'],
    inplace=True,
)
for columnName in ('company_category_list', 'company_country_code'):
    p[columnName] = p[columnName].fillna("not provided")

p.head()

Now we'll count the frequency of appearance of acquirer names to determine which one of them is also an investor and compare it to the company names to figure out if the same companies are famous with investors too in this data set.

In [None]:
p.acquirer_name.value_counts()

In [None]:
p.company_name.value_counts()

Shockingly, the first to top the list is Cisco unlike in the investor data set where Sequoia Capital tops the list. Also, it can be noticed that the companies which have had the most share acquisitions are Unveil Technologies and Ufree.

#### Footnote: What can some of the null or missing values infer?

To do this we'll check the total missing values in the acquisitions data set.

In [None]:
p.isna().sum()

* The null values in the price_amount column may mean that the acquisition process has not been completed, was cancelled, or that the files containing this information are not available.

* A null in company_city may indicate either that the location of the company is not known exactly or that the acquisition process was cancelled. The same may be true for acquirer_city.

# Answering Questions By Web Scraping

In [2]:
# Load the cleaned companies table (the file the commented-out "Saving Cleaned
# Datasets" cell would write), asking pandas to parse the listed columns as dates.
# NOTE(review): 'time_before_first_fund' looks like a duration rather than a
# calendar date (an earlier cell calls .abs() on it) — confirm that parse_dates
# treats it as intended, or convert it with pd.to_timedelta instead.
companiesC = pd.read_csv("datasets/startupsCleaned/companiesCleaned.csv", parse_dates=['founded_at', 'first_funding_at', 'last_funding_at', 'time_before_first_fund'])

In [3]:
import requests
import threading
from threading import Thread
session = requests.Session()

In [4]:
lock = threading.Lock()
def searchByName(name):
    """
    Searches for the company in the API, and retrieves its ID.

    Parameters
    ----------
    name : the search term sent as the query (formatted into a string).

    Returns
    -------
    str : the id of the first organization in the search result,
          "NOT FOUND" when the result lists no organizations, or
          "ACCOUNT BLOCKED" when the request fails (including a timeout) or
          the response cannot be interpreted.

    The session cookie is a credential, so it is read from the APOLLO_COOKIE
    environment variable instead of being written into the notebook. When the
    variable is not set an empty cookie is sent.
    """
    import os  # local import so this cell does not depend on another cell's imports

    url = "https://app.apollo.io/api/v1/omnisearch/search"
    headers = {
        "Content-Type": "application/json",
        "Cookie": os.environ.get("APOLLO_COOKIE", ""),
        "Origin": "https://app.apollo.io",
        "Referer": "https://app.apollo.io/"
    }
    data = {
        "query":f"{name}",
        "num_fetch_result":1,
        "cacheKey":1650463168797
    }
    try:
        # Only one thread talks to the API at a time; `with` releases the lock
        # even when the request raises.
        with lock:
            response = session.post(url, headers=headers, data=json.dumps(data), timeout=5)
    except requests.RequestException:
        return "ACCOUNT BLOCKED"

    try:
        # Parse the body once and reuse it for both the emptiness check and the id.
        organizations = response.json()['organizations']
        if not len(organizations):
            return "NOT FOUND"
        return organizations[0]['id']
    except (ValueError, KeyError, TypeError, IndexError):
        # Body is not JSON or does not have the expected structure.
        return "ACCOUNT BLOCKED"
    

In [5]:
from random import randint
import time
lock = threading.Lock()

def getCompanies(id):
    """
    Returns Company Info from using the ID retrieved.

    Parameters
    ----------
    id : str, an organization id as returned by searchByName(), or one of its
         sentinel values "NOT FOUND" / "ACCOUNT BLOCKED".

    Returns
    -------
    str : the raw response body for the organization, or "NOT FOUND" when the
          id is a sentinel value or the request fails (including a timeout).

    The session cookie is a credential, so it is read from the APOLLO_COOKIE
    environment variable instead of being written into the notebook. When the
    variable is not set an empty cookie is sent.
    """
    import os  # local import so this cell does not depend on another cell's imports

    if id in ["NOT FOUND", "ACCOUNT BLOCKED"]:
        return "NOT FOUND"

    url = 'https://app.apollo.io/api/v1/organizations/'+id
    headers = {
        "Content-Type": "application/json",
        "Cookie": os.environ.get("APOLLO_COOKIE", ""),
        "Origin": "https://app.apollo.io",
        "Referer": "https://app.apollo.io/"
    }
    try:
        # Only one thread talks to the API at a time; `with` releases the lock
        # even when the request raises.
        with lock:
            response = session.get(url, headers=headers, timeout=5)
    except requests.RequestException:
        return "NOT FOUND"
    return response.text

In [6]:
# Add the four columns the scraped values belong to, each starting from an
# empty placeholder (0.0 for the numeric ones, "" for the country).
for newColumn, startValue in (
    ('employee_count', 0.0),
    ('country', ""),
    ('number_of_technologies', 0.0),
    ('annual_rev', 0.0),
):
    companiesC[newColumn] = startValue

In [7]:
# Two slices of the cleaned companies table, taken by index label
# (.loc slicing includes both end labels).
df1 = companiesC.loc[0:10000]
df2 = companiesC.loc[30001:40000]
#df2 = companiesC.loc[30001:61267]
# NOTE(review): this name shadows the built-in `dict` type for the rest of the
# session, and none of the visible cells read it afterwards — consider
# renaming or removing it.
dict = {
    "index" : [],
    "employee_count" : [],
    "country" : [],
    "number_of_technologies" : [],
    "annual_rev" : []
}

In [8]:
import threading
from threading import Thread
import time
lock = threading.Lock()

def _orgField(res, key):
    """Returns res['organization'][key], or NaN when that value cannot be read."""
    try:
        return res['organization'][key]
    except (KeyError, TypeError, IndexError):
        return np.nan


def getCompanyInfo(idx, searchTerm):
    """
    Looks up one company and records its scraped attributes.

    Searches for `searchTerm`, fetches the matching organization and appends
    the tuple (idx, employee_count, country, number_of_technologies,
    annual_rev) to the global list `compInfoTups`. Attributes that are absent
    from the response are recorded as NaN. Prints 'YES idx' when a response
    could be decoded and 'NO idx' otherwise.

    Returns None in every case; when the response cannot be decoded as JSON
    (e.g. getCompanies() returned "NOT FOUND") nothing is appended.
    """
    res = getCompanies(searchByName(searchTerm))
    try:
        res = json.loads(res)
    except (TypeError, ValueError):
        print('NO', idx)
        return None #usually returns None if getCompanies() returns "NOT FOUND"
    print('YES', idx)

    employee_count = _orgField(res, 'estimated_num_employees')
    country = _orgField(res, 'country')
    try:
        number_of_technologies = len(_orgField(res, 'technology_names'))
    except TypeError:
        # The field is missing (NaN) or is not a sized value.
        number_of_technologies = np.nan
    annual_rev = _orgField(res, 'annual_revenue')

    # Several threads append to the shared list; serialise the append.
    with lock:
        compInfoTups.append((idx, employee_count, country, number_of_technologies, annual_rev))


In [9]:
# Scrape df2 in batches of 10 companies, one thread per company, and append
# each batch's results to the CSV file.
# 30001 is the first index label of df2, so j + 30001 is the row label.
for i in range(61, 1000):
    threads = []
    compInfoTups = []
    try:
        for j in range(10):
            j += i*10
            searchTerm = df2.at[j+30001, 'homepage_url']
            th = Thread(target=getCompanyInfo, args=(j+30001, searchTerm))
            threads.append(th)
            th.start()
    finally:
        # Wait for every thread that was started, even if building the batch
        # failed midway, so no result is appended after the batch is written.
        for th in threads:
            th.join()
        dfTups = pd.DataFrame(compInfoTups, columns=['index', 'employee_count', 'country', 'number_of_technologies', 'annual_rev'])
        # mode='a' appends to the file; the keyword was previously misspelled
        # as `model`, which to_csv does not accept.
        dfTups.to_csv('Scrapped Companies/scrapped_companies_2.csv', mode='a', index=False, header=False)
    
    

NO 30001
YES 30002
YES 30003
YES 30004
YES 30005
YES 30006
YES 30007
YES 30008
YES 30009
YES 30010
YES 30011
YES 30012
YES 30013
YES 30014
YES 30015
YES 30016
YES 30017
YES 30018
YES 30019
YES 30020
NO 30028
YES 30021
YES 30022
YES 30023
YES 30024
YES 30025
YES 30026
YES 30027
YES 30029
YES 30030
YES 30031
YES 30032
YES 30033
YES 30034
YES 30035
YES 30036
YES 30037
YES 30038
YES 30039
YES 30040
NO 30046
YES 30041
YES 30042
YES 30043
YES 30044
YES 30045
YES 30047
YES 30048
YES 30049
YES 30050
NO 30051
NO 30052
YES 30053
YES 30054
YES 30055
YES 30056
YES 30057
YES 30058
YES 30059
YES 30060
YES 30061
YES 30062
YES 30063
YES 30064
YES 30065
YES 30066
YES 30067
YES 30068
YES 30069
YES 30070
NO 30072
NO 30075
YES 30071
YES 30073
YES 30074
YES 30076
YES 30077
YES 30078
YES 30079
YES 30080
YES 30081
YES 30082
YES 30083
YES 30084
YES 30085
YES 30086
YES 30087
YES 30088
YES 30089
YES 30090
NO 30094
YES 30091
YES 30092
YES 30093
YES 30095
YES 30096
YES 30097
YES 30098
YES 30099
YES 30100
YES 3010

In [35]:
compInfoTups

[]

In [None]:
"""
for url in range(len(df1)):
    res = getCompanies(searchByName(df1['homepage_url'][url]))
    emp = re.search(r"\"estimated_num_employees\"
    :([0-9]+)" , res)
    if emp is not None:
        df1['employee_count'][url] = emp.group(1)
    elif emp is None:
        emp = "-"
        df1['employee_count'][url] = emp

    country = re.search(r"(\"country\"\:)\"(\w+.*)\"\,\"o" , res)
    if country is not None:
        df1['country'][url] = country.group(2)
    elif country is None:
        country = "-"
        df1['country'][url] = country

    tech = re.findall(r"\"current_technologies\"\:.*\]" , res)
    tech = re.findall(r"\"name\"\:", str(tech))
    tech_count = sum('name' in t for t in tech)
    df1['number_of_technologies'][url] = tech_count

    revenue = re.search(r"\"annual_revenue\"\:([0-9]+)" , res)
    if revenue is not None:
        df1['revenue'][url] = revenue.group(1)
    elif revenue is None:
        revenue = "-"
        df1['revenue'][url] = revenue

df1.to_csv('Scrapped Companies/scrapped_companies.csv', index=False)
df1
"""