In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which input files are available; os.listdir raises FileNotFoundError
# if the "../input" directory does not exist.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the start-up funding records from the input directory into a DataFrame.
df = pd.read_csv('../input/startup_funding.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics of the loaded frame.
df.describe()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Fraction of missing values in each column (mean of the boolean null mask).
df.isnull().mean()

In [None]:
# Drop the Remarks column because it has too many missing values.
# The sub-vertical may also be difficult to work with, as ~39% of it is missing.
# AmountInUSD is also missing in more than a third of the rows, but since it
# is an important column we want to keep it.
#
# Reassign instead of using inplace=True, and ignore an already-missing column,
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns=['Remarks'], errors='ignore')

In [None]:
# Clean up the investment amount: strip thousands separators, treat missing
# amounts as 0 and store the result as 64-bit integers.
amount_raw = df['AmountInUSD']

# Only text columns carry the ',' separators; skipping this step for an
# already-numeric column keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(amount_raw):
    amount_raw = amount_raw.str.replace(',', '', regex=False)

# pd.to_numeric raises on values that are not numbers, so bad data is not
# hidden. int64 is requested explicitly because plain `int` is platform
# dependent and can overflow on large funding rounds.
# Assigning the column back (rather than chaining inplace=True calls on the
# column selection) guarantees the frame itself is updated.
df['AmountInUSD'] = pd.to_numeric(amount_raw).fillna(0).astype('int64')

In [None]:
# Remove domain suffixes (.com, .co, .in, and combinations such as .co.in)
# from start-up names so the same company is not counted under two spellings.
#
# The dot is escaped and the suffix must end at a word boundary: an unescaped
# '.in' is a regex meaning "any character followed by 'in'" and would delete
# letters from the middle of ordinary names.
df['StartupName'] = df['StartupName'].str.replace(
    r'(?:\.(?:com|co|in)\b)+', '', regex=True, case=False
)

In [None]:
# Remove rows where the investor is unidentified.
# Names are compared on a trimmed, lower-cased key so that every casing or
# spacing variant of the same label is caught by a single entry.
undisclosed_labels = {
    'undisclosed',
    'undisclosed investor',
    'undisclosed investors',
    'undisclosed hnis',
}
investor_key = df['InvestorsName'].str.strip().str.lower()

# Rows with a missing investor name are kept (their key is NaN, which never
# matches). .copy() makes the result an independent frame so later
# assignments into it do not trigger SettingWithCopyWarning.
df = df[~investor_key.isin(undisclosed_labels)].copy()

In [None]:
# Consider how to treat rows that have multiple investors. 
# Perhaps split them into multiple rows, one per investor?


In [None]:
# Plot the number investments per investor and the range of investment sizes.
import matplotlib.pyplot as plt
%matplotlib inline

count_by_investor = df.groupby(['InvestorsName']).size().reset_index(name='NumberInvestmentsPerInvestor').sort_values(by='NumberInvestmentsPerInvestor', ascending=False)

# Select the top 20 most active investors to investigate further.
count_by_investor = count_by_investor[0:20]

most_active_investors = count_by_investor['InvestorsName'].tolist()
number_investments = count_by_investor['NumberInvestmentsPerInvestor'].tolist()

x_pos = [i for i, _ in enumerate(most_active_investors)]

plt.barh(x_pos, number_investments)

plt.ylabel("Investor")
plt.xlabel("Number of Investments")
plt.title("20 Most Active Start-up Investors")

plt.yticks(x_pos, most_active_investors)

plt.show()


In the data set there are **X** investments made by **Y** investors (or groups of investors). Undisclosed investors represent **Z** percent of the investments and are excluded from the sample.

We plot the 20 most active investors over the sample period, who together made **W** investments. The number of investments per investor ranges from 24 investments made by the Indian Angel Network and Ratan Tata to 5 investments. This suggests there is a long tail of investors making a small number of investments each year.

In [None]:
# Box plot of the log investment size for each of the most active investors.
is_active = df['InvestorsName'].isin(most_active_investors)

# Missing amounts are stored as 0 and log(0) is -inf, so only rows with a
# positive (i.e. disclosed) amount are plotted.
has_amount = df['AmountInUSD'] > 0

# .copy() so the log transform below writes to an independent frame and not
# to a slice of df.
active_investors = df[is_active & has_amount].copy()
active_investors['AmountInUSD'] = np.log(active_investors['AmountInUSD'])

# plt.boxplot draws one box per array in a list; its second positional
# argument is `notch`, not a grouping column, so the amounts are split per
# investor here and the names are attached as tick labels.
log_amounts_by_investor = {
    investor: amounts.to_numpy()
    for investor, amounts in active_investors.groupby('InvestorsName')['AmountInUSD']
}
# Keep the "most active first" ordering and skip investors with no disclosed amounts.
plot_order = [name for name in most_active_investors if name in log_amounts_by_investor]

if plot_order:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.boxplot([log_amounts_by_investor[name] for name in plot_order])
    # Boxes are drawn at positions 1..N by default.
    ax.set_xticks(list(range(1, len(plot_order) + 1)))
    ax.set_xticklabels(plot_order, rotation=90)
    ax.set_xlabel("Investor")
    ax.set_ylabel("log(Amount in USD)")
    ax.set_title("Investment Sizes of the Most Active Investors")
    plt.show()
else:
    print("No disclosed investment amounts for the most active investors.")

In [None]:
# Horizontal bar chart of the number of investments per investment type.

# Investments per type, most common first.
count_by_investment_type = (
    df.groupby(['InvestmentType'])
    .size()
    .reset_index(name='NumberPerInvestmentType')
    .sort_values(by='NumberPerInvestmentType', ascending=False)
)

investment_type = count_by_investment_type['InvestmentType'].tolist()
number_investments_type = count_by_investment_type['NumberPerInvestmentType'].tolist()

# One bar position per investment type.
x_pos = list(range(len(investment_type)))

plt.barh(x_pos, number_investments_type)
plt.yticks(x_pos, investment_type)

plt.title("Number of Investments by Investment Type")
plt.xlabel("Number of Investments")
plt.ylabel("Investment Type")

plt.show()

In [None]:
# Horizontal bar chart of the number of investments in each of the
# 10 most active industry verticals.

# Investments per industry vertical, most active first.
count_by_industry = (
    df.groupby(['IndustryVertical'])
    .size()
    .reset_index(name='NumberInvestmentsPerIndustry')
    .sort_values(by='NumberInvestmentsPerIndustry', ascending=False)
)

# Keep only the 10 most active industries for further investigation.
count_by_industry = count_by_industry.iloc[:10]

most_active_industry = count_by_industry['IndustryVertical'].tolist()
number_investments = count_by_industry['NumberInvestmentsPerIndustry'].tolist()

# One bar position per industry vertical.
x_pos = list(range(len(most_active_industry)))

plt.barh(x_pos, number_investments)
plt.yticks(x_pos, most_active_industry)

plt.title("10 Most Active Start-up IndustryVerticals")
plt.xlabel("Number of Investments")
plt.ylabel("Industry Vertical")

plt.show()

In [None]:
# Plot a time series of investments by the top 20 investors.

In [None]:
# Key questions
# (1) Do investors tend to invest in similar industries?
# (2) Do investors tend to invest in the same companies?
# (3) Do industries become popular at certain times?
# (4) Is investment concentrated in industries or locations?

In [None]:
# Plot investment by industry over time.

# Convert industries outside the top 10 most active to 'Others'.
# The mask is negated so that the top-10 verticals keep their names and
# everything else (including rows with a missing vertical) is relabelled.
# NOTE: this overwrites IndustryVertical in df, so cells that count
# industries will see 'Others' if they are run after this one.
is_top_industry = df['IndustryVertical'].isin(most_active_industry)
df.loc[~is_top_industry, 'IndustryVertical'] = 'Others'


#investment_by_date = df.groupby(['Date', 'IndustryVertical'])['AmountInUSD'].sum().reset_index(name='MonthlyInvestment')
#investment_by_date['IndustryVertical']

In [None]:
# Display the frame to inspect it after the 'Others' relabelling in the previous cell.
# NOTE(review): this renders the whole frame; consider df.head() or df.sample(5) instead.
df

In [None]:


# Stacked bar chart of monthly investment (USD) split by industry vertical.

# Dates that cannot be parsed become NaT and drop out of the groupby below.
# NOTE(review): dayfirst=True assumes the CSV stores dates as dd/mm/yyyy — confirm.
funding_month = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce').dt.to_period('M')

# Rows: month, columns: industry vertical, values: total amount invested.
monthly_investment = (
    df.groupby([funding_month, 'IndustryVertical'])['AmountInUSD']
    .sum()
    .unstack(fill_value=0)
    .sort_index()
)

# Show the ten largest verticals (by total amount) individually and fold the
# rest into a single series so the legend stays readable.
vertical_totals = monthly_investment.sum().sort_values(ascending=False)
leading_verticals = vertical_totals.index[:10]
stacked = monthly_investment[leading_verticals].copy()
remainder = monthly_investment.drop(columns=leading_verticals).sum(axis=1)
if (remainder > 0).any():
    stacked['All other verticals'] = remainder

# One bar position per month.
ind = list(range(len(stacked)))

fig, ax = plt.subplots(figsize=(14, 6))
# Running height of the stack; each vertical is drawn on top of the previous ones.
bottom = np.zeros(len(stacked))
for vertical in stacked.columns:
    heights = stacked[vertical].to_numpy()
    ax.bar(ind, heights, width=0.8, label=vertical, bottom=bottom)
    bottom = bottom + heights

ax.set_xticks(ind)
ax.set_xticklabels([str(month) for month in stacked.index], rotation=90)
ax.set_ylabel("Investment (USD)")
ax.set_xlabel("Month")
ax.legend(loc="upper right")
ax.set_title("Monthly Start-up Investment by Industry Vertical")

plt.show()

In [None]:
# Convert date to MM-YYYY (e.g. 08-2017).
# A Series has no .strftime method; the values are parsed to datetimes first
# and formatted through the .dt accessor. Unparseable dates come out as NaN.
# NOTE(review): dayfirst=True assumes the CSV stores dates as dd/mm/yyyy — confirm.
pd.to_datetime(df['Date'], dayfirst=True, errors='coerce').dt.strftime("%m-%Y")