In [6]:
import numpy as np
import pandas as pd 
import plotly
from wordcloud import WordCloud
from io import StringIO
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import joblib

from pathlib import Path

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option("display.precision", 3)
pd.set_option("display.expand_frame_repr", False)

plotly.offline.init_notebook_mode()

ModuleNotFoundError: No module named 'plotly'

In [None]:
funding_rounds=pd.read_csv("/kaggle/input/startup-investments/funding_rounds.csv")
milestones=pd.read_csv("/kaggle/input/startup-investments/milestones.csv")
acquisitions=pd.read_csv("/kaggle/input/startup-investments/acquisitions.csv")
ipos=pd.read_csv("/kaggle/input/startup-investments/ipos.csv")
relationships=pd.read_csv("/kaggle/input/startup-investments/relationships.csv")
objects=pd.read_csv("/kaggle/input/startup-investments/objects.csv")
offices=pd.read_csv("/kaggle/input/startup-investments/offices.csv")
people=pd.read_csv("/kaggle/input/startup-investments/people.csv")
degrees=pd.read_csv("/kaggle/input/startup-investments/degrees.csv")
investments=pd.read_csv("/kaggle/input/startup-investments/investments.csv")
funds=pd.read_csv("/kaggle/input/startup-investments/funds.csv")

In [None]:
#funding_rounds: raised_amount_usd, is_first_round, post_money_valuation_usd, object_id, funding_round_id
#objects: name, category_code, founded_at, country_code, first_funding_at, funding_total_usd,milestones, entity_id
#people: object_id, id, affiliation_name
#degrees: object_id, id, subject, institution, degree_type
#investments: funded_object_id,funding_round_id
#funds:id, fund_id, object_id, name, raised_amount, raised_currency_code
#relationships: person_object_id,relationship_object_id
funding_rounds = funding_rounds[['raised_amount_usd', 'is_first_round', 'post_money_valuation_usd', 'object_id', 'funding_round_id']]
objects = objects[['id', 'name', 'category_code', 'founded_at', 'country_code', 'first_funding_at', 'funding_total_usd', 'milestones', 'entity_id']]
people = people[['id', 'object_id', 'affiliation_name']]
degrees = degrees[['id', 'object_id', 'subject','institution', 'degree_type']]
investments = investments[['funded_object_id', 'funding_round_id']]
funds = funds[['id', 'fund_id', 'object_id', 'name', 'raised_amount', 'raised_currency_code']]
relationships = relationships[['id', 'person_object_id', 'relationship_object_id']]

# Data Cleaning

In [None]:
#merge datasets with companies' info 
df = pd.merge(objects, funding_rounds, left_on = 'id', right_on = 'object_id')
df1 = df.drop('object_id', axis = 1)
company = df1.merge(investments, left_on = 'id', right_on = 'funded_object_id')
company

Unnamed: 0,id,name,category_code,founded_at,country_code,first_funding_at,funding_total_usd,milestones,entity_id,raised_amount_usd,is_first_round,post_money_valuation_usd,funding_round_id_x,funded_object_id,funding_round_id_y
0,c:1,Wetpaint,web,2005-10-17,USA,2005-10-01,3.975e+07,5,1,5.250e+06,0,0.0,888,c:1,888
1,c:1,Wetpaint,web,2005-10-17,USA,2005-10-01,3.975e+07,5,1,5.250e+06,0,0.0,888,c:1,888
2,c:1,Wetpaint,web,2005-10-17,USA,2005-10-01,3.975e+07,5,1,5.250e+06,0,0.0,888,c:1,889
3,c:1,Wetpaint,web,2005-10-17,USA,2005-10-01,3.975e+07,5,1,5.250e+06,0,0.0,888,c:1,889
4,c:1,Wetpaint,web,2005-10-17,USA,2005-10-01,3.975e+07,5,1,5.250e+06,0,0.0,888,c:1,889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223893,c:998,Wamba,web,2013-05-14,,2007-09-01,4.112e+06,3,998,4.112e+06,1,0.0,737,c:998,737
223894,c:9994,Imindi,advertising,2008-09-01,,2007-01-01,5.000e+05,1,9994,2.500e+05,0,0.0,3253,c:9994,3253
223895,c:9994,Imindi,advertising,2008-09-01,,2007-01-01,5.000e+05,1,9994,2.500e+05,0,0.0,3253,c:9994,6112
223896,c:9994,Imindi,advertising,2008-09-01,,2007-01-01,5.000e+05,1,9994,2.500e+05,1,0.0,6112,c:9994,3253


In [None]:
#merge datasets with people's info 
df2 = pd.merge(people, degrees, on = 'object_id')
people = df2.drop(['id_y', 'id_x'], axis = 1)
people

Unnamed: 0,object_id,affiliation_name,subject,institution,degree_type
0,p:2,Blue Nile,Electrical Engineering/Computer Science,"University of California, Berkeley",BS
1,p:2,Blue Nile,Applied Mathematics,"University of California, Berkeley",BS
2,p:3,Wetpaint,,Washington University in St Louis,BBA
3,p:3,Wetpaint,,Indiana University,MBA
4,p:5,Zoho,Advanced Business Professional Course,The Aji Network,Degree
...,...,...,...,...,...
109051,p:268528,Unaffiliated,Electrical Engineering,Colorado State University,B.S.
109052,p:268528,Unaffiliated,,University of San Diego School of Law,J.D.
109053,p:268560,Unaffiliated,,Samford University,MBA
109054,p:268589,Unaffiliated,Accounting,Iowa State University,B.S.


In [None]:
#based on relationship dataset, merge company and people; drop null
merged_company_and_p = pd.merge(company, relationships, left_on = 'id', right_on = 'relationship_object_id')
merged_people_and_c = pd.merge(people, relationships, left_on = 'object_id', right_on = 'person_object_id')
investments = pd.merge(merged_company_and_p, merged_people_and_c, on = ['relationship_object_id', 'person_object_id'], how = 'inner')
investments = investments.drop(['id_x', 'id_y', 'object_id', 'id', 'funded_object_id'], axis = 1).drop_duplicates(subset=['name']).dropna()

In [None]:
investments

Unnamed: 0,name,category_code,founded_at,country_code,first_funding_at,funding_total_usd,milestones,entity_id,raised_amount_usd,is_first_round,post_money_valuation_usd,funding_round_id_x,funding_round_id_y,person_object_id,relationship_object_id,affiliation_name,subject,institution,degree_type
0,Wetpaint,web,2005-10-17,USA,2005-10-01,3.975e+07,5,1,5.250e+06,0,0.0,888,888,p:2,c:1,Blue Nile,Electrical Engineering/Computer Science,"University of California, Berkeley",BS
945,FriendFeed,web,2007-10-01,USA,2008-02-26,5.000e+06,3,1001,5.000e+06,1,0.0,1644,1644,p:2714,c:1001,Google,Computer Science w/ Distinction,Stanford University,BS
999,Mobclix,mobile,2008-03-01,USA,2008-09-01,0.000e+00,4,10014,0.000e+00,1,0.0,6682,6682,p:16924,c:10014,Mobclix,Computer Engineering,Santa Clara University,MS
1008,Fitbit,health,2007-10-01,USA,2008-10-10,6.807e+07,0,10015,2.000e+06,0,0.0,3619,3619,p:16930,c:10015,Fitbit,Computer Science,Harvard University,DNF
2718,MTPV,cleantech,2003-01-01,USA,2011-03-08,1.013e+07,1,100155,6.500e+06,0,0.0,25184,25184,p:108552,c:100155,Unaffiliated,applied sciences,Harvard University,BS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3182961,MESoft,software,2002-01-01,USA,2007-04-15,6.000e+06,0,9893,6.000e+06,1,0.0,3380,3380,p:16819,c:9893,Microsoft,Film & Television,"University of California, Los Angeles",BA
3182963,Amie Street,web,2011-07-04,USA,2007-08-01,3.900e+06,0,99,0.000e+00,0,0.0,307,9070,p:278,c:99,AmieStreet,Modern Culture and Media,Brown University,BA
3183115,wunderloop,advertising,1999-01-01,USA,2007-01-01,1.000e+07,1,992,1.000e+07,1,0.0,733,733,p:2676,c:992,wunderloop,Electrical Engineering,Berlin Institute of Technology.,Degree
3183371,asklaila,web,2006-11-01,IND,2007-09-01,1.000e+07,1,995,1.000e+07,1,0.0,735,735,p:2691,c:995,asklaila,Computer Science,IIT Kanpur,BTECH


# feature engineering

In [None]:
#seperate founded_at/first_funding_at datetime object to year, (month, day) 
investments['founded_at'] = pd.to_datetime(investments['founded_at'])
investments['founded_year'] = investments['founded_at'].map(lambda x: x.year)
investments['founded_month'] = investments['founded_at'].map(lambda x: x.month)
investments['founded_day'] = investments['founded_at'].map(lambda x: x.day)

investments['first_funding_at'] = pd.to_datetime(investments['first_funding_at'])
investments['first_funding_year'] = investments['first_funding_at'].map(lambda x: x.year)

investments

Unnamed: 0,name,category_code,founded_at,country_code,first_funding_at,funding_total_usd,milestones,entity_id,raised_amount_usd,is_first_round,post_money_valuation_usd,funding_round_id_x,funding_round_id_y,person_object_id,relationship_object_id,affiliation_name,subject,institution,degree_type,founded_year,founded_month,founded_day,first_funding_year
0,Wetpaint,web,2005-10-17,USA,2005-10-01,3.975e+07,5,1,5.250e+06,0,0.0,888,888,p:2,c:1,Blue Nile,Electrical Engineering/Computer Science,"University of California, Berkeley",BS,2005,10,17,2005
945,FriendFeed,web,2007-10-01,USA,2008-02-26,5.000e+06,3,1001,5.000e+06,1,0.0,1644,1644,p:2714,c:1001,Google,Computer Science w/ Distinction,Stanford University,BS,2007,10,1,2008
999,Mobclix,mobile,2008-03-01,USA,2008-09-01,0.000e+00,4,10014,0.000e+00,1,0.0,6682,6682,p:16924,c:10014,Mobclix,Computer Engineering,Santa Clara University,MS,2008,3,1,2008
1008,Fitbit,health,2007-10-01,USA,2008-10-10,6.807e+07,0,10015,2.000e+06,0,0.0,3619,3619,p:16930,c:10015,Fitbit,Computer Science,Harvard University,DNF,2007,10,1,2008
2718,MTPV,cleantech,2003-01-01,USA,2011-03-08,1.013e+07,1,100155,6.500e+06,0,0.0,25184,25184,p:108552,c:100155,Unaffiliated,applied sciences,Harvard University,BS,2003,1,1,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3182961,MESoft,software,2002-01-01,USA,2007-04-15,6.000e+06,0,9893,6.000e+06,1,0.0,3380,3380,p:16819,c:9893,Microsoft,Film & Television,"University of California, Los Angeles",BA,2002,1,1,2007
3182963,Amie Street,web,2011-07-04,USA,2007-08-01,3.900e+06,0,99,0.000e+00,0,0.0,307,9070,p:278,c:99,AmieStreet,Modern Culture and Media,Brown University,BA,2011,7,4,2007
3183115,wunderloop,advertising,1999-01-01,USA,2007-01-01,1.000e+07,1,992,1.000e+07,1,0.0,733,733,p:2676,c:992,wunderloop,Electrical Engineering,Berlin Institute of Technology.,Degree,1999,1,1,2007
3183371,asklaila,web,2006-11-01,IND,2007-09-01,1.000e+07,1,995,1.000e+07,1,0.0,735,735,p:2691,c:995,asklaila,Computer Science,IIT Kanpur,BTECH,2006,11,1,2007


In [None]:
#funding_year_difference: funding year - founded year
investments['funding_year_difference'] = investments['first_funding_year'] - investments['founded_year']
investments

Unnamed: 0,name,category_code,founded_at,country_code,first_funding_at,funding_total_usd,milestones,entity_id,raised_amount_usd,is_first_round,post_money_valuation_usd,funding_round_id_x,funding_round_id_y,person_object_id,relationship_object_id,affiliation_name,subject,institution,degree_type,founded_year,founded_month,founded_day,first_funding_year,funding_year_difference
0,Wetpaint,web,2005-10-17,USA,2005-10-01,3.975e+07,5,1,5.250e+06,0,0.0,888,888,p:2,c:1,Blue Nile,Electrical Engineering/Computer Science,"University of California, Berkeley",BS,2005,10,17,2005,0
945,FriendFeed,web,2007-10-01,USA,2008-02-26,5.000e+06,3,1001,5.000e+06,1,0.0,1644,1644,p:2714,c:1001,Google,Computer Science w/ Distinction,Stanford University,BS,2007,10,1,2008,1
999,Mobclix,mobile,2008-03-01,USA,2008-09-01,0.000e+00,4,10014,0.000e+00,1,0.0,6682,6682,p:16924,c:10014,Mobclix,Computer Engineering,Santa Clara University,MS,2008,3,1,2008,0
1008,Fitbit,health,2007-10-01,USA,2008-10-10,6.807e+07,0,10015,2.000e+06,0,0.0,3619,3619,p:16930,c:10015,Fitbit,Computer Science,Harvard University,DNF,2007,10,1,2008,1
2718,MTPV,cleantech,2003-01-01,USA,2011-03-08,1.013e+07,1,100155,6.500e+06,0,0.0,25184,25184,p:108552,c:100155,Unaffiliated,applied sciences,Harvard University,BS,2003,1,1,2011,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3182961,MESoft,software,2002-01-01,USA,2007-04-15,6.000e+06,0,9893,6.000e+06,1,0.0,3380,3380,p:16819,c:9893,Microsoft,Film & Television,"University of California, Los Angeles",BA,2002,1,1,2007,5
3182963,Amie Street,web,2011-07-04,USA,2007-08-01,3.900e+06,0,99,0.000e+00,0,0.0,307,9070,p:278,c:99,AmieStreet,Modern Culture and Media,Brown University,BA,2011,7,4,2007,-4
3183115,wunderloop,advertising,1999-01-01,USA,2007-01-01,1.000e+07,1,992,1.000e+07,1,0.0,733,733,p:2676,c:992,wunderloop,Electrical Engineering,Berlin Institute of Technology.,Degree,1999,1,1,2007,8
3183371,asklaila,web,2006-11-01,IND,2007-09-01,1.000e+07,1,995,1.000e+07,1,0.0,735,735,p:2691,c:995,asklaila,Computer Science,IIT Kanpur,BTECH,2006,11,1,2007,1


In [None]:
#one-hot-encoding for country_code
investments_countries = pd.get_dummies(investments['country_code'])
investments = pd.concat([investments,investments_countries],axis=1)
investments.head()

Unnamed: 0,name,category_code,founded_at,country_code,first_funding_at,funding_total_usd,milestones,entity_id,raised_amount_usd,is_first_round,post_money_valuation_usd,funding_round_id_x,funding_round_id_y,person_object_id,relationship_object_id,affiliation_name,subject,institution,degree_type,founded_year,founded_month,founded_day,first_funding_year,funding_year_difference,ANT,ARE,ARG,AUS,AUT,BEL,BGD,BGR,BHR,BMU,BRA,CAN,CHE,CHL,CHN,COL,CYP,CZE,DEU,DNK,EGY,ESP,EST,FIN,FRA,GBR,GEO,GHA,GIB,GRC,HKG,HRV,HUN,IDN,IND,IRL,ISL,ISR,ITA,JOR,JPN,KOR,LTU,LUX,LVA,MAR,MEX,MKD,MYS,NGA,NLD,NOR,NZL,PAK,PAN,PER,PHL,POL,PRT,PRY,ROM,RUS,SAU,SGP,SVK,SVN,SWE,TUR,UKR,URY,USA,VNM,ZAF
0,Wetpaint,web,2005-10-17,USA,2005-10-01,39750000.0,5,1,5250000.0,0,0.0,888,888,p:2,c:1,Blue Nile,Electrical Engineering/Computer Science,"University of California, Berkeley",BS,2005,10,17,2005,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
945,FriendFeed,web,2007-10-01,USA,2008-02-26,5000000.0,3,1001,5000000.0,1,0.0,1644,1644,p:2714,c:1001,Google,Computer Science w/ Distinction,Stanford University,BS,2007,10,1,2008,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
999,Mobclix,mobile,2008-03-01,USA,2008-09-01,0.0,4,10014,0.0,1,0.0,6682,6682,p:16924,c:10014,Mobclix,Computer Engineering,Santa Clara University,MS,2008,3,1,2008,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1008,Fitbit,health,2007-10-01,USA,2008-10-10,68070000.0,0,10015,2000000.0,0,0.0,3619,3619,p:16930,c:10015,Fitbit,Computer Science,Harvard University,DNF,2007,10,1,2008,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2718,MTPV,cleantech,2003-01-01,USA,2011-03-08,10130000.0,1,100155,6500000.0,0,0.0,25184,25184,p:108552,c:100155,Unaffiliated,applied sciences,Harvard University,BS,2003,1,1,2011,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [None]:
#one-hot-encoding for category_code
category_encoded = pd.get_dummies(investments['category_code'])
investments = pd.concat([investments,category_encoded],axis = 1)
investments

Unnamed: 0,name,category_code,founded_at,country_code,first_funding_at,funding_total_usd,milestones,entity_id,raised_amount_usd,is_first_round,post_money_valuation_usd,funding_round_id_x,funding_round_id_y,person_object_id,relationship_object_id,affiliation_name,subject,institution,degree_type,founded_year,founded_month,founded_day,first_funding_year,funding_year_difference,ANT,ARE,ARG,AUS,AUT,BEL,BGD,BGR,BHR,BMU,BRA,CAN,CHE,CHL,CHN,COL,CYP,CZE,DEU,DNK,EGY,ESP,EST,FIN,FRA,GBR,GEO,GHA,GIB,GRC,HKG,HRV,HUN,IDN,IND,IRL,ISL,ISR,ITA,JOR,JPN,KOR,LTU,LUX,LVA,MAR,MEX,MKD,MYS,NGA,NLD,NOR,NZL,PAK,PAN,PER,PHL,POL,PRT,PRY,ROM,RUS,SAU,SGP,SVK,SVN,SWE,TUR,UKR,URY,USA,VNM,ZAF,advertising,analytics,automotive,biotech,cleantech,consulting,design,ecommerce,education,enterprise,fashion,finance,games_video,government,hardware,health,hospitality,legal,local,manufacturing,medical,messaging,mobile,music,nanotech,network_hosting,news,nonprofit,other,pets,photo_video,public_relations,real_estate,search,security,semiconductor,social,software,sports,transportation,travel,web
0,Wetpaint,web,2005-10-17,USA,2005-10-01,3.975e+07,5,1,5.250e+06,0,0.0,888,888,p:2,c:1,Blue Nile,Electrical Engineering/Computer Science,"University of California, Berkeley",BS,2005,10,17,2005,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
945,FriendFeed,web,2007-10-01,USA,2008-02-26,5.000e+06,3,1001,5.000e+06,1,0.0,1644,1644,p:2714,c:1001,Google,Computer Science w/ Distinction,Stanford University,BS,2007,10,1,2008,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
999,Mobclix,mobile,2008-03-01,USA,2008-09-01,0.000e+00,4,10014,0.000e+00,1,0.0,6682,6682,p:16924,c:10014,Mobclix,Computer Engineering,Santa Clara University,MS,2008,3,1,2008,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1008,Fitbit,health,2007-10-01,USA,2008-10-10,6.807e+07,0,10015,2.000e+06,0,0.0,3619,3619,p:16930,c:10015,Fitbit,Computer Science,Harvard University,DNF,2007,10,1,2008,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2718,MTPV,cleantech,2003-01-01,USA,2011-03-08,1.013e+07,1,100155,6.500e+06,0,0.0,25184,25184,p:108552,c:100155,Unaffiliated,applied sciences,Harvard University,BS,2003,1,1,2011,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3182961,MESoft,software,2002-01-01,USA,2007-04-15,6.000e+06,0,9893,6.000e+06,1,0.0,3380,3380,p:16819,c:9893,Microsoft,Film & Television,"University of California, Los Angeles",BA,2002,1,1,2007,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3182963,Amie Street,web,2011-07-04,USA,2007-08-01,3.900e+06,0,99,0.000e+00,0,0.0,307,9070,p:278,c:99,AmieStreet,Modern Culture and Media,Brown University,BA,2011,7,4,2007,-4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3183115,wunderloop,advertising,1999-01-01,USA,2007-01-01,1.000e+07,1,992,1.000e+07,1,0.0,733,733,p:2676,c:992,wunderloop,Electrical Engineering,Berlin Institute of Technology.,Degree,1999,1,1,2007,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3183371,asklaila,web,2006-11-01,IND,2007-09-01,1.000e+07,1,995,1.000e+07,1,0.0,735,735,p:2691,c:995,asklaila,Computer Science,IIT Kanpur,BTECH,2006,11,1,2007,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [None]:
#categorize degree_type; one-hot-encoding for degree_type 
import re


degree_types = {
    'Bachelors': ['B[A-Za-z]+', 'Bachelor','College','college','Undergraduate'],
    'Masters': ['M[A-Za-z]+', 'MBA', 'MPP', 'ME', 'MENG', 'MSc', 'MSIA', 'MA', 'M.Tech', 'Masters of Professional Studies', 'Magister Artium','Graduate'],
    'PhD': ['Ph[.]*[Dd]*[.]?', 'D Phil', 'Doctor', 'Doctorate', 'Post Doc'],
    'Other': ['Diploma', 'Degree', 'Associate', 'College', 'Undergraduate', 'Graduate', 'Honours', 'Honors', 'Certificate']
}

def categorize_degree(degree_str):
    for degree_type, degree_regexes in degree_types.items():
        for degree_regex in degree_regexes:
            if re.search(degree_regex, degree_str):
                return degree_type
    return 'Other'

standard_degrees = []

for degree in investments['degree_type']:
    standard_degree = categorize_degree(degree)
    standard_degrees.append(standard_degree)
   
investments['degree_type'] = standard_degrees

degree_type_encoded = pd.get_dummies(investments['degree_type'])
investments = pd.concat([investments,degree_type_encoded], axis = 1)
investments

Unnamed: 0,name,category_code,founded_at,country_code,first_funding_at,funding_total_usd,milestones,entity_id,raised_amount_usd,is_first_round,post_money_valuation_usd,funding_round_id_x,funding_round_id_y,person_object_id,relationship_object_id,affiliation_name,subject,institution,degree_type,founded_year,founded_month,founded_day,first_funding_year,funding_year_difference,ANT,ARE,ARG,AUS,AUT,BEL,BGD,BGR,BHR,BMU,BRA,CAN,CHE,CHL,CHN,COL,CYP,CZE,DEU,DNK,EGY,ESP,EST,FIN,FRA,GBR,GEO,GHA,GIB,GRC,HKG,HRV,HUN,IDN,IND,IRL,ISL,ISR,ITA,JOR,JPN,KOR,LTU,LUX,LVA,MAR,MEX,MKD,MYS,NGA,NLD,NOR,NZL,PAK,PAN,PER,PHL,POL,PRT,PRY,ROM,RUS,SAU,SGP,SVK,SVN,SWE,TUR,UKR,URY,USA,VNM,ZAF,advertising,analytics,automotive,biotech,cleantech,consulting,design,ecommerce,education,enterprise,fashion,finance,games_video,government,hardware,health,hospitality,legal,local,manufacturing,medical,messaging,mobile,music,nanotech,network_hosting,news,nonprofit,other,pets,photo_video,public_relations,real_estate,search,security,semiconductor,social,software,sports,transportation,travel,web,Bachelors,Masters,Other,PhD
0,Wetpaint,web,2005-10-17,USA,2005-10-01,3.975e+07,5,1,5.250e+06,0,0.0,888,888,p:2,c:1,Blue Nile,Electrical Engineering/Computer Science,"University of California, Berkeley",Bachelors,2005,10,17,2005,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
945,FriendFeed,web,2007-10-01,USA,2008-02-26,5.000e+06,3,1001,5.000e+06,1,0.0,1644,1644,p:2714,c:1001,Google,Computer Science w/ Distinction,Stanford University,Bachelors,2007,10,1,2008,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
999,Mobclix,mobile,2008-03-01,USA,2008-09-01,0.000e+00,4,10014,0.000e+00,1,0.0,6682,6682,p:16924,c:10014,Mobclix,Computer Engineering,Santa Clara University,Masters,2008,3,1,2008,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1008,Fitbit,health,2007-10-01,USA,2008-10-10,6.807e+07,0,10015,2.000e+06,0,0.0,3619,3619,p:16930,c:10015,Fitbit,Computer Science,Harvard University,Other,2007,10,1,2008,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2718,MTPV,cleantech,2003-01-01,USA,2011-03-08,1.013e+07,1,100155,6.500e+06,0,0.0,25184,25184,p:108552,c:100155,Unaffiliated,applied sciences,Harvard University,Bachelors,2003,1,1,2011,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3182961,MESoft,software,2002-01-01,USA,2007-04-15,6.000e+06,0,9893,6.000e+06,1,0.0,3380,3380,p:16819,c:9893,Microsoft,Film & Television,"University of California, Los Angeles",Bachelors,2002,1,1,2007,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3182963,Amie Street,web,2011-07-04,USA,2007-08-01,3.900e+06,0,99,0.000e+00,0,0.0,307,9070,p:278,c:99,AmieStreet,Modern Culture and Media,Brown University,Bachelors,2011,7,4,2007,-4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3183115,wunderloop,advertising,1999-01-01,USA,2007-01-01,1.000e+07,1,992,1.000e+07,1,0.0,733,733,p:2676,c:992,wunderloop,Electrical Engineering,Berlin Institute of Technology.,Other,1999,1,1,2007,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3183371,asklaila,web,2006-11-01,IND,2007-09-01,1.000e+07,1,995,1.000e+07,1,0.0,735,735,p:2691,c:995,asklaila,Computer Science,IIT Kanpur,Bachelors,2006,11,1,2007,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0


# Modeling

In [None]:
X = investments.drop(['founded_year', 'first_funding_year', 'name', 'category_code', 'founded_at', 'country_code', 'first_funding_at', 'entity_id', 'funding_round_id_x', 'funding_year_difference', 'funding_round_id_y', 'raised_amount_usd', 'funding_total_usd', 'person_object_id', 'relationship_object_id', 'affiliation_name', 'subject', 'institution', 'degree_type', 'founded_month', 'founded_day'], axis = 1) 
y = investments['raised_amount_usd']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train) 
y_pred = linear_regressor.predict(X_test)
MSE_linear = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", MSE_linear)

Mean Squared Error: 183298376593634.22


In [None]:
import joblib

In [None]:
#DOWNLOAD THE MODEL 
from joblib import dump
joblib.dump(linear_regressor, 'regression_model.joblib')

NameError: name 'joblib' is not defined