In [1]:
# Import our dependencies

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN
from sklearn.decomposition import PCA


In [2]:
import boto3
import s3fs
import awscli
import pandas as pd

In [3]:
# Open an S3 resource handle (no credentials or region are passed explicitly here).
s3 = boto3.resource('s3')

# Print the name of every bucket returned by the handle.
for bucket in s3.buckets.all():
    print(bucket.name)

team3-final-bucket
wshih-bucket


In [4]:
# Low-level S3 client; it is not referenced again in the visible cells.
client = boto3.client('s3')
# S3 URI of the cleaned objects CSV that the next cell loads with pandas.
path = 's3://team3-final-bucket/Resources_clean/objects_clean.csv'

In [5]:
# Load the cleaned objects table from S3, using the first CSV column as the index.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the recorded output of this cell ends with the tail of a warning,
# presumably pandas' mixed-types DtypeWarning - confirm it is gone after re-running.
objects_clean = pd.read_csv(path, index_col=[0], low_memory=False)
objects_clean.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,object_id,entity_type,parent_id,name,category_code,status,founded_at,closed_at,country_code,state_code,...,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships
0,c:1,Company,,Wetpaint,web,operating,2005-10-17,,USA,WA,...,0,0,2005-10-01,2008-05-19,3,39750000.0,2010-09-05,2013-09-18,5,17
1,c:10,Company,,Flektor,games_video,acquired,,,USA,CA,...,0,0,,,0,0.0,,,0,6
2,c:100,Company,,There,games_video,acquired,,,USA,CA,...,0,0,,,0,0.0,2003-02-01,2011-09-23,4,12
3,c:10000,Company,,MYWEBBO,network_hosting,operating,2008-07-26,,,,...,0,0,,,0,0.0,,,0,0
4,c:10001,Company,,THE Movie Streamer,games_video,operating,2008-07-26,,,,...,0,0,,,0,0.0,,,0,0


In [6]:
# Exploring the data

In [7]:
# Pull into one DataFrame all attributes that share the same meaning but are spread across multiple tables,
# such as the funding amount, which has to be collected for each entity_type from different tables

In [8]:
# Display the table for reference.
# NOTE(review): this cell was headed "Dropping non-features columns", but it drops nothing;
# the columns used for modelling are picked later with an explicit column list.
objects_clean

Unnamed: 0,object_id,entity_type,parent_id,name,category_code,status,founded_at,closed_at,country_code,state_code,...,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships
0,c:1,Company,,Wetpaint,web,operating,2005-10-17,,USA,WA,...,0,0,2005-10-01,2008-05-19,3,39750000.0,2010-09-05,2013-09-18,5,17
1,c:10,Company,,Flektor,games_video,acquired,,,USA,CA,...,0,0,,,0,0.0,,,0,6
2,c:100,Company,,There,games_video,acquired,,,USA,CA,...,0,0,,,0,0.0,2003-02-01,2011-09-23,4,12
3,c:10000,Company,,MYWEBBO,network_hosting,operating,2008-07-26,,,,...,0,0,,,0,0.0,,,0,0
4,c:10001,Company,,THE Movie Streamer,games_video,operating,2008-07-26,,,,...,0,0,,,0,0.0,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462646,r:9995,Product,c:14164,"SiteLink, listing feed for Brokerages",,operating,,,,,...,0,0,,,0,0.0,,,0,0
462647,r:9996,Product,c:14164,"EDCLink, listing feed for Economic Development...",,operating,,,,,...,0,0,,,0,0.0,,,0,0
462648,r:9997,Product,c:14164,"Cmail, broadcast email marketing",,operating,,,,,...,0,0,,,0,0.0,,,0,0
462649,r:9998,Product,c:14164,"CatylistCRM, contact database",,operating,,,,,...,0,0,,,0,0.0,,,0,0


In [9]:
# Merging with other tables that have important features missing in this table

In [10]:
# Clean the data types: set all categorical columns to string and handle NaNs and zeros

In [11]:
# Define the target and prepare it for 0/1 labels.
# The goal is to predict start-up failure, so the "status" column is the target.
# Status takes these values: acquired, alpha, beta, closed, development, ipo, live, operating, private.
# "closed" is treated as failure; every other status counts as non-failure.
model_columns = ['entity_type', 'category_code', 'status', 'region', 'funding_total_usd']
# Work on an independent copy so later relabelling does not touch objects_clean.
objects_clean_copy = objects_clean.loc[:, model_columns].copy()


In [12]:
# Check the dtypes of the selected columns.
objects_clean_copy.dtypes

entity_type           object
category_code         object
status                object
region                object
funding_total_usd    float64
dtype: object

In [13]:
# List the distinct status values that need to be mapped to a binary label.
objects_clean_copy['status'].unique()

array(['operating', 'acquired', 'closed', 'ipo', 'live', 'beta',
       'private', 'alpha', 'development'], dtype=object)

In [14]:
# Encode the target as binary labels: 'closed' is the failure class ('0'),
# every other status is non-failure ('1'). The labels stay strings here; a
# later cell converts the column with pd.to_numeric.
status_labels = {
    'closed': '0',
    'operating': '1',
    'acquired': '1',
    'ipo': '1',
    'live': '1',
    'beta': '1',
    'private': '1',
    'alpha': '1',
    'development': '1',
}
# Assign the result back rather than calling replace(..., inplace=True) on the
# selected column: the in-place form acts on an intermediate Series, is
# deprecated in recent pandas, and leaves the frame unchanged under
# copy-on-write. Assigning also keeps the cell safe to re-run.
objects_clean_copy['status'] = objects_clean_copy['status'].replace(status_labels)

In [15]:
# Preview the first 20 rows after relabelling the status column.
objects_clean_copy.head(20)

Unnamed: 0,entity_type,category_code,status,region,funding_total_usd
0,Company,web,1,Seattle,39750000.0
1,Company,games_video,1,Los Angeles,0.0
2,Company,games_video,1,SF Bay,0.0
3,Company,network_hosting,1,unknown,0.0
4,Company,games_video,1,unknown,0.0
5,Company,advertising,1,Agadir,0.0
6,Company,cleantech,1,Vadodara,0.0
7,Company,,1,unknown,0.0
8,Company,advertising,1,New York,0.0
9,Company,enterprise,1,unknown,0.0


In [16]:
# Convert the '0'/'1' string labels in status to a numeric dtype.
objects_clean_copy['status'] = pd.to_numeric(objects_clean_copy['status'])
# Confirm the resulting dtypes.
objects_clean_copy.dtypes

entity_type           object
category_code         object
status                 int64
region                object
funding_total_usd    float64
dtype: object

In [26]:
# Separate the features from the target.
# status is the target; funding_total_usd and region are also left out of the features.
non_feature_columns = ['status', 'funding_total_usd', 'region']
X = objects_clean_copy.drop(columns=non_feature_columns)
y = objects_clean_copy['status'].to_numpy()

In [27]:
# Preview the feature columns.
X.head()

Unnamed: 0,entity_type,category_code
0,Company,web
1,Company,games_video
2,Company,games_video
3,Company,network_hosting
4,Company,games_video


In [28]:
# Show the target array.
y

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [29]:
# List the feature column names that will be one-hot encoded.
X.columns.to_list()

['entity_type', 'category_code']

In [31]:
# One-hot encode the categorical features: one indicator column per category value.
categorical_columns = ['entity_type', 'category_code']
data_encoded_df = pd.get_dummies(X, columns=categorical_columns)
# Display the encoded frame.
data_encoded_df

Unnamed: 0,entity_type_Company,entity_type_FinancialOrg,entity_type_Person,entity_type_Product,category_code_advertising,category_code_analytics,category_code_automotive,category_code_biotech,category_code_cleantech,category_code_consulting,...,category_code_real_estate,category_code_search,category_code_security,category_code_semiconductor,category_code_social,category_code_software,category_code_sports,category_code_transportation,category_code_travel,category_code_web
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462646,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
462647,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
462648,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
462649,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# List the column names produced by the one-hot encoding.
data_encoded_df.columns.to_list()

['entity_type_Company',
 'entity_type_FinancialOrg',
 'entity_type_Person',
 'entity_type_Product',
 'category_code_advertising',
 'category_code_analytics',
 'category_code_automotive',
 'category_code_biotech',
 'category_code_cleantech',
 'category_code_consulting',
 'category_code_design',
 'category_code_ecommerce',
 'category_code_education',
 'category_code_enterprise',
 'category_code_fashion',
 'category_code_finance',
 'category_code_games_video',
 'category_code_government',
 'category_code_hardware',
 'category_code_health',
 'category_code_hospitality',
 'category_code_legal',
 'category_code_local',
 'category_code_manufacturing',
 'category_code_medical',
 'category_code_messaging',
 'category_code_mobile',
 'category_code_music',
 'category_code_nanotech',
 'category_code_network_hosting',
 'category_code_news',
 'category_code_nonprofit',
 'category_code_other',
 'category_code_pets',
 'category_code_photo_video',
 'category_code_public_relations',
 'category_code_real

In [33]:
# Use the one-hot encoded frame as the feature matrix X.
# NOTE(review): funding_total_usd is not part of X - it was dropped when X was first built
# and is not added back here.
# NOTE(review): this rebinds X to the encoded frame, so re-running the encoding cell after
# this one would no longer find the original categorical columns.
X = data_encoded_df

In [34]:
# Split the data into train and test sets.
# The failure class (0) is a very small share of the samples, so stratify on y
# to keep the same class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)


In [35]:
# Scale the features.
# Fit the scaler on the training split only, so that test-set statistics do not
# leak into the transformation, then apply the same transformation to both splits.
data_scaler = StandardScaler()
X_train_scaled = data_scaler.fit_transform(X_train)
X_test_scaled = data_scaler.transform(X_test)
# Kept so that cells still referencing X_scaled keep working; it is the full
# feature matrix transformed with the train-fitted scaler.
X_scaled = data_scaler.transform(X)

In [36]:
# Count how many training samples fall in each target class.
Counter(y_train)

Counter({1: 344912, 0: 2076})

In [None]:
# Resample
# Rebalance the training split only. Resampling the full dataset would place
# test rows (and synthetic points derived from them) in the training data and
# make any evaluation on X_test unreliable.
smote_enn = SMOTEENN(random_state=1)
X_train_scaled = data_scaler.transform(X_train)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)
# Class counts after resampling.
Counter(y_resampled)

In [None]:
# Train
