Analysing with Decision tree
Setup: place this notebook in the same folder as "unifed_csv_without_duplicated_company.csv" and "ROI.csv" (the latter is generated by the get_roi notebook).

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split   
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from IPython.display import Image
import pydotplus
import graphviz
from sklearn.model_selection import GridSearchCV

# Optional
# import libraries for visualization
from sklearn import tree
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# Load the ROI table and the de-duplicated company table from the working directory.
ROI_df = pd.read_csv("ROI.csv")
df = pd.read_csv("unifed_csv_without_duplicated_company.csv")

# Print the per-column summary of the company table; max_cols is raised so the
# wide frame is listed column by column rather than truncated.
df.info(max_cols=1000)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37198 entries, 0 to 37197
Data columns (total 104 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   company_uuid                        37198 non-null  object 
 1   company_name                        347 non-null    object 
 2   country_code                        33609 non-null  object 
 3   region                              33609 non-null  object 
 4   city                                33609 non-null  object 
 5   status                              37198 non-null  object 
 6   category_groups_list                37198 non-null  object 
 7   num_funding_rounds                  12036 non-null  float64
 8   total_funding_usd                   9093 non-null   float64
 9   founded_on                          37198 non-null  object 
 10  last_funding_on                     12032 non-null  object 
 11  closed_on                           591 

In [3]:
# Build the modelling frame from a fixed subset of the company table:
# the company id, its status, its funding-round count and the selected
# category-group columns. `.copy()` detaches the result from `df`.
company_columns = ["company_uuid", "status", "num_funding_rounds"]
category_columns = [
    "Financial Services",
    "FinTech",
    "Finance",
    "Blockchain",
    "Information Technology",
    "Software",
    "Payments",
    "Cryptocurrency",
    "Venture Capital",
    "Internet",
    "Banking",
    "Consulting",
    "Mobile Payments",
    "E-Commerce",
    "Insurance",
    "Bitcoin",
    "Artificial Intelligence",
    "Mobile",
    "Crowdfunding",
    "SaaS",
    "Real Estate",
    "Apps",
    "Personal Finance",
    "Accounting",
    "Mobile Apps",
    "Asset Management",
    "Marketplace",
    "Lending",
    "Big Data",
    "Machine Learning",
]
features = df[company_columns + category_columns].copy()

In [4]:
# Encode company status as an integer and treat missing funding-round counts as zero.
#
# The encoded column is assigned back explicitly instead of calling
# `features['status'].replace(..., inplace=True)`: an in-place method on a
# column selection is chained assignment, which pandas warns about and which
# does not update `features` when Copy-on-Write is enabled.
status_codes = {"operating": 1, "acquired": 3, "closed": -1, "ipo": 2}

# Values outside the mapping are passed through unchanged (as `replace` did),
# which also keeps this cell safe to re-run on already-encoded data.
features["status"] = features["status"].map(
    lambda status: status_codes.get(status, status)
)
features["num_funding_rounds"] = features["num_funding_rounds"].fillna(0)

In [5]:
# Attach the ROI target (from ROI.csv, produced by ROI.ipynb) to the feature frame.
# Only the columns not listed here are kept from the ROI table.
unused_roi_columns = [
    "Unnamed: 0",
    "investment_delta",
    "norm_amount_delta",
    "max_norm_amount_delta",
]
ROI_df = ROI_df.drop(columns=unused_roi_columns)

# Left-join on the company id: every company row is kept, and companies with
# no ROI record get NaN in the joined columns.
roi_by_company = ROI_df.set_index("org_uuid")
features = features.set_index("company_uuid").join(roi_by_company)

# Companies without an ROI record are given a mean normalised delta of 0.
features["mean_norm_amount_delta"] = features["mean_norm_amount_delta"].fillna(0)
features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37198 entries, 0001a6ec-e7e9-4d1b-8a77-adb5ac815420 to ffffabce-6d4a-b3d1-13c0-4e90cedf5270
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   status                   37198 non-null  int64  
 1   num_funding_rounds       37198 non-null  float64
 2   Financial Services       37198 non-null  int64  
 3   FinTech                  37198 non-null  int64  
 4   Finance                  37198 non-null  int64  
 5   Blockchain               37198 non-null  int64  
 6   Information Technology   37198 non-null  int64  
 7   Software                 37198 non-null  int64  
 8   Payments                 37198 non-null  int64  
 9   Cryptocurrency           37198 non-null  int64  
 10  Venture Capital          37198 non-null  int64  
 11  Internet                 37198 non-null  int64  
 12  Banking                  37198 non-null  int64  
 13  Consulting     

In [6]:
# Define the independent variables / attributes / features: every column
# except the ROI target.
x = features.drop(columns="mean_norm_amount_delta")
roi = features["mean_norm_amount_delta"]

# Turn "mean_norm_amount_delta" into a 0/1 label: 1 when the value lies above
# the median, 0 otherwise.
#
# This is the same split `pd.qcut(roi, 2, labels=[0, 1])` produces, but qcut
# builds its bin edges from the data quantiles (min, median, max) and raises
# "Bin edges must be unique" as soon as the minimum or maximum coincides with
# the median. Cutting on explicit, always-distinct edges around the median
# avoids that failure and still returns an ordered categorical.
# NOTE(review): when many values tie at the median the two classes are not
# balanced - confirm a median split is the intended labelling.
binary_labels = [0, 1]
y = pd.cut(roi, bins=[-np.inf, roi.median(), np.inf], labels=binary_labels)
y

company_uuid
0001a6ec-e7e9-4d1b-8a77-adb5ac815420    0
0003f244-79d0-6178-353e-33dabaf3b2c6    0
00057beb-5724-c809-5cd2-a0a7c6b017aa    0
000ab460-8462-450d-ab34-b3c54fc252fd    0
000ad7a8-b868-f301-5f00-2a3361288fc9    0
                                       ..
fff9249a-baea-49a9-baca-a57b28ee7e6f    0
fffa7546-aba8-5bbf-11a9-105d61b0dd0e    0
fffbd05f-57b7-4fdc-94a6-be656719db8c    0
fffe3b0b-8545-4414-9c0e-a3f961c4e420    0
ffffabce-6d4a-b3d1-13c0-4e90cedf5270    1
Name: mean_norm_amount_delta, Length: 37198, dtype: category
Categories (2, int64): [0 < 1]

In [7]:
# Implement the decision tree: hold out 30% of the rows for evaluation and fit
# the classifier on the remaining 70%.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=5)

# Seed the tree as well as the split. scikit-learn permutes the candidate
# features at every split, so without a random_state equally good splits can be
# chosen differently from run to run and the reported metrics are not
# reproducible.
model = DecisionTreeClassifier(random_state=5)
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [8]:
# Predict the held-out rows with the fitted model.
pred_test = model.predict(X_test)

# Report the fraction of correct predictions, the per-class
# precision/recall/F1 table, and the confusion matrix.
test_accuracy = accuracy_score(y_test, pred_test)
print(f"Accuracy:{test_accuracy}")
print(f"Classification Report:\n{classification_report(y_test, pred_test)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, pred_test)}")

Accuracy:0.9447132616487455
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     10431
           1       0.60      0.46      0.52       729

    accuracy                           0.94     11160
   macro avg       0.78      0.72      0.74     11160
weighted avg       0.94      0.94      0.94     11160

Confusion Matrix:
[[10210   221]
 [  396   333]]
