In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import pickle

In [2]:
data = pd.read_csv("companies.csv")

## Data Cleaning:

##### Deleting columns with redundancy, granularity, and irrelevant information:

In [3]:
columns_to_drop = ["created_at", "updated_at", "name", "region", "city", "state_code", 'domain', 'homepage_url', 'twitter_username', 'logo_url', 'logo_width', 'logo_height', 'short_description', 'description', 'overview','tag_list', 'normalized_name', 'permalink', "lat", "lng", "first_funding_at", "last_funding_at", 'id', 'Unnamed: 0.1', 'entity_type', 'entity_id', 'parent_id', 'created_by', "first_investment_at", "last_investment_at", "first_milestone_at", "last_milestone_at", "invested_companies"]
data.drop(columns_to_drop, inplace=True, axis=1)

print(data.columns)
print()
print(data.shape)

Index(['category_code', 'status', 'founded_at', 'closed_at', 'country_code',
       'investment_rounds', 'funding_rounds', 'funding_total_usd',
       'milestones', 'relationships', 'ROI'],
      dtype='object')

(196553, 11)


##### Deleting instances with missing values for specific columns:

In [4]:
# Checking number of null values for target columns
target_columns = ["status", "country_code", "category_code", "founded_at"]
print(data[target_columns].isnull().sum())
print()

# Dropping null values from target columns
data.dropna(subset=target_columns, inplace=True)

# Checking number of null values for target columns after removing
print(data[target_columns].isnull().sum())
print()
print(data.shape)

status                0
country_code     108563
category_code     73367
founded_at       105326
dtype: int64

status           0
country_code     0
category_code    0
founded_at       0
dtype: int64

(64099, 11)


##### Filling null values in data with mean, median, mode:

In [5]:
print(data.isnull().sum())
print()

target_columns = ["investment_rounds", "funding_rounds", ]
for column in target_columns:
    median_current_column = data[column].median()
    data[column].fillna(median_current_column, inplace=True)

target_columns = ["funding_total_usd", "relationships", "ROI"]
for column in target_columns:
    mean_current_column = data[column].mean()
    data[column].fillna(mean_current_column, inplace=True)

target_columns = ["milestones"]
for column in target_columns:
    mode_current_column = data[column].mode()[0]
    data[column].fillna(mode_current_column, inplace=True)
    
print(data.isnull().sum())

category_code            0
status                   0
founded_at               0
closed_at            62438
country_code             0
investment_rounds    63350
funding_rounds       41208
funding_total_usd    43630
milestones           28796
relationships        15657
ROI                  63556
dtype: int64

category_code            0
status                   0
founded_at               0
closed_at            62438
country_code             0
investment_rounds        0
funding_rounds           0
funding_total_usd        0
milestones               0
relationships            0
ROI                      0
dtype: int64


## Date Transformation:

##### Converting date columns to year only:

In [6]:
# Function to extract year from date for a given column (feature)
def extract_year(data, feature):
    return pd.to_datetime(data[feature], format="%Y-%m-%d").dt.year

# Converting target columns dates to year only
target_columns = ["founded_at", "closed_at"]
for column in target_columns:
    data[column] = extract_year(data, column)
    
data.head()

Unnamed: 0,category_code,status,founded_at,closed_at,country_code,investment_rounds,funding_rounds,funding_total_usd,milestones,relationships,ROI
0,web,operating,2005,,USA,1.0,3.0,39750000.0,5.0,17.0,15.5
5,advertising,operating,2007,,MAR,1.0,1.0,15819780.0,1.0,2.0,14.41045
6,cleantech,operating,2008,,IND,1.0,1.0,15819780.0,1.0,4.43452,14.41045
12,advertising,operating,2008,,USA,1.0,1.0,15819780.0,1.0,2.0,14.41045
13,web,acquired,2007,,USA,1.0,1.0,5000000.0,3.0,14.0,9.5


In [7]:
data.shape

(64099, 11)

##### Creating a new column "active_days" from "closed_at" and "founded_at":

In [8]:
# Filling null values in "closed_at" column for calculation of age of company
mode_closed_at = data["closed_at"].mode()[0]
data.loc[(pd.isnull(data["closed_at"]) & (data["status"] == "operating") | (data["status"] == "ipo")), "closed_at"] = 2021
data.loc[(pd.isnull(data["closed_at"]) & (data["status"] == "closed") | (data["status"] == "acquired")), "closed_at"] = mode_closed_at

# Creating column "active_days" from "closed_at" and "founded_at" columns
data["active_days"] = (data["closed_at"] - data["founded_at"]) * 365

# Removing rows with negative value of active days
neg_active_days_index = np.where(data['active_days'] < 0)
data.drop(data.index[neg_active_days_index], axis=0, inplace=True)

# Dropping columns "closed_at" and "founded_at" as now they are of no use
target_columns = ["closed_at", "founded_at"]
data.drop(target_columns, axis=1, inplace=True)

data.head()

Unnamed: 0,category_code,status,country_code,investment_rounds,funding_rounds,funding_total_usd,milestones,relationships,ROI,active_days
0,web,operating,USA,1.0,3.0,39750000.0,5.0,17.0,15.5,5840.0
5,advertising,operating,MAR,1.0,1.0,15819780.0,1.0,2.0,14.41045,5110.0
6,cleantech,operating,IND,1.0,1.0,15819780.0,1.0,4.43452,14.41045,4745.0
12,advertising,operating,USA,1.0,1.0,15819780.0,1.0,2.0,14.41045,4745.0
13,web,acquired,USA,1.0,1.0,5000000.0,3.0,14.0,9.5,1825.0


In [9]:
data.shape

(64084, 10)

#####  Deleting outliers for columns 'funding_total_usd' and 'funding_rounds':

In [10]:
target_columns = ["funding_total_usd", "funding_rounds"] # one or more

Q1 = data[target_columns].quantile(0.25)
Q3 = data[target_columns].quantile(0.75)
IQR = Q3 - Q1

data = data[~((data[target_columns] < (Q1 - 1.5 * IQR)) | (data[target_columns] > (Q3 + 1.5 * IQR))).any(axis=1)]
data.shape

(43582, 10)

##### Encoding columns "category_code" and "country_code" (One-Hot):

In [11]:
# Extacting top 10 frequent categories
top_categorical = list(data["category_code"].value_counts().sort_values(ascending=False).head(10).index)

# Keep category same if it is in top 10, change to "other" otherwise
data.loc[~data["category_code"].isin(top_categorical), "category_code"] = "other"

# Extacting top 10 frequent categories
top_country = list(data["country_code"].value_counts().sort_values(ascending=False).head(10).index)

# Keep category same if it is in top 10, change to "other" otherwise
data.loc[~data["country_code"].isin(top_country), "country_code"] = "other"

# Creating one hot encoder
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Setting target columns
target_columns = ["category_code", "country_code"]

# Applying one hot encoding to each target column
one_hot_columns = pd.DataFrame(one_hot_encoder.fit_transform(data[target_columns]))

# One hot encoding remove index, putting it back
one_hot_columns.index = data.index

# Updating names of one hot encoded columns
array = ["category_code_" + column for column in top_categorical]
array.extend(["country_code_" + column for column in top_country])
array.append("country_code_other")
one_hot_columns.columns = array

# Removing categorical columns (Will replace with one hot encoded columns)
numerical_data = data.drop(target_columns, axis=1)

# Adding one hot encoded columns to data
data = pd.concat([numerical_data, one_hot_columns], axis=1)

data.head()

Unnamed: 0,status,investment_rounds,funding_rounds,funding_total_usd,milestones,relationships,ROI,active_days,category_code_software,category_code_web,...,country_code_GBR,country_code_IND,country_code_CAN,country_code_DEU,country_code_AUS,country_code_FRA,country_code_ESP,country_code_NLD,country_code_ISR,country_code_other
5,operating,1.0,1.0,15819780.0,1.0,2.0,14.41045,5110.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,operating,1.0,1.0,15819780.0,1.0,4.43452,14.41045,4745.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12,operating,1.0,1.0,15819780.0,1.0,2.0,14.41045,4745.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15,operating,1.0,1.0,15819780.0,1.0,3.0,14.41045,4745.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
18,acquired,1.0,1.0,15819780.0,4.0,9.0,14.41045,1460.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


##### Working on target variable:

In [12]:
data["status"] = data["status"].map({"operating": 1, "ipo": 1, "acquired": 0, "closed": 0})
data["status"].unique()

array([1, 0], dtype=int64)

##### Removing duplicates from dataset:

In [13]:
print(data.duplicated().sum())
data.drop_duplicates(inplace=True)
print(data.duplicated().sum())

30705
0


## Final Dataset:

In [14]:
data

Unnamed: 0,status,investment_rounds,funding_rounds,funding_total_usd,milestones,relationships,ROI,active_days,category_code_software,category_code_web,...,country_code_GBR,country_code_IND,country_code_CAN,country_code_DEU,country_code_AUS,country_code_FRA,country_code_ESP,country_code_NLD,country_code_ISR,country_code_other
5,1,1.0,1.0,1.581978e+07,1.0,2.00000,14.410450,5110.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,1,1.0,1.0,1.581978e+07,1.0,4.43452,14.410450,4745.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12,1,1.0,1.0,1.581978e+07,1.0,2.00000,14.410450,4745.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15,1,1.0,1.0,1.581978e+07,1.0,3.00000,14.410450,4745.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
18,0,1.0,1.0,1.581978e+07,4.0,9.00000,14.410450,1460.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196499,1,1.0,1.0,1.581978e+07,1.0,6.00000,14.410450,16425.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
196519,0,1.0,1.0,1.450000e+07,3.0,19.00000,24.137931,3285.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
196524,0,1.0,1.0,1.581978e+07,1.0,4.00000,14.410450,2555.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
196548,1,1.0,1.0,1.581978e+07,2.0,5.00000,14.410450,5110.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Preparation for Modeling:

##### Splitting dataset:

In [15]:
X = data.drop(labels=["status"], axis=1)
y = data["status"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

##### Scaling dataset:

In [16]:
# Initiate scaler
standard_scaler = StandardScaler()

# Standardize the training dataset
X_train_scaled = pd.DataFrame(standard_scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)

# Standardized the testing dataset
X_test_scaled = pd.DataFrame(standard_scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# Summary statistics after standardization
X_train_scaled.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
investment_rounds,10301.0,6.456358e-16,1.000049,-0.057783,-0.057783,-0.057783,-0.057783,69.309903
funding_rounds,10301.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
funding_total_usd,10301.0,-1.873293e-16,1.000049,-15.13268,0.067987,0.067987,0.067987,9.042494
milestones,10301.0,6.995019e-16,1.000049,-0.513339,-0.513339,-0.513339,1.02429,8.712435
relationships,10301.0,6.333046e-17,1.000049,-0.246749,-0.195049,-0.091649,-0.039949,61.172864
ROI,10301.0,-1.321229e-15,1.000049,-34.082879,0.015468,0.015468,0.015468,52.29288
active_days,10301.0,1.521936e-16,1.000049,-1.223435,-0.538301,-0.289161,0.209118,6.250753
category_code_software,10301.0,-8.169209e-16,1.000049,-0.285429,-0.285429,-0.285429,-0.285429,3.503496
category_code_web,10301.0,-6.172241e-16,1.000049,-0.270017,-0.270017,-0.270017,-0.270017,3.703473
category_code_other,10301.0,5.529452e-16,1.000049,-0.302962,-0.302962,-0.302962,-0.302962,3.300745


## Logistic Regression Model:

##### Baseline model:

In [17]:
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)
print("Train accuracy:", lr_model.score(X_train_scaled, y_train) * 100)
print("Test accuracy:", lr_model.score(X_test_scaled, y_test) * 100)

Train accuracy: 87.24395689738861
Test accuracy: 86.1413043478261


##### Hyperparameter tuning (GridSearchCV):

In [18]:
# define models and parameters
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=lr_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train_scaled, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.872181 using {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.872181 (0.004175) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.872181 (0.004175) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.872181 (0.004175) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.872116 (0.004217) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.872116 (0.004217) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.872116 (0.004217) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.872019 (0.004217) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.872019 (0.004217) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.871890 (0.004144) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.871080 (0.004070) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.871080 (0.004070) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.870821 (0.004021) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.862052 (0.002584) wit

##### With best parameters:

In [19]:
lr_model = LogisticRegression(C=100, solver="newton-cg", penalty="l2")
lr_model.fit(X_train_scaled, y_train)
print("Train accuracy:", lr_model.score(X_train_scaled, y_train) * 100)
print("Test accuracy:", lr_model.score(X_test_scaled, y_test) * 100)

Train accuracy: 87.25366469274827
Test accuracy: 86.1024844720497


##### Creating pipeline:

In [20]:
lr_pipeline = Pipeline([('standard_scalar', StandardScaler()), ('lr', lr_model)])
lr_pipeline.fit(X_train, y_train)
print("Train accuracy:", lr_pipeline.score(X_train, y_train) * 100)
print("Test accuracy:", lr_pipeline.score(X_test, y_test) * 100)

Train accuracy: 87.25366469274827
Test accuracy: 86.1024844720497


##### Saving model:

In [21]:
with open('lr_model.pkl', 'wb') as out_file:
    pickle.dump(lr_pipeline, out_file)