# 0. Introduction

This notebook evaluates different ML models for predicting M&A deal prices.
We will compare:
- Linear Regression
- SVM
- XGBoost


Note that this notebook can also be run with the dummy dataset to get the results on the dummy-coded variables.

In [1]:
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.stats import loguniform, randint
from sklearn.base import clone
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [2]:
# NOTE(review): absolute, machine-specific path — anyone else running this notebook
# has to point DATA_PATH at their own copy of the modelling data.
DATA_PATH = r"C:\Users\32474\Documents\Ma TEW\masterthesis\data\modelling_data.xlsx"
df = pd.read_excel(DATA_PATH)

In [3]:
# Drop the "Unnamed: 0" column (presumably a saved row index — it is not used as a feature).
# Re-assigning instead of inplace=True, together with errors="ignore", keeps this cell
# idempotent: re-running it no longer raises a KeyError once the column is gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
# peek at the last rows to check that the data loaded as expected
df.tail()

Unnamed: 0,log_deal_price,deal_price,acquirer_investment_rounds,acquirer_funding_rounds,acquirer_milestones,acquirer_relationships,acquirer_founded_year,acquirer_offices,acquirer_is_usa,acquirer_is_bay,...,acquired_sector_communication,acquired_sector_consumer_discretionary,acquired_sector_financials,acquired_sector_health_care,acquired_sector_industrials,acquired_sector_other,acquired_age,acquisition_year,same_country,same_sector
2473,18.370608,95116040,0,0,0,1,1980,1,0,0,...,0,0,0,0,0,0,5,2013,0,1
2474,21.639557,2500000000,0,0,0,3,1985,1,0,0,...,0,0,0,0,0,1,13,2013,0,1
2475,19.218188,222000000,0,0,0,5,1994,1,1,0,...,0,0,0,1,0,0,13,2013,1,1
2476,23.981362,26000000000,5,0,2,14,1985,1,1,0,...,0,0,0,0,0,1,7,2007,0,0
2477,17.567365,42600000,0,5,5,21,2005,1,1,0,...,1,0,0,0,0,0,2,2013,1,0


In [5]:
# Timer function to record our training times
def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Called without a (truthy) ``start_time`` it returns the current
    ``datetime`` to be used as the start marker. Called with that marker
    it prints the elapsed hours, minutes and seconds and returns ``None``.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None
    # no start marker supplied: hand back "now" so the caller can pass it in later
    return datetime.now()

# 1. Preprocessing

In [6]:
# Select features and targets by column name instead of by position, so that a changed
# column layout fails loudly rather than silently leaking a price column into X.
TARGET_COLS = ["log_deal_price", "deal_price"]
# every column other than the two price columns is an independent variable
X = df.drop(columns=TARGET_COLS)
# standard (raw) deal price
y = df["deal_price"]
# log-scaled deal price
y_log = df["log_deal_price"]

In [7]:
# hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# align the log-scaled target with the split by reusing the row labels of y_train / y_test
train_labels, test_labels = y_train.index, y_test.index
y_log_train = y_log.loc[train_labels]
y_log_test = y_log.loc[test_labels]

In [9]:
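# NOTE: manual scaling is disabled; every model below is wrapped in a Pipeline that has its own StandardScaler step.
# # transform the data to be zero-mean and unit-variance normalized
# ss = StandardScaler()
# ss.fit(X_train)
# X_train_trans = ss.transform(X_train)
# X_test_trans = ss.transform(X_test)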

# 2. Modelling

## 2.1. Dummy regressor

In [10]:
# baseline: average 5-fold cross-validated MAE of a basic mean prediction
dummy_cv_scores = cross_val_score(
    DummyRegressor(),
    X_train,
    y_train,
    cv=5,
    scoring="neg_mean_absolute_error",
)
np.mean(abs(dummy_cv_scores)).round(0)

548240940.0

In [11]:
# fit the baseline on the training data and report its MAE on the held-out test set
dummy = DummyRegressor().fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=dummy_pred)

730325845.958947

## 2.2. Linear Regression

In [12]:
# create a pipeline combining the scaling and the OLS

In [13]:
# standardise the features, then fit ordinary least squares, as one estimator
lr_steps = [
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
]
lr_pipe = Pipeline(steps=lr_steps)

### 2.2.1. Log scaled

In [14]:
# Cross-validate the model trained on log prices, but score it on the normal price scale.
kf = KFold(n_splits=5)
score = []
for train_index, test_index in kf.split(X_train):
    # Fit a fresh clone per fold: fitting `lr_pipe` itself would leave the shared pipeline
    # trained on the last fold's log targets, which later cells could pick up by accident.
    fold_model = clone(lr_pipe)
    # kf.split yields positions, so index positionally with .iloc
    fold_model.fit(X_train.iloc[train_index], y_log_train.iloc[train_index])
    log_pred = fold_model.predict(X_train.iloc[test_index])
    # back-transform the log predictions before comparing with the raw deal prices
    pred = np.exp(log_pred)
    score.append(mean_absolute_error(y_train.iloc[test_index], pred))

In [15]:
# average the per-fold MAE values collected in `score`
np.mean(score).round(0)

170230580368.0

In [16]:
# Cross-validate the model trained on log prices and score it directly on the log scale.
kf = KFold(n_splits=5)
score = []
for train_index, test_index in kf.split(X_train):
    # Fit a fresh clone per fold so the shared `lr_pipe` is not left in a fold-specific state.
    fold_model = clone(lr_pipe)
    # kf.split yields positions, so index positionally with .iloc
    fold_model.fit(X_train.iloc[train_index], y_log_train.iloc[train_index])
    log_pred = fold_model.predict(X_train.iloc[test_index])
    score.append(mean_absolute_error(y_log_train.iloc[test_index], log_pred))

# show the average log-scale MAE; without this line the scores are computed but never displayed
np.mean(score).round(0)

### 2.2.2. Not log scaled

In [16]:
# average 5-fold cross-validated MAE of the OLS pipeline trained on the raw deal price
lr_cv_scores = cross_val_score(
    lr_pipe,
    X_train,
    y_train,
    cv=5,
    scoring="neg_mean_absolute_error",
)
np.mean(abs(lr_cv_scores)).round(0)

545626170.0

In [None]:
# train the OLS model on the full training set, with the raw deal price as target
# NOTE(review): this cell carries no execution count in the saved notebook; it has to run
# before the save/test cells that follow, because they use the fitted `lr_pipe`.
lr_pipe.fit(X_train, y_train)

In [17]:
# save this model; the context manager closes the file handle, which a bare open() left open
with open("lr_model3.sav", "wb") as model_file:
    pickle.dump(lr_pipe, model_file)

In [18]:
# report the MAE of the fitted OLS pipeline on the held-out test set
lr_pred = lr_pipe.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=lr_pred)

589108955.7166886

## 2.3. SVM

In [None]:
# standardise the features, then fit a support vector regressor, as one estimator
svr_steps = [
    ('scaler', StandardScaler()),
    ('svr', SVR()),
]
svr_pipe = Pipeline(steps=svr_steps)

In [22]:
# Search space for the randomized search over the SVR pipeline.
# Keys use the "<step>__<parameter>" form so they reach the 'svr' step of the pipeline.
# NOTE(review): the selected C (about 9.98e4 in the best_estimator_ output) sits at the
# upper edge of the 1e-2..1e5 range — the optimum may lie beyond it; consider widening.
param_svr = {
    # kernels to try
    "svr__kernel": ["linear", "rbf", "sigmoid"],
    # continuous parameters are drawn from log-uniform distributions over the given bounds
    "svr__gamma": loguniform(1e-5, 1e0),
    "svr__C": loguniform(1e-2, 1e5),
    "svr__epsilon":loguniform(1e-3, 1e2)
}

In [24]:
# randomized hyper-parameter search for the SVR pipeline (2000 candidates, 5-fold CV, scored by MAE)
svr_search = RandomizedSearchCV(
    estimator=svr_pipe,
    param_distributions=param_svr,
    n_iter=2000,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=33,
)

# time the search: the first timer call returns the start marker, the second prints the duration
svr_time = timer()
svr_search.fit(X_train, y_train)
timer(svr_time)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits

 Time taken: 0 hours 3 minutes and 55.7 seconds.


In [28]:
# show the pipeline configuration that the search selected
svr_search.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('svr',
                 SVR(C=99784.06088169578, epsilon=0.02124500203044543,
                     gamma=0.0007037942513643572, kernel='linear'))])

In [26]:
# best_score_ is a negated MAE (scoring="neg_mean_absolute_error"), so report its absolute value
svr_cv_mae = abs(svr_search.best_score_).round(0)
print(f"The SVR model obtains an average mae of {svr_cv_mae}")

The SVR model obtains an average mae of 377218224.0


In [29]:
# save this model; the context manager closes the file handle, which a bare open() left open
with open("svr_model3.sav", "wb") as model_file:
    pickle.dump(svr_search.best_estimator_, model_file)

In [34]:
# score the selected SVR model on the held-out test set; the score is negated MAE, hence abs()
svr_test_score = svr_search.score(X_test, y_test)
abs(svr_test_score).round(0)

551766971.0

## 2.4. XGBoost

In [31]:
# standardise the features, then fit a gradient-boosted tree regressor, as one estimator
xgb_steps = [
    ('scaler', StandardScaler()),
    ('xgb', XGBRegressor()),
]
xgb_pipe = Pipeline(steps=xgb_steps)

In [38]:
# Search space for the randomized search over the XGBoost pipeline.
# Keys use the "<step>__<parameter>" form so they reach the 'xgb' step of the pipeline.
# NOTE(review): in the best_estimator_ output `eta` and `lambda` appear next to
# `learning_rate` and `reg_lambda` with identical values, so they look like aliases —
# consider using the scikit-learn style names to avoid the duplication.
# NOTE(review): the selected max_depth (9) looks like the top of the sampled range —
# confirm whether deeper trees should be explored.
params_xgb = {
    # continuous parameters are drawn from log-uniform distributions over the given bounds
    "xgb__eta": loguniform(0.01, 0.5),
    "xgb__gamma": loguniform(1e-1, 1e3),
    # integer parameters are drawn with scipy's randint
    "xgb__max_depth": randint(1, 10),
    "xgb__min_child_weight": randint(1, 10),
    # row / column subsampling fractions to choose from
    "xgb__subsample": [0.5, 0.75, 1],
    "xgb__colsample_bylevel": [0.5, 0.75, 1],
    "xgb__lambda": randint(1, 10)
}

In [39]:
# randomized hyper-parameter search for the XGBoost pipeline (2000 candidates, 5-fold CV, scored by MAE)
xgb_search = RandomizedSearchCV(
    estimator=xgb_pipe,
    param_distributions=params_xgb,
    n_iter=2000,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=33,
)

# time the search: the first timer call returns the start marker, the second prints the duration
xgb_time = timer()
xgb_search.fit(X_train, y_train)
timer(xgb_time)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits

 Time taken: 0 hours 8 minutes and 1.75 seconds.


In [42]:
# show the hyper-parameter values that the search selected
xgb_search.best_params_

{'xgb__colsample_bylevel': 0.5,
 'xgb__eta': 0.014281468394888143,
 'xgb__gamma': 32.94546149518965,
 'xgb__lambda': 7,
 'xgb__max_depth': 9,
 'xgb__min_child_weight': 1,
 'xgb__subsample': 0.5}

In [41]:
# show the full pipeline configuration that the search selected
xgb_search.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=0.5, colsample_bynode=1,
                              colsample_bytree=1, enable_categorical=False,
                              eta=0.014281468394888143, gamma=32.94546149518965,
                              gpu_id=-1, importance_type=None,
                              interaction_constraints='', lambda=7,
                              learning_rate=0.0142814685, max_delta_step=0,
                              max_depth=9, min_child_weight=1, missing=nan,
                              monotone_constraints='()', n_estimators=100,
                              n_jobs=16, num_parallel_tree=1, predictor='auto',
                              random_state=0, reg_alpha=0, reg_lambda=7,
                              scale_pos_weight=1, subsample=0.5,
                              tree_method='exact', validate_

In [43]:
# best_score_ is a negated MAE (scoring="neg_mean_absolute_error"), so report its absolute value
xgb_cv_mae = abs(xgb_search.best_score_).round(0)
print(f"The XGBoost model obtains an average mae of {xgb_cv_mae}")

The XGBoost model obtains an average mae of 354126647.0


In [44]:
# save this model; the context manager closes the file handle, which a bare open() left open
with open("xgb_model3.sav", "wb") as model_file:
    pickle.dump(xgb_search.best_estimator_, model_file)

In [45]:
# score the selected XGBoost model on the held-out test set; the score is negated MAE, hence abs()
xgb_test_score = xgb_search.score(X_test, y_test)
abs(xgb_test_score).round(0)

528058600.0