In [2]:
import pandas as pd
import numpy as np

# Models to use
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from xgboost import XGBClassifier
import catboost as cb
from sklearn.svm import SVC

# Importing the metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score

# For measuring the training time taken during the fit process
import time

## Reading & Correcting the Data

In [3]:
df = pd.read_csv('data.csv')
df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,class,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet1pt,jet1eta,jet1phi,jet1b-tag,...,jet4eta,jet4phi,jet4b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
1,1,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
2,0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
3,1,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487
4,0,1.595839,-0.607811,0.007075,1.81845,-0.111906,0.84755,-0.566437,1.581239,2.173076,...,-0.654227,-1.274345,3.101961,0.823761,0.938191,0.971758,0.789176,0.430553,0.961357,0.957818


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98050 entries, 0 to 98049
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   class                     98050 non-null  int64  
 1   lepton_pT                 98050 non-null  float64
 2   lepton_eta                98050 non-null  float64
 3   lepton_phi                98050 non-null  float64
 4   missing_energy_magnitude  98050 non-null  float64
 5   missing_energy_phi        98050 non-null  float64
 6   jet1pt                    98050 non-null  float64
 7   jet1eta                   98050 non-null  float64
 8   jet1phi                   98050 non-null  float64
 9   jet1b-tag                 98050 non-null  float64
 10  jet2pt                    98050 non-null  float64
 11  jet2eta                   98050 non-null  float64
 12  jet2phi                   98050 non-null  float64
 13  jet2b-tag                 98050 non-null  float64
 14  jet3pt

Some columns are casted as objects instead of as floats. We need to change it but let me correct one thing first.

The last row includes '?' characters for for some features. Since it is only one row, I will drop it from the dataframe.

In [5]:
df.tail()

Unnamed: 0,class,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet1pt,jet1eta,jet1phi,jet1b-tag,...,jet4eta,jet4phi,jet4b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
98045,1,0.908091,-0.825006,-0.830871,0.736298,1.512713,0.881811,-0.36344,0.006813,1.086538,...,-0.187013,0.716784954071045,0,1.31768071651459,1.01079499721527,0.985962450504303,0.957878351211548,1.45467174053192,0.903936803340912,0.786069393157959
98046,0,2.512898,0.530759,-1.470626,1.641798,1.613386,0.934027,1.521958,-0.049836,0.0,...,0.053673,-0.386482894420624,0,0.774412572383881,0.745262801647186,0.99708765745163,1.39238667488098,0.86468094587326,1.17978250980377,1.08616733551025
98047,0,0.903699,0.261943,-0.429149,1.892855,0.313687,0.493396,-1.494282,-1.458506,0.0,...,-0.283621,1.1107724905014,3.10196137428284,0.527038097381592,0.60726273059845,1.12528610229492,0.634106457233429,0.115543350577354,0.425828188657761,0.692506015300751
98048,0,0.566047,-0.317568,0.062561,0.358186,-1.315823,0.691176,1.154583,-0.242759,2.173076,...,-0.844943,-0.294922441244125,0,0.747239172458649,1.0089750289917,0.989497303962708,1.20314705371857,0.8924919962883,1.09080731868744,0.888965249061584
98049,1,0.708611,1.190136,0.593008,1.899366,-0.111872,0.871276,0.982283,-1.684691,2.173076,...,1.797605,?,?,?,?,?,?,?,?,?


In [6]:
(df == '?').sum()

class                       0
lepton_pT                   0
lepton_eta                  0
lepton_phi                  0
missing_energy_magnitude    0
missing_energy_phi          0
jet1pt                      0
jet1eta                     0
jet1phi                     0
jet1b-tag                   0
jet2pt                      0
jet2eta                     0
jet2phi                     0
jet2b-tag                   0
jet3pt                      0
jet3eta                     0
jet3phi                     0
jet3b-tag                   0
jet4pt                      0
jet4eta                     0
jet4phi                     1
jet4b-tag                   1
m_jj                        1
m_jjj                       1
m_lv                        1
m_jlv                       1
m_bb                        1
m_wbb                       1
m_wwbb                      1
dtype: int64

In [7]:
df = df.iloc[:98049]

I will now correct the column datatypes.

In [8]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df.loc[:,col].astype(np.float64)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98049 entries, 0 to 98048
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   class                     98049 non-null  int64  
 1   lepton_pT                 98049 non-null  float64
 2   lepton_eta                98049 non-null  float64
 3   lepton_phi                98049 non-null  float64
 4   missing_energy_magnitude  98049 non-null  float64
 5   missing_energy_phi        98049 non-null  float64
 6   jet1pt                    98049 non-null  float64
 7   jet1eta                   98049 non-null  float64
 8   jet1phi                   98049 non-null  float64
 9   jet1b-tag                 98049 non-null  float64
 10  jet2pt                    98049 non-null  float64
 11  jet2eta                   98049 non-null  float64
 12  jet2phi                   98049 non-null  float64
 13  jet2b-tag                 98049 non-null  float64
 14  jet3pt

In [11]:
#df.to_csv('higgs_cleaned.csv')

## Baselines

Before fitting the models, let's first check the ratio of class labels to see if there is any imbalance exists.

In [12]:
df['class'].value_counts()

1    51826
0    46223
Name: class, dtype: int64

In [13]:
print('Ratio of Class 0 to Class 1:')
print(round(df['class'].value_counts()[0]/(df['class'].value_counts()[1])*100, 2))

Ratio of Class 0 to Class 1:
89.19


The dataset is mostly balanced so the plain old accuracy score metric would not be a bad decision to use for measuring the performance. Let's continue with splitting the dataset into train and test sets.

In [14]:
X, y = df.drop('class', axis=1), df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1864)

Lastly, I will create a dataframe storing the values of accuracy, roc-auc score, processing time and f1-score of the models that will be fitted.

In [15]:
baselines = pd.DataFrame(index = ['Training Time', 'Accuracy', 'Roc-Auc Score', 'F1 Score'])

### Decision Tree

In [16]:
clf = DecisionTreeClassifier(random_state=1864)

start = time.time()
clf.fit(X_train, y_train)
end = time.time()
dt_preds = clf.predict(X_test)

In [17]:
print('Vanilla Decision Tree Scores:')
print('Accuracy:', accuracy_score(y_test, dt_preds))
print('ROC-AUC:', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print('*'*60)
print('Detailed Report:')
print(classification_report(y_test, dt_preds))

Vanilla Decision Tree Scores:
Accuracy: 0.6228964813870475
ROC-AUC: 0.621665181318777
************************************************************
Detailed Report:
              precision    recall  f1-score   support

           0       0.61      0.60      0.60      9342
           1       0.64      0.65      0.64     10268

    accuracy                           0.62     19610
   macro avg       0.62      0.62      0.62     19610
weighted avg       0.62      0.62      0.62     19610



In [18]:
baselines['DT'] = [end-start, accuracy_score(y_test, dt_preds), 
                   roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]), f1_score(y_test, dt_preds, average='weighted')]

### Random Forest

In [19]:
clf = RandomForestClassifier(random_state=1864)
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
rf_preds = clf.predict(X_test)

In [20]:
print('Vanilla RF Scores:')
print('Accuracy:', accuracy_score(y_test, rf_preds))
print('ROC-AUC:', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print('*'*60)
print('Detailed Report:')
print(classification_report(y_test, rf_preds))

Vanilla RF Scores:
Accuracy: 0.7166241713411525
ROC-AUC: 0.7920389835850293
************************************************************
Detailed Report:
              precision    recall  f1-score   support

           0       0.71      0.69      0.70      9342
           1       0.72      0.74      0.73     10268

    accuracy                           0.72     19610
   macro avg       0.72      0.72      0.72     19610
weighted avg       0.72      0.72      0.72     19610



In [21]:
baselines['RF'] = [end-start, accuracy_score(y_test, rf_preds), 
                   roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]), f1_score(y_test, rf_preds, average='weighted')]

### Boosting Models

#### LightGBM

In [22]:
clf = lgb.LGBMClassifier(random_state = 1864)
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
lgb_preds = clf.predict(X_test)

In [23]:
print('Vanilla LightGBM Scores:')
print('Accuracy:', accuracy_score(y_test, lgb_preds))
print('ROC-AUC:', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print('*'*60)
print('Detailed Report:')
print(classification_report(y_test, lgb_preds))

Vanilla LightGBM Scores:
Accuracy: 0.7233554309026007
ROC-AUC: 0.8028859742376792
************************************************************
Detailed Report:
              precision    recall  f1-score   support

           0       0.72      0.69      0.71      9342
           1       0.73      0.75      0.74     10268

    accuracy                           0.72     19610
   macro avg       0.72      0.72      0.72     19610
weighted avg       0.72      0.72      0.72     19610



In [24]:
baselines['LGBM'] = [end-start, accuracy_score(y_test, lgb_preds), 
                   roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]), f1_score(y_test, lgb_preds, average='weighted')]

#### XGBoost

In [25]:
clf = XGBClassifier(random_state = 1864)
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
xgb_preds = clf.predict(X_test)





In [26]:
print('Vanilla XGB Scores:')
print('Accuracy:', accuracy_score(y_test, xgb_preds))
print('ROC-AUC:', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print('*'*60)
print('Detailed Report:')
print(classification_report(y_test, xgb_preds))

Vanilla XGB Scores:
Accuracy: 0.7187149413564508
ROC-AUC: 0.7968862029195385
************************************************************
Detailed Report:
              precision    recall  f1-score   support

           0       0.71      0.68      0.70      9342
           1       0.72      0.75      0.74     10268

    accuracy                           0.72     19610
   macro avg       0.72      0.72      0.72     19610
weighted avg       0.72      0.72      0.72     19610



In [27]:
baselines['XGB'] = [end-start, accuracy_score(y_test, xgb_preds), 
                   roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]), f1_score(y_test, xgb_preds, average='weighted')]

#### Catboost

In [28]:
clf = cb.CatBoostClassifier(random_state=1864)
start = time.time()
clf.fit(X_train, y_train, verbose= False)
end = time.time()
cb_preds = clf.predict(X_test)

In [29]:
print('Vanilla Catboost Scores:')
print('Accuracy:', accuracy_score(y_test, cb_preds))
print('ROC-AUC:', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print('*'*60)
print('Detailed Report:')
print(classification_report(y_test, cb_preds))

Vanilla Catboost Scores:
Accuracy: 0.7282508924018358
ROC-AUC: 0.8071100313357531
************************************************************
Detailed Report:
              precision    recall  f1-score   support

           0       0.72      0.70      0.71      9342
           1       0.73      0.76      0.74     10268

    accuracy                           0.73     19610
   macro avg       0.73      0.73      0.73     19610
weighted avg       0.73      0.73      0.73     19610



In [30]:
baselines['Catboost'] = [end-start, accuracy_score(y_test, cb_preds), 
                   roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]), f1_score(y_test, cb_preds, average='weighted')]

#### GBM

In [31]:
clf = GradientBoostingClassifier(random_state=1864)
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
gbm_preds = clf.predict(X_test)

In [32]:
print('Vanilla GBM Scores:')
print('Accuracy:', accuracy_score(y_test, gbm_preds))
print('ROC-AUC:', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print('*'*60)
print('Detailed Report:')
print(classification_report(y_test, gbm_preds))

Vanilla GBM Scores:
Accuracy: 0.7086690464048955
ROC-AUC: 0.7868082092283889
************************************************************
Detailed Report:
              precision    recall  f1-score   support

           0       0.70      0.67      0.69      9342
           1       0.71      0.74      0.73     10268

    accuracy                           0.71     19610
   macro avg       0.71      0.71      0.71     19610
weighted avg       0.71      0.71      0.71     19610



In [33]:
baselines['GBM'] = [end-start, accuracy_score(y_test, gbm_preds), 
                   roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]), f1_score(y_test, gbm_preds, average='weighted')]

#### ExtraTrees

In [34]:
clf = ExtraTreesClassifier(random_state=1864)
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
xtratrees_preds = clf.predict(X_test)

In [35]:
print('Vanilla ExtraTreesClf Scores:')
print('Accuracy:', accuracy_score(y_test, xtratrees_preds))
print('ROC-AUC:', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print('*'*60)
print('Detailed Report:')
print(classification_report(y_test, xtratrees_preds))

Vanilla ExtraTreesClf Scores:
Accuracy: 0.7024477307496175
ROC-AUC: 0.7742860322171207
************************************************************
Detailed Report:
              precision    recall  f1-score   support

           0       0.71      0.64      0.67      9342
           1       0.70      0.75      0.73     10268

    accuracy                           0.70     19610
   macro avg       0.70      0.70      0.70     19610
weighted avg       0.70      0.70      0.70     19610



In [36]:
baselines['ExtraTrees'] = [end-start, accuracy_score(y_test, xtratrees_preds), 
                   roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]), f1_score(y_test, xtratrees_preds, average='weighted')]

### Logistic Regression

In [37]:
clf = LogisticRegression(random_state=1864)
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
logreg_preds = clf.predict(X_test)

In [38]:
print('Vanilla Logistic Regression Scores:')
print('Accuracy:', accuracy_score(y_test, logreg_preds))
print('ROC-AUC:', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print('*'*60)
print('Detailed Report:')
print(classification_report(y_test, logreg_preds))

Vanilla Logistic Regression Scores:
Accuracy: 0.6357980622131566
ROC-AUC: 0.6797003337737668
************************************************************
Detailed Report:
              precision    recall  f1-score   support

           0       0.65      0.51      0.57      9342
           1       0.63      0.75      0.68     10268

    accuracy                           0.64     19610
   macro avg       0.64      0.63      0.63     19610
weighted avg       0.64      0.64      0.63     19610



In [39]:
baselines['LogisticRegression'] = [end-start, accuracy_score(y_test, logreg_preds), 
                   roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]), f1_score(y_test, logreg_preds, average='weighted')]

## Baseline Model Performance Summary

We finished fitting the models and we can now check the metrics. Let's first get an overview of the results and see if there is a big gap between the weighted F1 scores and the accuracies. If it's small, we don't need to worry about f1 scores from now on.

In [40]:
baselines = baselines.T
baselines

Unnamed: 0,Training Time,Accuracy,Roc-Auc Score,F1 Score
DT,6.164743,0.622896,0.621665,0.622724
RF,67.73717,0.716624,0.792039,0.716455
LGBM,2.279974,0.723355,0.802886,0.7231
XGB,25.759258,0.718715,0.796886,0.718343
Catboost,40.278692,0.728251,0.80711,0.727937
GBM,85.060044,0.708669,0.786808,0.708243
ExtraTrees,21.779579,0.702448,0.774286,0.701365
LogisticRegression,1.511083,0.635798,0.6797,0.630379


As expected, the difference between accuracies and f1 scores are nearly non-existent because of the fact that our dataset being balanced.

Let's look at the top 3 models in terms of accuracy, roc-auc score and F1 score now.

In [41]:
print('Accuracy Results')
display(baselines.sort_values(by='Accuracy', ascending=False).head(3))
print('ROC-AUC score Results')
display(baselines.sort_values(by='Roc-Auc Score', ascending=False).head(3))
print('F1 score Results')
display(baselines.sort_values(by='F1 Score', ascending=False).head(3))

Accuracy Results


Unnamed: 0,Training Time,Accuracy,Roc-Auc Score,F1 Score
Catboost,40.278692,0.728251,0.80711,0.727937
LGBM,2.279974,0.723355,0.802886,0.7231
XGB,25.759258,0.718715,0.796886,0.718343


ROC-AUC score Results


Unnamed: 0,Training Time,Accuracy,Roc-Auc Score,F1 Score
Catboost,40.278692,0.728251,0.80711,0.727937
LGBM,2.279974,0.723355,0.802886,0.7231
XGB,25.759258,0.718715,0.796886,0.718343


F1 score Results


Unnamed: 0,Training Time,Accuracy,Roc-Auc Score,F1 Score
Catboost,40.278692,0.728251,0.80711,0.727937
LGBM,2.279974,0.723355,0.802886,0.7231
XGB,25.759258,0.718715,0.796886,0.718343


Let's have a look at the training time spent now. For this I will enlarge the leaderboard and look at it as a whole, since the best candidate for tuning, the default catboost model, took too much time to train.

In [42]:
baselines.sort_values(by='Training Time')

Unnamed: 0,Training Time,Accuracy,Roc-Auc Score,F1 Score
LogisticRegression,1.511083,0.635798,0.6797,0.630379
LGBM,2.279974,0.723355,0.802886,0.7231
DT,6.164743,0.622896,0.621665,0.622724
ExtraTrees,21.779579,0.702448,0.774286,0.701365
XGB,25.759258,0.718715,0.796886,0.718343
Catboost,40.278692,0.728251,0.80711,0.727937
RF,67.73717,0.716624,0.792039,0.716455
GBM,85.060044,0.708669,0.786808,0.708243


So Catboost was the best in terms of getting the correct classification results but LGBM followed it quite closely, with XGBoost Classifier settling in the third position. In terms of training time, LGBM was much faster than the two, it only took 1.7 seconds for it to complete the training.

At this stage, we can continue improving our top 3 baseline models by tuning their hyperparameters and generating new features. But for the sake of simplicity, I will resort to using catboost for getting better results.  