In [None]:
# Imports
import energyflow
from scipy import stats
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the Pythia quark/gluon jet sample (cached under ~/.energyflow).
# X holds the per-jet particle arrays, y the per-jet labels.
X, y = energyflow.qg_jets.load(
    num_data=100000,
    pad=True,
    ncol=4,
    generator='pythia',
    with_bc=True,
    cache_dir='~/.energyflow',
)

Downloading QG_jets_withbc_0.npz from https://www.dropbox.com/s/hlu497verxb9f4x/QG_jets_withbc_0.npz?dl=1 to /root/.energyflow/datasets


In [None]:
# Report the shapes of the feature array and the label vector.
print(
    f'X dimensions {X.shape}',
    f'y dimensions {y.shape}',
    sep='\n',
)

X dimensions (100000, 134, 4)
y dimensions (100000,)


In [None]:
# Check distribution for Input values
# The four per-particle features live on the last axis of X, so feature i is
# selected with X[:, :, i]. (X[:][:][i] is just X[i] -- the i-th jet -- because
# X[:] returns the whole array each time.) axis=None pools every particle slot
# of every jet into a single summary per feature, and describe() is applied
# once, directly to the data rather than to another describe() result.
# NOTE: particle slots added by pad=True at load time are included in these
# statistics.
param_labels = (
    "First Param:   ",
    "Second Param:  ",
    "Third Param:   ",
    "Fourth Param:  ",
)
for feature_idx, label in enumerate(param_labels):
    print(f"{label}{stats.describe(X[:, :, feature_idx], axis=None)} \n")

First Param:  DescribeResult(nobs=134, minmax=(array([ 0.0000000e+00, -1.3538641e+00, -8.8155663e-02, -2.2120000e+03]), array([7.98959525e+01, 0.00000000e+00, 3.54168435e-01, 2.21200000e+03])), mean=array([ 4.07690646, -0.34512397,  0.04144194,  1.84328358]), variance=array([1.49872895e+02, 2.50955835e-01, 5.53456652e-03, 3.60437547e+05]), skewness=array([ 4.46841141, -0.80509079,  1.83599502, -0.01106463]), kurtosis=array([20.95536436, -1.23599918,  3.21814589, 10.01918759])) 

Second Param:  DescribeResult(nobs=134, minmax=(array([ 0.0000000e+00, -1.0666893e+00,  0.0000000e+00, -2.1120000e+03]), array([ 200.92795907,    0.        ,    5.5205472 , 2112.        ])), mean=array([ 4.03044755, -0.24516803,  1.62016569, 18.1119403 ]), variance=array([3.65579054e+02, 1.44313598e-01, 6.00242359e+00, 1.74122341e+05]), skewness=array([ 8.69535343, -0.99609829,  0.84400043,  0.84701844]), kurtosis=array([83.66202488, -0.80047526, -1.28500461, 21.71020124])) 

Third Param:  DescribeResult(nobs=1

In [None]:
# Summary statistics of the label vector, used to inspect the balance between
# quark and gluon jet samples.
label_summary = stats.describe(y)
label_summary

DescribeResult(nobs=100000, minmax=(0.0, 1.0), mean=0.5, variance=0.25000250002500024, skewness=0.0, kurtosis=-2.0)

In [None]:
# Hold out 33% of the jets for evaluation; the fixed random_state makes the
# split reproducible.
split_arrays = train_test_split(X, y, test_size=0.33, random_state=1)
X_train, X_test, y_train, y_test = split_arrays

In [None]:
# Report the shapes of the train and test partitions (features, then labels).
print(
    f'X_train dimensions {X_train.shape}',
    f'y_train dimensions {y_train.shape}',
    f'X_test dimensions {X_test.shape}',
    f'y_test dimensions {y_test.shape}',
    sep='\n',
)

X_train dimensions (67000, 134, 4)
y_train dimensions (67000,)
X_test dimensions (33000, 134, 4)
y_test dimensions (33000,)


In [None]:
# Flatten each jet's (particles, features) grid into a single feature vector,
# giving the 2-D (n_samples, n_features) layout that the random forest fitted
# later in the notebook is trained on.
# Passing -1 lets NumPy infer the flattened width. It also makes this cell
# safe to re-run: indexing shape[2] fails once the arrays are already 2-D.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

In [None]:
# Report the training shapes after flattening.
print(
    f'X_train dimensions {X_train.shape}',
    f'y_train dimensions {y_train.shape}',
    sep='\n',
)

X_train dimensions (67000, 536)
y_train dimensions (67000,)


In [None]:
# NOTE(review): make_regression is not used in the visible cells; kept in case
# a later cell relies on it.
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

# Fit a 20-tree random forest on the training split; the fixed random_state
# keeps the fitted model reproducible between runs.
regressor = RandomForestRegressor(random_state=1, n_estimators=20)
regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=20, n_jobs=None, oob_score=False,
                      random_state=1, verbose=0, warm_start=False)

In [None]:
# Score the held-out jets with the fitted forest; the raw outputs are turned
# into class labels in the following cell.
prediction = regressor.predict(X=X_test)

In [None]:
# Convert the regressor's continuous outputs into hard class labels: values
# above 0.5 become 1, everything else becomes 0. The result keeps prediction's
# dtype, and re-running the cell leaves already-thresholded labels unchanged.
prediction = (prediction > 0.5).astype(prediction.dtype)

# Count the predictions that match the true labels. The sample count is taken
# from y_test itself instead of a hard-coded 33000, so the accuracy stays
# correct if the test split size changes.
accurate_pred = int((prediction == y_test).sum())

print(f"Random Forest Regressor Accuracy: {(accurate_pred/len(y_test)) * 100}")

Random Forest Regressor Accuracy: 76.95757575757575
