In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from google.colab import drive
from sklearn.ensemble import RandomForestClassifier

drive.mount('/content/drive')
data_directory = '/content/drive/My Drive/data'
all_files = [os.path.join(data_directory, f) for f in os.listdir(data_directory) if f.endswith('.txt')]

all_data = []
for file_path in all_files:
    data = []
    hist = []
    with open(file_path, 'r') as file:
        skip_header = True
        for line in file:
            if skip_header:
                skip_header = False
                continue
            parts = line.split()
            if len(parts) == 3:
                if hist:
                    data.append(hist)
                hist = [int(x) for x in parts]
            else:
                hist.extend(map(int, parts))
    if hist:
        data.append(hist)

    cols = ['year', 'day', 'hour'] + [f'bin_{i+1}' for i in range(60)]
    df = pd.DataFrame(data, columns=cols)
    all_data.append(df)

full_data = pd.concat(all_data, ignore_index=True)

he3_start = int((2.7 - 2) / 0.05)
he3_end = int((3.3 - 2) / 0.05)
he4_start = int((3.7 - 2) / 0.05)
he4_end = int((4.3 - 2) / 0.05)
full_data['He-3'] = full_data.iloc[:, 3+he3_start:3+he3_end+1].sum(axis=1)
full_data['He-4'] = full_data.iloc[:, 3+he4_start:3+he4_end+1].sum(axis=1)
full_data['He-3 Rich'] = ((full_data['He-3'] >= 0.5 * full_data['He-4']) & ((full_data['He-3'] + full_data['He-4']) >= 20)).astype(int)

Mounted at /content/drive


In [2]:
print("columns:")
print(full_data.columns)
print("\nrows:")
print(full_data.head(2))
print("\ncolumn stats")
print(full_data.groupby('He-3 Rich').describe().transpose())

columns:
Index(['year', 'day', 'hour', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'bin_5',
       'bin_6', 'bin_7', 'bin_8', 'bin_9', 'bin_10', 'bin_11', 'bin_12',
       'bin_13', 'bin_14', 'bin_15', 'bin_16', 'bin_17', 'bin_18', 'bin_19',
       'bin_20', 'bin_21', 'bin_22', 'bin_23', 'bin_24', 'bin_25', 'bin_26',
       'bin_27', 'bin_28', 'bin_29', 'bin_30', 'bin_31', 'bin_32', 'bin_33',
       'bin_34', 'bin_35', 'bin_36', 'bin_37', 'bin_38', 'bin_39', 'bin_40',
       'bin_41', 'bin_42', 'bin_43', 'bin_44', 'bin_45', 'bin_46', 'bin_47',
       'bin_48', 'bin_49', 'bin_50', 'bin_51', 'bin_52', 'bin_53', 'bin_54',
       'bin_55', 'bin_56', 'bin_57', 'bin_58', 'bin_59', 'bin_60', 'He-3',
       'He-4', 'He-3 Rich'],
      dtype='object')

rows:
   year  day  hour  bin_1  bin_2  bin_3  bin_4  bin_5  bin_6  bin_7  ...  \
0  2016  280     0      0      0      0      0      0      0      0  ...   
1  2016  280     1      0      0      0      0      0      0      0  ...   

   bin_54  bin_55  

In [3]:
X = full_data[[f'bin_{i+1}' for i in range(60)]]
y = full_data['He-3 Rich']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_probs = rf_model.predict_proba(X_test)[:, 1]

print("report:")
print(classification_report(y_test, rf_predictions))

imps = rf_model.feature_importances_
sorted_idx = np.argsort(imps)[::-1]
imp_df = pd.DataFrame({'feature': X_train.columns[sorted_idx],'importance': imps[sorted_idx]})

print("important features:")
print(imp_df)

print("\nconfusion matrix:")
print(confusion_matrix(y_test, rf_predictions))

report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     44210
           1       0.94      0.67      0.78       114

    accuracy                           1.00     44324
   macro avg       0.97      0.83      0.89     44324
weighted avg       1.00      1.00      1.00     44324

important features:
   feature  importance
0   bin_18    0.129985
1   bin_17    0.089955
2   bin_19    0.088400
3   bin_15    0.058184
4   bin_21    0.056039
5   bin_20    0.052912
6   bin_16    0.051665
7   bin_22    0.036209
8   bin_14    0.030090
9   bin_41    0.025001
10  bin_42    0.023047
11  bin_38    0.022302
12  bin_37    0.020924
13  bin_40    0.020749
14  bin_39    0.018204
15  bin_35    0.015554
16  bin_34    0.015498
17  bin_36    0.015433
18  bin_23    0.015216
19  bin_43    0.014749
20  bin_45    0.013677
21  bin_12    0.012235
22  bin_24    0.011623
23  bin_31    0.011365
24  bin_13    0.011329
25  bin_33    0.010244
26  bin_32    0.009798
2