In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

data_directory = 'data'
all_files = [os.path.join(data_directory, f) for f in os.listdir(data_directory) if f.endswith('.txt')]

all_data = []
for file_path in all_files:
    data = []
    hist = []
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            if len(parts) == 3:
                if hist:
                    data.append(hist)
                hist = [int(x) for x in parts]
            else:
                hist.extend(map(int, parts))
    if hist:
        data.append(hist)

    cols = ['year', 'day', 'hour'] + [f'bin_{i+1}' for i in range(60)]
    df = pd.DataFrame(data, columns=cols)
    all_data.append(df)

full_data = pd.concat(all_data, ignore_index=True)

he3_start = int((2.7 - 2) / 0.05)
he3_end = int((3.3 - 2) / 0.05)
he4_start = int((3.7 - 2) / 0.05)
he4_end = int((4.3 - 2) / 0.05)

full_data['He-3'] = full_data.iloc[:, 3+he3_start:3+he3_end+1].sum(axis=1)
full_data['He-4'] = full_data.iloc[:, 3+he4_start:3+he4_end+1].sum(axis=1)
full_data['He-3 Rich'] = ((full_data['He-3'] >= 0.5 * full_data['He-4']) & ((full_data['He-3'] + full_data['He-4']) >= 20)).astype(int)

X = full_data[['He-3', 'He-4']]
y = full_data['He-3 Rich']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = Sequential([
    Dense(10, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_scaled, y_train, epochs=50, validation_split=0.2)

test_loss, test_acc = model.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {test_acc:.4f}")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9743 - loss: 0.5712 - val_accuracy: 0.9770 - val_loss: 0.2570
Epoch 2/50
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9820 - loss: 0.1819 - val_accuracy: 0.9963 - val_loss: 0.0626
Epoch 3/50
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9925 - loss: 0.0540 - val_accuracy: 0.9972 - val_loss: 0.0294
Epoch 4/50
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9940 - loss: 0.0287 - val_accuracy: 0.9972 - val_loss: 0.0189
Epoch 5/50
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9914 - loss: 0.0225 - val_accuracy: 0.9972 - val_loss: 0.0142
Epoch 6/50
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9935 - loss: 0.0172 - val_accuracy: 0.9972 - val_loss: 0.0116
Epoch 7/50
[1m137/137[0m [32m━━━━━━━

In [3]:
print(f"Test Accuracy: {test_acc:.4f}")
y_pred = model.predict(X_test_scaled) > 0.5
print(classification_report(y_test, y_pred, target_names=['Non-Rich', 'He-3 Rich']))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
roc_auc = roc_auc_score(y_test, model.predict(X_test_scaled))
print(f"ROC-AUC Score: {roc_auc:.4f}")

[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9981 - loss: 0.0038
Test Accuracy: 0.9970
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
              precision    recall  f1-score   support

    Non-Rich       1.00      1.00      1.00      2288
   He-3 Rich       0.88      0.98      0.93        45

    accuracy                           1.00      2333
   macro avg       0.94      0.99      0.96      2333
weighted avg       1.00      1.00      1.00      2333

Confusion Matrix:
[[2282    6]
 [   1   44]]
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
ROC-AUC Score: 0.9997
