In [6]:
  
def logical_feature_expansion(df, degree=2):
    """
    Perform logical feature expansion on a DataFrame.
    This function generates new features using logical AND, OR, and NOT operations.
    
    Parameters:
    df (pd.DataFrame): Input DataFrame with binary features (0 or 1).
    
    Returns:
    pd.DataFrame: DataFrame with original and newly generated logical features.
    """
    from itertools import combinations
    
    for col in df.columns:
        df[f'NOT_{col}'] = ~df[col]

    new_features = {}
    
    # Generate AND features
    for col in combinations(df.columns, degree):
        new_features[f'{"_".join(columns)}_AND'] = df[columns].all(axis=1)
    
    # Generate OR features
    for columns in combinations(df.columns, degree):
        new_features[f'{"_".join(columns)}_OR'] = df[columns].any(axis=1)
    
    
    # Combine original features with new logical features
    df_expanded = df.copy()
    for feature_name, feature_values in new_features.items():
        df_expanded[feature_name] = feature_values
    
    return df_expanded




In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU, Input, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback
from sklearn.mixture import GaussianMixture
from tensorflow.keras.regularizers import l2

# Load datasets and initialize models
for n in [15]:
    # Load dataset
    import os
    print("Current Directory:", os.getcwd())
    
    Xs = np.load(f"/home/moritz/maths-for-ml/Kryptonite-N/Datasets/kryptonite-{n}-X.npy")
    Ys = np.load(f"/home/moritz/maths-for-ml/Kryptonite-N/Datasets/kryptonite-{n}-y.npy")
    df_x = pd.DataFrame(Xs)
    df_y = pd.Series(Ys)
    
    df_x = (df_x >= 0.5).astype(int)
    
    from sklearn.preprocessing import PolynomialFeatures

    # Perform polynomial feature expansion
    poly = PolynomialFeatures(degree=4, include_bias=False)
    df_x_poly = poly.fit_transform(df_x)

    # Convert the expanded features back to a DataFrame
    df_x = pd.DataFrame(df_x_poly)

    X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=42)

    print(X_train.shape)
    print(X_test.shape)


    


Current Directory: \\wsl.localhost\Ubuntu\home\moritz\maths-for-ml\Kryptonite-N


KeyError: (0, 1)

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree model
dt_model = DecisionTreeClassifier(max_depth=2000, min_samples_split=2, min_samples_leaf=1)

# Train the model
dt_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy of the Decision Tree Classifier on test set for n = {n}: {accuracy_dt:.4f}")
accuracy_train = accuracy_score(y_train, dt_model.predict(X_train))
print(f"Accuracy of the Decision Tree Classifier on train set for n = {n}: {accuracy_train:.4f}")


Accuracy of the Decision Tree Classifier on test set for n = 15: 0.7432
Accuracy of the Decision Tree Classifier on train set for n = 15: 0.9840


In [19]:
X_train.head()

# Calculate correlation between '14_Mode_Prob' and column '14'
for i in range(X_train.shape[1]):
    if i in [0, 6, 7, 12, 13, 14, 17, 19, 20, 24]:
        continue
    correlation = X_train[f'{i}_Mode_Prob'].corr(X_train[i])
    print(f"Correlation between '{i}_Mode_Prob' and '{i}':", correlation)


Correlation between '1_Mode_Prob' and '1': -0.9999999993956484
Correlation between '2_Mode_Prob' and '2': 0.9999993818094738
Correlation between '3_Mode_Prob' and '3': -0.9999999999982891
Correlation between '4_Mode_Prob' and '4': 0.9999999999999762
Correlation between '5_Mode_Prob' and '5': -0.9999999999984581
Correlation between '8_Mode_Prob' and '8': 0.9999997687835763
Correlation between '9_Mode_Prob' and '9': -0.9999999999249185
Correlation between '10_Mode_Prob' and '10': 0.9999999999999996
Correlation between '11_Mode_Prob' and '11': 0.999999984056016
Correlation between '15_Mode_Prob' and '15': -0.999999999999801
Correlation between '16_Mode_Prob' and '16': -0.9999989131246727
Correlation between '18_Mode_Prob' and '18': -0.9999999263339079
Correlation between '21_Mode_Prob' and '21': -0.9999754921252788
Correlation between '22_Mode_Prob' and '22': 0.9999999999220971
Correlation between '23_Mode_Prob' and '23': -0.9999999999999667
Correlation between '25_Mode_Prob' and '25': 0.

KeyError: '30_Mode_Prob'

In [32]:
model = Sequential([
        Input(shape=(X_train.shape[1],), sparse=True),
        Dense(128),
        LeakyReLU(alpha=0.1),
        BatchNormalization(),
        
        Dense(64),
        LeakyReLU(alpha=0.1),
        BatchNormalization(),
        
        Dense(32),
        LeakyReLU(alpha=0.1),
        BatchNormalization(),
        
        Dense(1, activation='sigmoid')
    ])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=150,
    batch_size=32,
    validation_split=0.1, 
    verbose=1,
)

# Make predictions and evaluate
y_pred_nn = (model.predict(X_test) > 0.5).astype(int)
accuracy_nn = accuracy_score(y_test, y_pred_nn)
print(f"Accuracy of the Neural Network Classifier on test set for n = {n}: {accuracy_nn:.4f}")
print(classification_report(y_test, y_pred_nn))

Epoch 1/150




[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.4980 - loss: 0.7368 - val_accuracy: 0.5013 - val_loss: 0.6983
Epoch 2/150
[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5257 - loss: 0.6939 - val_accuracy: 0.5003 - val_loss: 0.6985
Epoch 3/150
[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5359 - loss: 0.6902 - val_accuracy: 0.5003 - val_loss: 0.6987
Epoch 4/150
[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5377 - loss: 0.6879 - val_accuracy: 0.5000 - val_loss: 0.6986
Epoch 5/150
[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5392 - loss: 0.6883 - val_accuracy: 0.5047 - val_loss: 0.7000
Epoch 6/150
[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5450 - loss: 0.6868 - val_accuracy: 0.4924 - val_loss: 0.7011
Epoch 7/150
[1m1080/1

KeyboardInterrupt: 

In [10]:
# Build the neural network model


model = Sequential([
Input(shape=(X_train.shape[1],)),

Dense(512),
LeakyReLU(),
BatchNormalization(),
Dropout(0.2),

Dense(256),
LeakyReLU(),
BatchNormalization(),
Dropout(0.2),

Dense(128),
LeakyReLU(),
BatchNormalization(),
Dropout(0.2),


Dense(64),
LeakyReLU(),
BatchNormalization(),
Dropout(0.1),


Dense(32),
LeakyReLU(),
BatchNormalization(),

Dense(1, activation='sigmoid')
])


# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Define a callback to stop training when accuracy reaches 95%
class EarlyStoppingByAccuracy(Callback):
    def on_epoch_end(self, epoch, logs=None):
        if logs.get('accuracy') >= 0.94:
            print("\nReached 94% accuracy, stopping training!")
            self.model.stop_training = True

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=500,
    batch_size=32,
    validation_split=0.1, 
    verbose=1,
    callbacks=[EarlyStoppingByAccuracy()]
)

# Make predictions and evaluate
y_pred_nn = (model.predict(X_test) > 0.5).astype(int)
accuracy_nn = accuracy_score(y_test, y_pred_nn)
print(f"Accuracy of the Neural Network Classifier on test set for n = {n}: {accuracy_nn:.4f}")
print(classification_report(y_test, y_pred_nn))

Epoch 1/500
[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.5033 - loss: 0.7324 - val_accuracy: 0.5065 - val_loss: 0.6993
Epoch 2/500
[1m 209/1080[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m3s[0m 4ms/step - accuracy: 0.5123 - loss: 0.6965

KeyboardInterrupt: 

In [4]:
# Save the model as moritz-n24
model.save('moritz-n30.keras')


In [8]:
# Load the hidden kryptonite 30 dataset from a .npy file
hidden_X = np.load(f"/home/moritz/maths-for-ml/Kryptonite-N/Datasets/additional-kryptonite-{n}-X.npy")
df_hidden_x = pd.DataFrame(Xs)
hidden_X_transformed = pd.DataFrame()
for column in df_x.columns:
        if column not in columns_to_exclude:
            gmm = GaussianMixture(n_components=2, random_state=42)
            gmm.fit(X_train[[column]])
            proba = gmm.predict_proba(X_train[[column]])
            X_train_transformed[f'{column}_Mode_Prob'] = np.where(proba[:, 0] > proba[:, 1], -proba[:, 0], proba[:, 1])
            proba = gmm.predict_proba(X_test[[column]])
            hidden_X_transformed[f'{column}_Mode_Prob'] = np.where(proba[:, 0] > proba[:, 1], -proba[:, 0], proba[:, 1])
# Make predictions on the hidden kryptonite 30 dataset
X_hidden = pd.concat([(df_hidden_x >= 0.5).astype(int).reset_index(drop=True), hidden_X_transformed.reset_index(drop=True)], axis=1)

y_pred_hidden_kryptonite_30 = (model.predict(X_hidden) > 0.5).astype(int)


np.save('f"/home/moritz/maths-for-ml/Kryptonite-N/Datasets/predicted_y_hidden_kryptonite_30.npy', y_pred_hidden_kryptonite_30)






[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 707us/step


FileNotFoundError: [Errno 2] No such file or directory: 'f"/home/moritz/maths-for-ml/Kryptonite-N/Datasets/predicted_y_hidden_kryptonite_30.npy'

In [11]:
np.save('/home/moritz/maths-for-ml/Kryptonite-N/Datasets/predicted_y_hidden_kryptonite_30.npy', y_pred_hidden_kryptonite_30)