### **Reading the Data**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings

# Silence deprecation-style warning noise so cell output stays readable.
# Append further categories to the tuple to suppress other warning types.
for _noisy_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_noisy_category)

# Load the training set from the Kaggle input directory and display it
# as the cell output.
td = pd.read_csv("/kaggle/input/sandeep-isi-ml4crypto/TrainingData.csv")
td

In [None]:
# Count how many rows fall into each value of the "class" label column
# (a quick check of class balance).
td["class"].value_counts()

In [None]:
# (rows, columns) of the training frame.
td.shape

In [None]:
# Inspect the raw value stored at row 0, column 1 of the training frame.
td.iloc[0,1]

In [None]:
# Load the sample spreadsheet from the same Kaggle input directory.
# NOTE(review): `sample` is not referenced again in the cells visible here.
sample = pd.read_excel("/kaggle/input/sandeep-isi-ml4crypto/Sample.xlsx")

In [None]:
# Features: every character of each 'Bitstream' string becomes one integer
# column, giving one row of bits per sample.
X = [[int(bit) for bit in stream] for stream in td['Bitstream']]
X = pd.DataFrame(X).values

# Labels come straight from the 'class' column.
y = td['class'].values

# Keep 5% of the rows aside for testing; the fixed seed makes the partition
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42
)


### **Training the models**

### Classical ML Models

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

#### **Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Hold out 5% of the rows for testing (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42
)

# Logistic regression with scikit-learn's default settings. fit() returns the
# estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and score them
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report the metrics
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

#### **SVM**

In [None]:
from sklearn.svm import SVC

# Re-create the 95/5 train/test partition (same seed and ratio as the other
# classical-model cells).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42
)

# Support-vector classifier with scikit-learn's default settings
model = SVC()

In [None]:
def progress_callback(epoch, logs):
    """Print the test-set accuracy of the global `model` every 10th epoch.

    Parameters
    ----------
    epoch : int
        Epoch counter; only multiples of 10 produce output.
    logs : any
        Accepted but not used.

    Reads the notebook-level `model`, `X_test` and `y_test`. Nothing in this
    cell passes or calls this function -- the `fit` call below is made without
    it -- so as written it never runs.
    """
    if epoch % 10 != 0:
        return
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"Epoch {epoch}, Accuracy: {test_accuracy}")


# Train the SVM on the first 20000 training rows only.
model.fit(X_train[:20000], y_train[:20000])

In [None]:
# Score the fitted model on the held-out test rows
y_pred = model.predict(X_test)

# Accuracy, confusion matrix and per-class precision/recall/F1
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", classification_rep, sep="\n")

#### **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings, trained on the current X_train/y_train
# (this cell does not re-split; it reuses the partition left by earlier cells).
model = DecisionTreeClassifier().fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Compute the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)


#### **XGBoost**

In [None]:
import xgboost as xgb

# Wrap the train/test arrays in XGBoost's DMatrix container
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster hyperparameters only. Training-loop controls (number of rounds,
# early stopping, logging frequency) are NOT booster parameters: placed in
# this dict they are not applied by xgb.train, so they are passed as keyword
# arguments to xgb.train below instead.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',  # You can use other metrics like 'error' or 'auc'
    'max_depth': 12,
    'eta': 0.1,
}

# Training-loop controls
num_boost_round = 100        # upper bound on boosting rounds
early_stopping_rounds = 10   # stop after 10 rounds without improvement
verbose_eval = 10            # print evaluation metrics every 10 rounds

# Train the XGBoost model and monitor progress on both sets.
# NOTE(review): early stopping watches the LAST entry of `evals`, i.e. the
# test set here, so the reported test score is slightly optimistic. Use a
# separate validation split if an unbiased test estimate is needed.
watchlist = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=verbose_eval,
)

In [None]:
# Predicted probability of class 1 for every test row
y_pred = model.predict(dtest)

# Convert predicted probabilities to binary labels (threshold 0.5)
y_pred_binary = [1 if p >= 0.5 else 0 for p in y_pred]

# Evaluate the model. The confusion matrix and report must be recomputed from
# THIS model's predictions; otherwise the values printed below are whatever an
# earlier cell left in `conf_matrix` / `classification_rep`.
accuracy = accuracy_score(y_test, y_pred_binary)
conf_matrix = confusion_matrix(y_test, y_pred_binary)
classification_rep = classification_report(y_test, y_pred_binary)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

#### **XGB Ensemble**

In [None]:
from xgboost import XGBClassifier

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Carve a validation set out of the training portion; it is used only as the
# early-stopping monitor, so the test set stays untouched during training.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.05, random_state=42)

# Parameters for the scikit-learn style XGBoost classifier.
# The former 'verbose' entry was dropped: it is not an XGBClassifier
# parameter, and per-round logging is already requested through
# fit(verbose=True) below.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',  # You can use other metrics like 'error' or 'logloss'
    'max_depth': 20,
    'eta': 0.1,
    'n_estimators': 10,  # Number of trees in the ensemble
    'early_stopping_rounds': 10,
}

# Create an XGBoost classifier
model = XGBClassifier(**params)

# Train the XGBoost model, monitoring the validation set each round
evals = [(X_val, y_val)]  # Specify the validation dataset
model.fit(X_train, y_train, eval_set=evals, verbose=True)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model. The confusion matrix and report are recomputed here so
# the printed values describe this model rather than an earlier cell's.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

## **Deep Learning Models**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Features: parse every 'Bitstream' string into a list of integer bits.
# X is left as a 1-D array whose elements are those per-row lists.
X = td['Bitstream'].apply(lambda stream: list(map(int, stream))).values

# Labels from the 'class' column
y = td['class'].values

# 95% training / 5% validation, reproducible via the fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.05, random_state=42
)

### **Multi-layer perceptron (ffn)**

In [None]:
# Parse each bitstream string into a list of integer bits, one list per row
X = td['Bitstream'].apply(lambda stream: list(map(int, stream))).values
y = td['class'].values

# Stack the per-row lists into a single 2-D NumPy array
X = np.array(X.tolist())

# 95% training / 5% validation, reproducible via the fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.05, random_state=42
)

# Feed-forward network: a 64-feature input, ten ReLU hidden layers whose widths
# grow from 64 to 256 and shrink back to 64, then a 2-unit softmax output
# (one probability per class).
model = tf.keras.Sequential(
    [tf.keras.layers.Input(shape=(64,))]
    + [
        tf.keras.layers.Dense(units, activation='relu')
        for units in (64, 128, 128, 256, 256, 256, 128, 128, 64, 64)
    ]
    + [tf.keras.layers.Dense(2, activation='softmax')]
)




In [None]:
# Training hyperparameters
batch_size = 3200
epochs = 5

# Adam optimiser with sparse categorical cross-entropy (integer class labels
# against the softmax output); accuracy is tracked during training.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split, evaluating on the validation split every epoch
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
)



In [None]:
# Predicted label = index of the highest-scoring output unit for each row
y_val_pred = np.argmax(model.predict(X_val), axis=1)

# Validation metrics
accuracy = accuracy_score(y_val, y_val_pred)
confusion = confusion_matrix(y_val, y_val_pred)
classification_rep = classification_report(y_val, y_val_pred)

# Report
print("Validation Accuracy: {:.2f}%".format(accuracy * 100))
print("Confusion Matrix:")
print(confusion)
print("Classification Report:")
print(classification_rep)


### **MLP - Recurrent Neurons (LSTM)**

In [None]:

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=42)

# Reshape data for LSTM layers: (samples, 64 time steps, 1 feature per step)
X_train = X_train.reshape(X_train.shape[0], 64, 1)
X_val = X_val.reshape(X_val.shape[0], 64, 1)

# Define a neural network with LSTM layers.
# The classification head was corrected: previously a 10-unit *softmax* layer
# fed a 2-unit *sigmoid* output. A softmax in a hidden layer squeezes the
# features onto a probability simplex, and two independent sigmoids are not a
# class distribution, which is what sparse_categorical_crossentropy expects.
# The head is now ReLU features followed by a 2-way softmax.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(64, 1)),  # Input layer with 64 time steps and 1 feature
    tf.keras.layers.LSTM(64, return_sequences=True),  # LSTM layer with 64 units and return sequences
    tf.keras.layers.LSTM(128, return_sequences=True),  # LSTM layer with 128 units and return sequences
    tf.keras.layers.Dense(256, activation='relu'),  # Dense layer applied at every time step
    tf.keras.layers.LSTM(256, return_sequences=True),  # LSTM layer with 256 units and return sequences
    tf.keras.layers.Dense(128, activation='relu'),  # Dense layer applied at every time step
    tf.keras.layers.LSTM(64),  # LSTM layer with 64 units; returns only the final state
    tf.keras.layers.Dense(10, activation='relu'),  # Small hidden layer before the output
    tf.keras.layers.Dense(2, activation='softmax'),  # Output layer: one probability per class
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
batch_size = 12800
epochs = 5
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))


In [None]:
# Class scores for the validation rows, reduced to the winning class index
y_val_pred = model.predict(X_val).argmax(axis=1)

# Validation metrics
accuracy = accuracy_score(y_val, y_val_pred)
confusion = confusion_matrix(y_val, y_val_pred)
classification_rep = classification_report(y_val, y_val_pred)

# Report
print("Validation Accuracy: {:.2f}%".format(accuracy * 100))
print("Confusion Matrix:", confusion, sep="\n")
print("Classification Report:", classification_rep, sep="\n")


### **CNN + LSTM**

In [None]:
# Define a neural network with LSTM and CNN layers.
# This cell does not reshape the data itself: it expects X_train / X_val to
# already have shape (samples, 64, 1), matching the Input layer below.
#
# The output layer was corrected from 10 softmax units to 2. The task is
# two-class (the other models in this notebook use 'binary:logistic', a 2-unit
# softmax, and target names 'Class 0' / 'Class 1'), so the 8 extra outputs
# could never be a correct label.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(64, 1)),  # Input layer with 64 time steps and 1 feature
    tf.keras.layers.Conv1D(64, 3, activation='relu'),  # 1D convolutional layer with 64 filters and a kernel size of 3
    tf.keras.layers.MaxPooling1D(2),  # Max pooling layer
    tf.keras.layers.LSTM(64, return_sequences=True),  # LSTM layer with 64 units and return sequences
    tf.keras.layers.LSTM(128, return_sequences=True),  # LSTM layer with 128 units and return sequences
    tf.keras.layers.Dense(256, activation='relu'),  # Dense layer applied at every time step
    tf.keras.layers.LSTM(256, return_sequences=True),  # LSTM layer with 256 units and return sequences
    tf.keras.layers.Dense(128, activation='relu'),  # Dense layer applied at every time step
    tf.keras.layers.LSTM(64),  # LSTM layer with 64 units; returns only the final state
    tf.keras.layers.Dense(2, activation='softmax'),  # Output layer: one probability per class
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
batch_size = 3200
epochs = 5
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))


In [None]:
# Run the network on the validation rows and keep, for each row, the index of
# the output unit with the highest score.
val_scores = model.predict(X_val)
y_val_pred = np.argmax(val_scores, axis=1)
del val_scores

# Validation metrics
accuracy = accuracy_score(y_val, y_val_pred)
confusion = confusion_matrix(y_val, y_val_pred)
classification_rep = classification_report(y_val, y_val_pred)

# Report
print("Validation Accuracy: {:.2f}%".format(accuracy * 100))
print("Confusion Matrix:")
print(confusion)
print("Classification Report:")
print(classification_rep)


#### **CNN+LSTM - more layers**

In [None]:
# Define a neural network with LSTM and CNN layers.
# This cell expects X_train / X_val to already have shape (samples, 64, 1).
#
# The classification head was corrected: previously a 10-unit *softmax* layer
# fed a 2-unit *sigmoid* output. A softmax in a hidden layer squeezes the
# features onto a probability simplex, and two independent sigmoids are not a
# class distribution, which is what sparse_categorical_crossentropy expects.
# The head is now ReLU features followed by a 2-way softmax.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(64, 1)),  # Input layer with 64 time steps and 1 feature
    tf.keras.layers.Conv1D(64, 3, activation='relu'),  # 1D convolutional layer with 64 filters and a kernel size of 3
    tf.keras.layers.MaxPooling1D(2),  # Max pooling layer
    tf.keras.layers.LSTM(64, return_sequences=True),  # LSTM layer with 64 units and return sequences
    tf.keras.layers.LSTM(128, return_sequences=True),  # LSTM layer with 128 units and return sequences
    tf.keras.layers.Dense(256, activation='relu'),  # Dense layer applied at every time step
    tf.keras.layers.LSTM(256, return_sequences=True),  # LSTM layer with 256 units and return sequences
    tf.keras.layers.Dense(128, activation='relu'),  # Dense layer applied at every time step
    tf.keras.layers.LSTM(64),  # LSTM layer with 64 units; returns only the final state
    tf.keras.layers.Dense(10, activation='relu'),  # Small hidden layer before the output
    tf.keras.layers.Dense(2, activation='softmax'),  # Output layer: one probability per class
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
batch_size = 8000
epochs = 5
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))

# Evaluate the model on the validation data: the predicted label is the index
# of the larger of the two class probabilities.
y_val_pred = model.predict(X_val)
y_val_pred = np.argmax(y_val_pred, axis=1)

accuracy = accuracy_score(y_val, y_val_pred)
confusion = confusion_matrix(y_val, y_val_pred)
classification_rep = classification_report(y_val, y_val_pred)

print("Validation Accuracy: {:.2f}%".format(accuracy * 100))
print("Confusion Matrix:")
print(confusion)
print("Classification Report:")
print(classification_rep)


In [None]:
# Conv1D front end, three stacked LSTMs that all return full sequences, then a
# flattened fully connected head ending in a 2-way softmax.
# Expects X_train / X_val shaped (samples, 64, 1), matching the Input layer.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(64, 1)))                # 64 time steps, 1 feature each
model.add(tf.keras.layers.Conv1D(64, 3, activation='relu'))    # 64 filters, kernel size 3
model.add(tf.keras.layers.MaxPooling1D(2))                     # pool size 2 along the time axis
model.add(tf.keras.layers.LSTM(128, return_sequences=True))
model.add(tf.keras.layers.LSTM(128, return_sequences=True))
model.add(tf.keras.layers.LSTM(64, return_sequences=True))
model.add(tf.keras.layers.Flatten())                           # collapse (steps, units) for the dense head
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(2, activation='softmax'))      # one probability per class

# Training hyperparameters
batch_size = 6400
epochs = 6

# Adam + sparse categorical cross-entropy, tracking accuracy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit, evaluating on the validation split after every epoch
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
)

# Predicted label = index of the larger class probability
y_val_pred = model.predict(X_val).argmax(axis=1)

# Validation metrics, with readable class names in the report
accuracy = accuracy_score(y_val, y_val_pred)
confusion = confusion_matrix(y_val, y_val_pred)
classification_rep = classification_report(
    y_val, y_val_pred, target_names=['Class 0', 'Class 1']
)

print("Validation Accuracy: {:.2f}%".format(accuracy * 100))
print("Confusion Matrix:")
print(confusion)
print("Classification Report:")
print(classification_rep)


### **BERT**

**further**

In [None]:
from sklearn.metrics import roc_auc_score
import tensorflow as tf

# Split the training data into training and validation sets.
# NOTE(review): this splits the *current* X_train / y_train and overwrites
# X_train, y_train, X_val and y_val, so the cell is not idempotent: every
# re-run carves another 5% off the training set. It also inherits whatever
# shape earlier cells left X_train in; the model below expects
# (samples, 64, 1) -- confirm the cells were run in order.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.05, random_state=42)

# Define a custom loss function for weighted average precision
def weighted_average_precision(y_true, y_pred):
    """Differentiable loss: 1 - weighted average of soft per-class precision.

    Parameters
    ----------
    y_true : tensor
        Ground-truth binary labels (0 or 1). Any shape; flattened internally.
    y_pred : tensor
        Predicted probability of class 1 (e.g. a sigmoid output). Any shape
        with the same number of elements as `y_true`; flattened internally.

    Returns
    -------
    Scalar tensor in [0, 1]; lower is better.

    Why this was rewritten
    ----------------------
    The earlier version first masked the batch by `y_true` and then multiplied
    by the masked `y_true`. After masking, the positive labels are all ones and
    the negative labels all zeros, so "precision_pos" was always ~1 and
    "precision_neg" always 0: the loss was effectively a constant and gave the
    optimiser no useful gradient. The class weights were also never divided by
    their sum, and a (k,) label vector multiplied with a (k, 1) prediction
    could broadcast to (k, k).

    Here precision is computed from soft counts over the whole batch:
      precision_pos = sum(y * p)             / sum(p)
      precision_neg = sum((1 - y) * (1 - p)) / sum(1 - p)
    """
    # Relative weights for the positive and negative class (normalised below)
    weight_pos = 6265.0  # Adjust this value as needed
    weight_neg = 6235.0  # Adjust this value as needed
    eps = 1e-7           # avoids division by zero

    # Flatten both tensors and bring labels to the prediction dtype so the
    # element-wise products below line up one-to-one.
    y_pred = tf.reshape(tf.convert_to_tensor(y_pred), [-1])
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])

    # Soft "true positive" / "predicted positive" mass
    true_pos = tf.reduce_sum(y_true * y_pred)
    pred_pos = tf.reduce_sum(y_pred)

    # Soft "true negative" / "predicted negative" mass
    true_neg = tf.reduce_sum((1.0 - y_true) * (1.0 - y_pred))
    pred_neg = tf.reduce_sum(1.0 - y_pred)

    # Per-class soft precision
    precision_pos = true_pos / (pred_pos + eps)
    precision_neg = true_neg / (pred_neg + eps)

    # Weighted average, normalised so the result stays within [0, 1]
    weighted_avg_precision = (
        weight_pos * precision_pos + weight_neg * precision_neg
    ) / (weight_pos + weight_neg)

    return 1.0 - weighted_avg_precision

# Two Conv1D + max-pooling stages, then the LSTM/Dense stack, ending in a single
# sigmoid unit that outputs one score per sample.
# Expects X_train / X_val shaped (samples, 64, 1), matching the Input layer.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(64, 1)),               # 64 time steps, 1 feature each
    tf.keras.layers.Conv1D(64, 3, activation='relu'),   # 64 filters, kernel size 3
    tf.keras.layers.MaxPooling1D(2),                    # pool size 2 along the time axis
    tf.keras.layers.Conv1D(64, 3, activation='relu'),   # 64 filters, kernel size 3
    tf.keras.layers.MaxPooling1D(2),                    # pool size 2 along the time axis
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.Dense(256, activation='relu'),      # applied at every time step
    tf.keras.layers.LSTM(256, return_sequences=True),
    tf.keras.layers.Dense(128, activation='relu'),      # applied at every time step
    tf.keras.layers.LSTM(64),                           # returns only the final state
    tf.keras.layers.Dense(1, activation='sigmoid'),     # single sigmoid output
])

# Training hyperparameters
batch_size = 3200
epochs = 5

# Optimise the custom weighted-average-precision loss, tracking accuracy
model.compile(
    optimizer='adam',
    loss=weighted_average_precision,
    metrics=['accuracy'],
)

# Fit, evaluating on the validation split after every epoch
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_val_pred = (model.predict(X_val) > 0.5).astype(int)

# Validation metrics
accuracy = accuracy_score(y_val, y_val_pred)
confusion = confusion_matrix(y_val, y_val_pred)
classification_rep = classification_report(y_val, y_val_pred)

print("Validation Accuracy: {:.2f}%".format(accuracy * 100))
print("Confusion Matrix:")
print(confusion)
print("Classification Report:")
print(classification_rep)