### XGBoost Model

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb


In [2]:
# Load the dataset
# Pin the Bitstream column to str so pandas never infers a numeric dtype for it:
# numeric inference would drop leading zeros and break the per-character
# parsing performed during feature extraction.
data = pd.read_csv('TrainingData.csv', dtype={'Bitstream': str})
data


Unnamed: 0,BID,Bitstream,class
0,0,1000111010111101101100110111001111001000101111...,1
1,1,1101111100101011111111101101010001110110000010...,1
2,2,0011001010001010100100011101000111110100101111...,0
3,3,1101010110000110100001001100111101000000110001...,1
4,4,1010111100001001000101010010111010011101001100...,1
...,...,...,...
1995,1995,1110110011110100001111101111010110011000001110...,0
1996,1996,0100010100011110101110000110100101100000011001...,1
1997,1997,1100001010100011010001110001010010101010101100...,0
1998,1998,0011110000001110101101111110110100010010100011...,1


In [3]:
# Dataset info

print("Dataset Info:")
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   BID        2000 non-null   int64 
 1   Bitstream  2000 non-null   object
 2   class      2000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 47.0+ KB
None


In [4]:
# Advanced Feature Engineering

def extract_advanced_features(bitstream):
    """Summarise a sequence of binary digits as a flat dict of numeric features.

    Parameters
    ----------
    bitstream : str or iterable of digit characters
        Each element is converted with ``int``; the features are designed
        for sequences made of '0' and '1'.

    Returns
    -------
    dict
        'mean', 'std'           : mean / population std of the bit values
        'num_ones', 'num_zeros' : sum of the bits, and length minus that sum
        'trans_00' .. 'trans_11': fraction of adjacent pairs equal to that
                                  pattern (0.0 when there are no pairs)
        'fft_mean', 'fft_std'   : mean / std of the FFT magnitude spectrum
        'max_autocorr'          : maximum of the mean-centred autocorrelation
                                  over non-negative lags

    Raises
    ------
    ValueError
        If ``bitstream`` is empty, or an element cannot be parsed by ``int``.
    """
    bits = np.array([int(bit) for bit in bitstream])
    n = len(bits)
    if n == 0:
        # Fail early with a clear message instead of an opaque FFT error.
        raise ValueError("bitstream must contain at least one bit")

    # Basic features
    mean = np.mean(bits)
    std = np.std(bits)
    num_ones = np.sum(bits)
    num_zeros = n - num_ones

    # Transition probabilities: compare each bit with its successor in one
    # vectorised pass per pattern instead of building and scanning a list of
    # two-character strings.
    prev_bits, next_bits = bits[:-1], bits[1:]
    num_pairs = n - 1
    transition_probs = {}
    for first, second in ((0, 0), (0, 1), (1, 0), (1, 1)):
        count = np.count_nonzero((prev_bits == first) & (next_bits == second))
        # A single-bit stream has no adjacent pairs; report 0.0 rather than
        # dividing by zero.
        transition_probs[f"{first}{second}"] = count / num_pairs if num_pairs else 0.0

    # Fourier Transform (magnitude spectrum, DC component included)
    freq_components = np.abs(np.fft.fft(bits))
    fft_mean = np.mean(freq_components)
    fft_std = np.std(freq_components)

    # Autocorrelation of the mean-centred signal; slicing from n - 1 keeps
    # lags 0 .. n-1 of the 'full' output.
    # NOTE: an autocorrelation sequence peaks at lag 0, whose value is the sum
    # of squared deviations (n * std**2), so this feature carries the same
    # information as 'std'. Kept unchanged so existing feature values match.
    centred = bits - mean
    autocorr = np.correlate(centred, centred, mode='full')[n - 1:]
    max_autocorr = np.max(autocorr)

    return {
        'mean': mean,
        'std': std,
        'num_ones': num_ones,
        'num_zeros': num_zeros,
        'trans_00': transition_probs['00'],
        'trans_01': transition_probs['01'],
        'trans_10': transition_probs['10'],
        'trans_11': transition_probs['11'],
        'fft_mean': fft_mean,
        'fft_std': fft_std,
        'max_autocorr': max_autocorr
    }

# Apply feature extraction
# Each bitstream yields one dict of features; the dicts are expanded into columns.
features = data['Bitstream'].apply(extract_advanced_features)
# Carry the source index over so the label assignment below stays aligned
# row-for-row even if `data` does not have a default RangeIndex
# (e.g. after rows were filtered out upstream).
features_df = pd.DataFrame(features.tolist(), index=features.index)

# Add labels
features_df['class'] = data['class']
features_df


Unnamed: 0,mean,std,num_ones,num_zeros,trans_00,trans_01,trans_10,trans_11,fft_mean,fft_std,max_autocorr,class
0,0.511719,0.499863,524,500,0.233627,0.254154,0.255132,0.257087,14.595853,17.634089,255.859375,1
1,0.504883,0.499976,517,507,0.239492,0.255132,0.256109,0.249267,14.769506,17.287617,255.975586,1
2,0.480469,0.499618,492,532,0.276637,0.242424,0.242424,0.238514,14.572046,16.722903,255.609375,0
3,0.500977,0.499999,513,511,0.253177,0.246334,0.246334,0.254154,14.586181,17.327531,255.999023,1
4,0.507812,0.499939,520,504,0.240469,0.252199,0.252199,0.255132,14.695926,17.436449,255.937500,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.510742,0.499885,523,501,0.224829,0.264907,0.264907,0.245357,14.805685,17.429621,255.881836,0
1996,0.500000,0.500000,512,512,0.244379,0.255132,0.255132,0.245357,14.642418,17.251075,256.000000,1
1997,0.500977,0.499999,513,511,0.239492,0.260020,0.260020,0.240469,14.590297,17.324065,255.999023,0
1998,0.503906,0.499985,516,508,0.245357,0.251222,0.250244,0.253177,14.677067,17.337350,255.984375,1


In [5]:
# Train-Test Split

# Target column on one side, every remaining column as model input on the other
y = features_df['class']
X = features_df.drop(columns='class')

# Hold out 20% of the rows for testing; stratify on the label and fix the seed
# so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("\nTraining set size:", X_train.shape)
print("Testing set size:", X_test.shape)



Training set size: (1600, 11)
Testing set size: (400, 11)


In [6]:
# Fit a gradient-boosted tree classifier on the training partition
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Predict class labels for the held-out rows
y_pred = xgb_model.predict(X_test)


In [7]:
# Overall accuracy of the predictions on the held-out rows
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("\nAccuracy:", accuracy)

# Per-class precision, recall, F1 and support
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))



Accuracy: 0.5425

Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.56      0.55       200
           1       0.54      0.52      0.53       200

    accuracy                           0.54       400
   macro avg       0.54      0.54      0.54       400
weighted avg       0.54      0.54      0.54       400

