### Random Forest Classifier Model

In [1]:
# Import Required Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Load the dataset.
# 'Bitstream' is read explicitly as text: a column made only of digits could
# otherwise be inferred as numeric, which would drop leading zeros.
data = pd.read_csv('TrainingData.csv', dtype={'Bitstream': str})

# Display dataset structure and first few rows
print("Dataset Info:")
data.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print("\nFirst 5 rows of the dataset:")
print(data.head())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   BID        2000 non-null   int64 
 1   Bitstream  2000 non-null   object
 2   class      2000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 47.0+ KB
None

First 5 rows of the dataset:
   BID                                          Bitstream  class
0    0  1000111010111101101100110111001111001000101111...      1
1    1  1101111100101011111111101101010001110110000010...      1
2    2  0011001010001010100100011101000111110100101111...      0
3    3  1101010110000110100001001100111101000000110001...      1
4    4  1010111100001001000101010010111010011101001100...      1


In [3]:
# Tally how many rows carry each class label
label_distribution = data['class'].value_counts()
print("\nLabel Distribution:", label_distribution, sep="\n")



Label Distribution:
class
1    1000
0    1000
Name: count, dtype: int64


In [4]:
# Feature extraction function
def extract_features(bitstream):
    """Compute summary statistics for a single bitstream.

    Parameters
    ----------
    bitstream : iterable
        Sequence whose elements each convert via ``int()`` to 0 or 1,
        for example the string ``"010011"``.

    Returns
    -------
    dict
        ``mean`` / ``std``: mean and standard deviation (NumPy default,
        ``ddof=0``) of the bits.
        ``num_ones`` / ``num_zeros``: how many bits are 1 and 0.
        ``max_run_ones`` / ``max_run_zeros``: length of the longest run of
        consecutive 1s and 0s (0 when that value never occurs).
        ``bit_entropy``: Shannon entropy, in bits, of the 0/1 distribution.

    Raises
    ------
    ValueError
        If the bitstream is empty, or holds a value other than 0 or 1.
    """
    bits = np.array([int(bit) for bit in bitstream], dtype=np.int64)
    n_bits = bits.size

    # Fail early with a clear message instead of yielding NaN features (empty
    # input) or silently wrong counts and entropy (digits other than 0/1).
    if n_bits == 0:
        raise ValueError("bitstream must contain at least one bit")
    if np.any((bits != 0) & (bits != 1)):
        raise ValueError("bitstream may only contain the values 0 and 1")

    n_ones = np.sum(bits)

    # Render the bits as text once; both run-length features reuse it.
    bit_string = ''.join(map(str, bits))

    # Skip zero-probability symbols so log2(0) is never evaluated.
    probabilities = np.bincount(bits, minlength=2) / n_bits
    observed = probabilities[probabilities > 0]
    # The sum is <= 0; abs() yields the entropy and also maps the -0.0 that a
    # constant stream would otherwise produce to a plain 0.0.
    entropy = abs(np.sum(observed * np.log2(observed)))

    features = {
        'mean': np.mean(bits),  # Mean value of bits
        'std': np.std(bits),  # Standard deviation of bits
        'num_ones': n_ones,  # Total count of 1s
        'num_zeros': n_bits - n_ones,  # Total count of 0s
        'max_run_ones': max(len(run) for run in bit_string.split('0')),  # Longest run of 1s
        'max_run_zeros': max(len(run) for run in bit_string.split('1')),  # Longest run of 0s
        'bit_entropy': entropy  # Entropy
    }
    return features

# Compute one feature dict per bitstream
features = data['Bitstream'].apply(extract_features)

# Expand the dicts into columns and attach the target labels as a 'class' column
features_df = pd.DataFrame(list(features)).assign(**{'class': data['class']})
features_df


Unnamed: 0,mean,std,num_ones,num_zeros,max_run_ones,max_run_zeros,bit_entropy,class
0,0.511719,0.499863,524,500,11,8,0.999604,1
1,0.504883,0.499976,517,507,10,10,0.999931,1
2,0.480469,0.499618,492,532,9,10,0.998899,0
3,0.500977,0.499999,513,511,10,10,0.999997,1
4,0.507812,0.499939,520,504,12,8,0.999824,1
...,...,...,...,...,...,...,...,...
1995,0.510742,0.499885,523,501,10,9,0.999667,0
1996,0.500000,0.500000,512,512,15,7,1.000000,1
1997,0.500977,0.499999,513,511,8,8,0.999997,0
1998,0.503906,0.499985,516,508,8,9,0.999956,1


In [5]:
# The 'class' column is the target; every remaining column is a model input
y = features_df['class']
X = features_df.drop(columns='class')

# Hold out 20% of the rows for testing, stratified on the class label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("\nTraining set size:", X_train.shape)
print("Testing set size:", X_test.shape)



Training set size: (1600, 7)
Testing set size: (400, 7)


In [6]:
# Build a 100-tree Random Forest with a fixed seed and fit it on the training split
# (fit() returns the fitted estimator, so construction and training can be chained)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict class labels for the held-out test rows
y_pred = rf_model.predict(X_test)


In [7]:
# Score the test-set predictions: overall accuracy plus per-class metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\nAccuracy:", accuracy)
print("\nClassification Report:", report, sep="\n")



Accuracy: 0.4975

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.51      0.50       200
           1       0.50      0.48      0.49       200

    accuracy                           0.50       400
   macro avg       0.50      0.50      0.50       400
weighted avg       0.50      0.50      0.50       400

