In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Read the training CSV (adjust the path to where your dataset lives)
dataset = pd.read_csv("data/dataset_train_2024.csv")

# Slice the feature columns by position; column 0 is skipped here
sequences_1 = dataset.iloc[:, 1:129].to_numpy()    # positions 1..128
sequences_2 = dataset.iloc[:, 129:257].to_numpy()  # positions 129..256
extra_feature = dataset.iloc[:, 257].to_numpy()[:, np.newaxis]  # position 257 as an (n_rows, 1) column

# Join the three pieces side by side: 128 + 128 + 1 = 257 feature columns
all_features = np.concatenate((sequences_1, sequences_2, extra_feature), axis=1)

# Labels: the last column of the dataset
labels = dataset.iloc[:, -1].values

# Split the *unscaled* features first, so the held-out rows play no part in
# computing the scaling statistics. The split is stratified on the labels and
# seeded for reproducibility.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    all_features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Fit the scaler on the training rows only, then reuse those statistics for
# the test rows. Fitting on the full matrix would leak test-set mean/variance
# into the training features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept under its original name in case later cells rely on it: the full
# feature matrix, scaled with the training-set statistics.
normalized_features = scaler.transform(all_features)

# Base estimator; the fixed seed keeps every fit reproducible
rf_model = RandomForestClassifier(random_state=42)

# Hyperparameter grid, searched exhaustively: 3 * 3 * 3 * 3 * 2 = 162 combinations
param_grid = {
    'n_estimators': [100, 200, 300],    # number of trees in the forest
    'max_depth': [10, 20, 30],          # maximum depth of each tree
    'min_samples_split': [2, 5, 10],    # minimum samples needed to split an internal node
    'min_samples_leaf': [1, 2, 4],      # minimum samples required at a leaf
    'bootstrap': [True, False],         # whether trees are built on bootstrap samples
}

# 5-fold cross-validated search over the grid; n_jobs=-1 runs fits in parallel
# on all available cores, verbose=2 prints progress while fitting.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination
print(f"Best parameters found: {grid_search.best_params_}")

# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been refit on all of X_train with the best parameters. Calling
# .fit() on it again would only rebuild the same seeded forest, so no extra
# training step is needed here.
best_rf_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test set
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Optionally, save the model for later use
#import joblib
#joblib.dump(best_rf_model, 'random_forest_model.pkl')


Fitting 5 folds for each of 162 candidates, totalling 810 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found: {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Test Accuracy: 0.7083
Classification Report:
               precision    recall  f1-score   support

        8PSK       0.62      0.81      0.70       480
        BPSK       0.98      0.79      0.88       480
       CPFSK       0.92      0.53      0.67       480
        GFSK       0.61      0.99      0.76       480
        QPSK       0.60      0.41      0.49       480

    accuracy                           0.71      2400
   macro avg       0.75      0.71      0.70      2400
weighted avg       0.75      0.71      0.70      2400



Best parameters found: {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Test Accuracy: 0.7083

Classification Report:
               precision    recall  f1-score   support

        8PSK       0.62      0.81      0.70       480
        BPSK       0.98      0.79      0.88       480
       CPFSK       0.92      0.53      0.67       480
        GFSK       0.61      0.99      0.76       480
        QPSK       0.60      0.41      0.49       480

    accuracy                           0.71      2400

   macro avg       0.75      0.71      0.70      2400
   
weighted avg       0.75      0.71      0.70      2400