In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    classification_report

# Load the label table and the feature table from the shared data directory.
labels = pd.read_csv('../data/Label.csv')
data = pd.read_csv('../data/Data.csv')

print("Data shape before processing:", data.shape)

# Count the null cells in every column, then report only the columns that
# actually contain at least one null.
missing_values_column = data.isnull().sum()
has_missing = missing_values_column > 0
if not has_missing.any():
    print("No missing values found in the data.")
else:
    print("Columns with missing values:")
    print(missing_values_column[has_missing])  # Display columns with missing values

# Drop columns with only one unique value: a constant feature carries no
# information that could separate the classes.
# nunique() ignores NaN by default, matching the per-column check this replaces.
# The constant columns are identified in one vectorised pass and removed with
# a single drop, instead of mutating `data` in place while looping over its
# columns (one full drop per constant column).
unique_counts = data.nunique()
constant_columns = unique_counts[unique_counts == 1].index.tolist()
data = data.drop(columns=constant_columns)
for col in constant_columns:
    print(f"Dropped column: {col}")

# Number of top-ranked features kept for modelling. This single constant
# drives both the selector's `k` and the manual ranking below, so the two can
# no longer disagree (previously k=10 while 30 features were actually kept).
N_TOP_FEATURES = 30

# Split the data into training and testing sets (80/20, fixed seed).
# NOTE(review): the split is not stratified; if the label distribution is
# imbalanced, consider passing stratify=labels['Label'] — confirm first that
# every class has at least two samples.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels['Label'], test_size=0.2, random_state=42
)

# Score every feature against the label with the chi-squared statistic.
# The selector is fitted on the training split only, so the test split does
# not influence which features are chosen. chi2 requires non-negative
# feature values. `k` is clamped so a frame with fewer columns than
# N_TOP_FEATURES does not trip the selector.
best_features = SelectKBest(
    score_func=chi2, k=min(N_TOP_FEATURES, X_train.shape[1])
)
fit = best_features.fit(X_train, y_train)

# Get the scores for each feature
dfscores = pd.DataFrame(fit.scores_, columns=['Score'])
dfcolumns = pd.DataFrame(X_train.columns, columns=['Feature'])

# Concatenate feature names and scores (both frames share a default
# RangeIndex, so rows line up positionally).
feature_scores = pd.concat([dfcolumns, dfscores], axis=1)

# Sort by score, best first, and keep the names of the top features.
# The `top_30_features` name is kept for backward compatibility; its length
# is governed by N_TOP_FEATURES.
feature_scores = feature_scores.sort_values(by='Score', ascending=False)
top_30_features = feature_scores.head(N_TOP_FEATURES)['Feature'].values

# Reduce the datasets to selected features (columns ordered by descending score)
X_train_reduced = X_train[top_30_features]
X_test_reduced = X_test[top_30_features]

print("Reduced X_train shape:", X_train_reduced.shape)
print("Reduced X_test shape:", X_test_reduced.shape)

# Initialize the base classifier that AdaBoost will boost: a 100-tree random
# forest with a fixed seed.
# NOTE(review): AdaBoost fits a fresh copy of this estimator in each boosting
# round, so this can train up to 50 forests of 100 trees each. Boosting is
# usually applied to weak learners such as a shallow DecisionTreeClassifier —
# confirm that boosting a full forest is intended.
base_estimator = RandomForestClassifier(n_estimators=100, random_state=42)

# Initialize the AdaBoost classifier
ada_classifier = AdaBoostClassifier(estimator=base_estimator, n_estimators=50, random_state=42)

# Fit the model on the reduced training features
ada_classifier.fit(X_train_reduced, y_train)

# Predict on the test data
y_pred = ada_classifier.predict(X_test_reduced)

# Build the class list from BOTH the true and the predicted labels and pass it
# explicitly. Deriving target_names from y_test alone makes
# classification_report raise a ValueError whenever the model predicts a class
# that does not occur in the test split (label count != target_names count).
class_labels = np.union1d(y_test, y_pred)

print("\nClassification Report:")
report = classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=[f"Class {i}" for i in class_labels],
)
print(report)

# Print accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Data shape before processing: (447915, 76)
No missing values found in the data.
Dropped column: Bwd PSH Flags
Dropped column: Fwd URG Flags
Dropped column: Bwd URG Flags
Dropped column: URG Flag Count
Dropped column: CWR Flag Count
Dropped column: ECE Flag Count
Dropped column: Fwd Bytes/Bulk Avg
Dropped column: Fwd Packet/Bulk Avg
Dropped column: Fwd Bulk Rate Avg
Reduced X_train shape: (358332, 30)
Reduced X_test shape: (89583, 30)


