In [None]:
import numpy as np
import tensorflow as tf
import csv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from imblearn.over_sampling import RandomOverSampler


2024-11-07 16:51:14.043438: E external/local_xla/xla/stream_executor/plugin_registry.cc:91] Invalid plugin kind specified: FFT
2024-11-07 16:51:14.073812: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-07 16:51:14.345439: E external/local_xla/xla/stream_executor/plugin_registry.cc:91] Invalid plugin kind specified: DNN


In [None]:

# Load and combine data from multiple CSV files
def load_data(file_paths):
    data = []
    labels = []
    max_features = 0
    file_data = []

    # First pass: Read all files and find the maximum number of feature columns
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            reader = csv.reader(f)
            header = next(reader)

            # Identify the "malware_type" column
            malware_type_index = header.index("malware_type")

            for row in reader:
                label = row[malware_type_index]
                features = [
                    float(val) if val else 0.0
                    for i, val in enumerate(row) if i != malware_type_index
                ]
                max_features = max(max_features, len(features))
                file_data.append((features, label))

    # Second pass: Pad rows to max_features length
    data, labels = [], []
    for features, label in file_data:
        if len(features) < max_features:
            features += [0.0] * (max_features - len(features))
        data.append(features)
        labels.append(label)
    
    return np.array(data), np.array(labels)

# Define file paths
file_paths = [
    "/workspace/Ngram2/AdwareJson.csv",
    "/workspace/Ngram2/BenignJson.csv",
    "/workspace/Ngram2/BankingwareJson.csv",
    "/workspace/Ngram2/RiskwareJson.csv",
    "/workspace/Ngram2/SmswareJson.csv"
]

# Load data
X, y = load_data(file_paths)

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Balance the classes using Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y_encoded)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, output_dict=True)

# Print overall accuracy
print(f"Overall Accuracy: {accuracy:.4f}")

# Print detailed report for each malware type
for malware_type, metrics in report.items():
    if malware_type in label_encoder.classes_:  # Skip 'accuracy', 'macro avg', 'weighted avg' keys
        print(f"\nReport for {malware_type}:")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall:    {metrics['recall']:.4f}")
        print(f"  F1-Score:  {metrics['f1-score']:.4f}")
        print(f"  Support:   {metrics['support']}")
        
# Print macro and weighted averages
print("\nMacro Average:")
print(f"  Precision: {report['macro avg']['precision']:.4f}")
print(f"  Recall:    {report['macro avg']['recall']:.4f}")
print(f"  F1-Score:  {report['macro avg']['f1-score']:.4f}")

print("\nWeighted Average:")
print(f"  Precision: {report['weighted avg']['precision']:.4f}")
print(f"  Recall:    {report['weighted avg']['recall']:.4f}")
print(f"  F1-Score:  {report['weighted avg']['f1-score']:.4f}")


Overall Accuracy: 0.9951

Report for AdwareJson:
  Precision: 1.0000
  Recall:    0.9962
  F1-Score:  0.9981
  Support:   782.0

Report for BankingwareJson:
  Precision: 0.9987
  Recall:    0.9937
  F1-Score:  0.9962
  Support:   794.0

Report for BenignJson:
  Precision: 0.9871
  Recall:    0.9948
  F1-Score:  0.9910
  Support:   772.0

Report for RiskwareJson:
  Precision: 0.9896
  Recall:    0.9974
  F1-Score:  0.9935
  Support:   765.0

Report for SmswareJson:
  Precision: 1.0000
  Recall:    0.9937
  F1-Score:  0.9968
  Support:   791.0

Macro Average:
  Precision: 0.9951
  Recall:    0.9951
  F1-Score:  0.9951

Weighted Average:
  Precision: 0.9952
  Recall:    0.9951
  F1-Score:  0.9951
