In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
import pickle

In [6]:
# Function to load and preprocess individual CSV files
def load_and_preprocess_csvs(file_access_path, network_traffic_path, system_performance_path, user_behavior_path):
    # Load CSVs
    file_access = pd.read_csv(file_access_path)
    network_traffic = pd.read_csv(network_traffic_path)
    system_performance = pd.read_csv(system_performance_path)
    user_behavior = pd.read_csv(user_behavior_path)
    
    # Check for 'Timestamp' column and merge datasets
    for df, name in zip([file_access, network_traffic, system_performance, user_behavior], 
                        ['file_access', 'network_traffic', 'system_performance', 'user_behavior']):
        if 'Timestamp' not in df.columns:
            raise KeyError(f"'Timestamp' column missing in {name}. Please check the CSV file.")
    
    # Merge datasets on 'Timestamp' column
    df = pd.merge(file_access, network_traffic, on='Timestamp', how='outer', suffixes=('_file', '_network'))
    df = pd.merge(df, system_performance, on='Timestamp', how='outer', suffixes=('_network', '_system'))
    df = pd.merge(df, user_behavior, on='Timestamp', how='outer', suffixes=('_system', '_user'))

    # Handle multiple 'Label' columns
    df['Label'] = df.get('Label_file', df.get('Label_network', 0))  # Default to 0 if no 'Label' found
    df.drop(columns=[col for col in df.columns if 'Label_' in col], inplace=True)
    
    # Handle missing values by filling with 0 or an appropriate placeholder
    df.fillna(0, inplace=True)

    # Standardize data types before encoding
    for col in ['File_Operation', 'User']:
        if col in df.columns:
            df[col] = df[col].astype(str)
    
    # Encode categorical variables
    label_encoder = LabelEncoder()
    if 'File_Operation' in df.columns:
        df['File_Operation'] = label_encoder.fit_transform(df['File_Operation'])
    if 'User' in df.columns:
        df['User'] = label_encoder.fit_transform(df['User'])
    
    return df, label_encoder

In [7]:
# Function to train a model and save it to disk
def train_and_save_model(df, label_encoder, model_path, scaler_path):
    # Define feature columns, ensuring they exist in df
    feature_columns = [col for col in [
        'File_Operation', 'File_Size_MB', 'Network_Bytes', 'CPU_Usage', 'Disk_IO', 
        'Memory_Usage', 'Login_Success', 'Privilege_Escalation'] if col in df.columns]
    
    # Separate features (X) and labels (y)
    X = df[feature_columns]
    y = df['Label']  # Assuming 'Label' is the column that marks ransomware (1) or not (0)
    
    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
    
    # Train the Random Forest model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    print("Model Performance on Test Data:")
    print(classification_report(y_test, y_pred))
    
    # Save the trained model and scaler to files
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    
    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f)

    # Save label encoder (for categorical features) if used
    with open('label_encoder.pkl', 'wb') as f:
        pickle.dump(label_encoder, f)

In [8]:
# Main execution (train and save model)
if __name__ == "__main__":
    # File paths for training
    file_access_path = 'file_access_3.0.csv'
    network_traffic_path = 'network_traffic_3.0.csv'
    system_performance_path = 'system_performance_3.0.csv'
    user_behavior_path = 'user_behavior_3.0.csv'
    
    # Load and preprocess the data
    combined_df, label_encoder = load_and_preprocess_csvs(file_access_path, network_traffic_path, system_performance_path, user_behavior_path)
    
    # Train and save the model
    train_and_save_model(combined_df, label_encoder, 'ransomware_model.pkl', 'scaler.pkl')

Model Performance on Test Data:
              precision    recall  f1-score   support

         0.0       0.94      0.93      0.93       818
         1.0       0.74      0.76      0.75       221

    accuracy                           0.89      1039
   macro avg       0.84      0.85      0.84      1039
weighted avg       0.90      0.89      0.89      1039

