# Imports

In [6]:
# Install gensim library
!pip install gensim



In [11]:
# Imports
from google.colab import drive
import random
import pickle
import os
import numpy as np
import pandas as pd
import tensorflow as tf
tf.random.set_seed(42)

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

from gensim.models import Word2Vec

In [12]:
# Mount drive
drive.mount('/content/drive')

# Define project path, example
PATH="/content/drive/MyDrive/CS171Project"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Neural Networks

## Preprocess Data

In [13]:
# Load all API calls
with open(PATH + "/data/APICalls.txt", "r") as f:
    api_calls = [line.strip() for line in f]

# Encode each API call to an integer
encoded_api_calls = {api_call: idx + 1 for idx, api_call in enumerate(api_calls)}

# Map each family to an integer
encoded_malware_family = {
    "adload": 0,
    "bancos": 1,
    "onlinegames": 2,
    "vbinject": 3,
    "vundo": 4,
    "winwebsec": 5,
    "zwangi": 6
}

In [None]:
# Path to folder containing malware families
root_dir = PATH + "/data/malwares"

encoded_dataset = []  # List of encoded sequences object
vector_dataset = []   # List of vectorized sequences object

# Loop through each malware family
for family_name in os.listdir(root_dir):

    # Loop through each file in malware family
    family_path = os.path.join(root_dir, family_name)
    for file_name in os.listdir(family_path):

        # Open file and get its list of API calls
        file_path = os.path.join(family_path, file_name)
        with open(file_path, "r") as f:
            sequence = [line.strip() for line in f if line.strip()]

        # Encode the file's list of API calls
        encoded_sequence = [encoded_api_calls[api] for api in sequence
                            if api in encoded_api_calls]

        # Add the encoded list and its family as an object
        encoded_dataset.append({
            "family": encoded_malware_family[family_name],
            "sequence": encoded_sequence
        })

        # Add plain the unencoded list API calls for different preprocessing
        vector_dataset.append({
            "family": encoded_malware_family[family_name],
            "sequence": sequence
        })

In [18]:
# Get ideal length of sequence (leave out outliers)
file_lengths = [len(sample["sequence"]) for sample in encoded_dataset]
max_length = int(np.percentile(file_lengths, 90))

# Path for storing lists
os.makedirs(PATH + "/compiled", exist_ok=True)

### Encoded Dataset

In [19]:
# Truncate or pad sequences based on ideal length
def pad_or_truncate(seq, max_length):
    if len(seq) > max_length:
        return seq[:max_length] # Truncate
    else:
        return seq + [0]* (max_length - len(seq)) # Pad with 0's

for sample in encoded_dataset:
    sample["sequence"] = pad_or_truncate(sample["sequence"], max_length)

# Shuffle samples and save
random.shuffle(encoded_dataset)

# Store encoded dataset
with open(PATH + "/compiled/encoded_dataset.pkl", "wb") as f:
    pickle.dump(encoded_dataset, f)

### Vector Dataset

In [20]:
# Extract API call sequences to a list
sequences = [sample["sequence"] for sample in vector_dataset]

# Train the Word2Vec model on API call sequences
word2vec_model = Word2Vec(sequences, size=100, window=5, min_count=1, workers=4)



In [21]:
# Embedding dimension
embedding_dim = word2vec_model.vector_size

# Function for converting an API call sequence to word2vec embeddings, pad as needed
def sequence_to_matrix(seq, model, max_length):
    mat = []
    for api in seq[:max_length]:
        if api in model.wv:
            mat.append(model.wv[api])
        else:
            mat.append(np.zeros(embedding_dim))
    while len(mat) < max_length:
        mat.append(np.zeros(embedding_dim))
    return np.stack(mat)

# For each sample, convert that sample's sequence into a matrix of embeddings
vector_data = [
    (sequence_to_matrix(sample["sequence"], word2vec_model, max_length), sample["family"])
    for sample in vector_dataset
]

In [22]:
# Shuffle samples and save
random.shuffle(vector_data)
X_mat = np.stack([x for x, _ in vector_data])
y     = np.array([y for _, y in vector_data])

with open(PATH + "/compiled/vector_dataset.pkl", "wb") as f:
    pickle.dump((X_mat, y), f)

## Read Processed Data

In [23]:
# Load encoded dataset
with open(PATH + "/compiled/encoded_dataset.pkl", "rb") as f:
    encoded_dataset = pickle.load(f)

# Features and targets
X_encoded = [sample["sequence"] for sample in encoded_dataset]
y_encoded = [sample["family"] for sample in encoded_dataset]

# Get sequence length
sequence_length = len(X_encoded[0])

# Reshape to (samples, timesteps, features)
X_encoded = np.array(X_encoded).reshape(len(X_encoded), sequence_length, 1)

# Split data into training, validation, and testing
X_train_encoded, X_temp, y_train_encoded, y_temp = train_test_split(
    X_encoded, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)
X_val_encoded, X_test_encoded, y_val_encoded, y_test_encoded = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp
)

# Convert labels to one-hot encoding
y_train_encoded = tf.keras.utils.to_categorical(y_train_encoded)
y_val_encoded = tf.keras.utils.to_categorical(y_val_encoded)
y_test_encoded = tf.keras.utils.to_categorical(y_test_encoded)

In [24]:
# Load vector dataset
with open(PATH + "/compiled/vector_dataset.pkl", "rb") as f:
    X_mat, y = pickle.load(f)

# Split data into training, validation, and testing
y_cat = tf.keras.utils.to_categorical(y, num_classes=7)
X_train_vector, X_temp, y_train_vector, y_temp = train_test_split(
    X_mat, y_cat,
    test_size=0.2,
    random_state=42,
    stratify=y_cat
)
X_val_vector, X_test_vector, y_val_vector, y_test_vector = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp
)

## Define CNN, RNN, LSTM

In [25]:
# Function for creating a CNN model
def create_cnn_model(input_shape, num_classes):
    # Define model
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv1D(256,                         # Number of features/patterns
                               kernel_size=16,              # Sliding window size
                               activation='relu',
                               input_shape=input_shape),
        tf.keras.layers.MaxPooling1D(pool_size=4),          # Pooling, make samples generic -> less overfitting
        tf.keras.layers.Flatten(),                          # Turns to 1D array so that dense layer can take it
        tf.keras.layers.Dense(256, activation='relu'),      # Hidden layer
        tf.keras.layers.Dropout(0.2),                       # Drops every 5th sample, forces model to learn without it
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    # Compile model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [26]:
# Function for creating a RNN model
def create_rnn_model(input_shape, num_classes):
    # Define model
    model = tf.keras.models.Sequential([
        tf.keras.layers.SimpleRNN(
            256,
            input_shape=input_shape,
            return_sequences=False
        ),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    # Compile model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [27]:
# Function for creating an LSTM model
def create_lstm_model(input_shape, num_classes):
    # Define model
    model = tf.keras.models.Sequential([
        tf.keras.layers.LSTM(256, input_shape=input_shape, return_sequences=False),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    # Compile model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

## Train and Test Models

In [28]:
# Function for training and evaluating neural network
def train_and_evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test):
    # Condition for early stopping
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=3,
                                                  restore_best_weights=True)
    # Fit the model
    history = model.fit(
        X_train, y_train,
        epochs=30,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[early_stop]
    )

    # Print accuracies
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Training Accuracy: {history.history['accuracy'][-1]:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")

### Using Encoded Dataset

In [29]:
# Shapes of encoded features and labels
input_shape_encoded = X_train_encoded.shape[1:]
num_classes_encoded = y_train_encoded.shape[1]

In [30]:
# CNN with encoded dataset
encoded_cnn_model = create_cnn_model(input_shape_encoded, num_classes_encoded)

train_and_evaluate_model(
    encoded_cnn_model,
    X_train_encoded, y_train_encoded,
    X_val_encoded, y_val_encoded,
    X_test_encoded, y_test_encoded
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 321ms/step - accuracy: 0.3551 - loss: 7.3346 - val_accuracy: 0.4211 - val_loss: 2.1599
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 277ms/step - accuracy: 0.6240 - loss: 1.3908 - val_accuracy: 0.6316 - val_loss: 0.9044
Epoch 3/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 307ms/step - accuracy: 0.8370 - loss: 0.6248 - val_accuracy: 0.7895 - val_loss: 0.5698
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 266ms/step - accuracy: 0.8946 - loss: 0.3149 - val_accuracy: 0.8421 - val_loss: 0.4766
Epoch 5/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 251ms/step - accuracy: 0.9627 - loss: 0.1367 - val_accuracy: 0.8158 - val_loss: 0.4834
Epoch 6/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 263ms/step - accuracy: 0.9867 - loss: 0.0776 - val_accuracy: 0.8158 - val_loss: 0.5611
Epoch 7/30
[1m10/10[0m [3

In [31]:
# RNN with encoded dataset
encoded_rnn_model = create_rnn_model(input_shape_encoded, num_classes_encoded)

train_and_evaluate_model(
    encoded_rnn_model,
    X_train_encoded, y_train_encoded,
    X_val_encoded, y_val_encoded,
    X_test_encoded, y_test_encoded
)

Epoch 1/30


  super().__init__(**kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 572ms/step - accuracy: 0.2164 - loss: 1.9315 - val_accuracy: 0.2368 - val_loss: 1.8064
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 532ms/step - accuracy: 0.1920 - loss: 1.8147 - val_accuracy: 0.2632 - val_loss: 1.6829
Epoch 3/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 685ms/step - accuracy: 0.2953 - loss: 1.7122 - val_accuracy: 0.2632 - val_loss: 1.6776
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 547ms/step - accuracy: 0.2541 - loss: 1.7224 - val_accuracy: 0.2632 - val_loss: 1.6838
Epoch 5/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 806ms/step - accuracy: 0.2084 - loss: 1.7920 - val_accuracy: 0.2632 - val_loss: 1.6611
Epoch 6/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 618ms/step - accuracy: 0.2914 - loss: 1.7144 - val_accuracy: 0.2632 - val_loss: 1.6536
Epoch 7/30
[1m10/10[0m [32m━━━━━━━━

In [32]:
# LSTM with encoded dataset
encoded_lstm_model = create_lstm_model(input_shape_encoded, num_classes_encoded)

train_and_evaluate_model(
    encoded_lstm_model,
    X_train_encoded, y_train_encoded,
    X_val_encoded, y_val_encoded,
    X_test_encoded, y_test_encoded
)

Epoch 1/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2s/step - accuracy: 0.1478 - loss: 1.9139 - val_accuracy: 0.2632 - val_loss: 1.8133
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2s/step - accuracy: 0.2574 - loss: 1.8450 - val_accuracy: 0.2632 - val_loss: 1.7955
Epoch 3/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2s/step - accuracy: 0.2787 - loss: 1.8033 - val_accuracy: 0.2632 - val_loss: 1.7340
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2s/step - accuracy: 0.2535 - loss: 1.7280 - val_accuracy: 0.2632 - val_loss: 1.6518
Epoch 5/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2s/step - accuracy: 0.2438 - loss: 1.6964 - val_accuracy: 0.2632 - val_loss: 1.6587
Epoch 6/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2s/step - accuracy: 0.2415 - loss: 1.6725 - val_accuracy: 0.2632 - val_loss: 1.6456
Epoch 7/30
[1m10/10[0m [32m━━━━━━━━━━

### Using Vector Dataset

In [33]:
# Shapes of vectorized features and labels
input_shape_vector = X_train_vector.shape[1:]  # (max_length, embedding_dim)
num_classes_vector = y_train_vector.shape[1]   # 7

In [34]:
# CNN with vector dataset
vector_cnn_model = create_cnn_model(input_shape_vector, num_classes_vector)

train_and_evaluate_model(
    vector_cnn_model,
    X_train_vector, y_train_vector,
    X_val_vector, y_val_vector,
    X_test_vector, y_test_vector
)

Epoch 1/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 944ms/step - accuracy: 0.3681 - loss: 3.6655 - val_accuracy: 0.6316 - val_loss: 1.2600
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1s/step - accuracy: 0.7029 - loss: 0.8441 - val_accuracy: 0.7368 - val_loss: 0.8425
Epoch 3/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1s/step - accuracy: 0.8458 - loss: 0.4936 - val_accuracy: 0.7368 - val_loss: 1.1296
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 1s/step - accuracy: 0.8911 - loss: 0.3413 - val_accuracy: 0.7368 - val_loss: 0.8641
Epoch 5/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 985ms/step - accuracy: 0.8913 - loss: 0.2611 - val_accuracy: 0.7105 - val_loss: 1.0813
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 123ms/step - accuracy: 0.6678 - loss: 0.8039
Training Accuracy: 0.8977
Testing Accuracy: 0.6579


In [35]:
# RNN with vector dataset
vector_rnn_model = create_rnn_model(input_shape_vector, num_classes_vector)

train_and_evaluate_model(
    vector_rnn_model,
    X_train_vector, y_train_vector,
    X_val_vector, y_val_vector,
    X_test_vector, y_test_vector
)

Epoch 1/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1s/step - accuracy: 0.2000 - loss: 1.8818 - val_accuracy: 0.2368 - val_loss: 1.8492
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 899ms/step - accuracy: 0.2668 - loss: 1.7296 - val_accuracy: 0.2632 - val_loss: 1.8539
Epoch 3/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 654ms/step - accuracy: 0.2363 - loss: 1.7768 - val_accuracy: 0.3158 - val_loss: 1.7331
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 886ms/step - accuracy: 0.2294 - loss: 1.6771 - val_accuracy: 0.2632 - val_loss: 1.7599
Epoch 5/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 817ms/step - accuracy: 0.1990 - loss: 1.7090 - val_accuracy: 0.2895 - val_loss: 1.6965
Epoch 6/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 670ms/step - accuracy: 0.2663 - loss: 1.6447 - val_accuracy: 0.2368 - val_loss: 1.7066
Epoch 7/30
[1m10/10[0m [32

In [36]:
# LSTM with vector dataset
vector_lstm_model = create_lstm_model(input_shape_vector, num_classes_vector)

train_and_evaluate_model(
    vector_lstm_model,
    X_train_vector, y_train_vector,
    X_val_vector, y_val_vector,
    X_test_vector, y_test_vector
)

Epoch 1/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 2s/step - accuracy: 0.2009 - loss: 1.8825 - val_accuracy: 0.2368 - val_loss: 1.8423
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 2s/step - accuracy: 0.2615 - loss: 1.7870 - val_accuracy: 0.2368 - val_loss: 1.8269
Epoch 3/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 3s/step - accuracy: 0.2808 - loss: 1.7238 - val_accuracy: 0.2895 - val_loss: 1.7130
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 2s/step - accuracy: 0.2334 - loss: 1.6036 - val_accuracy: 0.2368 - val_loss: 1.7002
Epoch 5/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 2s/step - accuracy: 0.2714 - loss: 1.6095 - val_accuracy: 0.2895 - val_loss: 1.7010
Epoch 6/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 4s/step - accuracy: 0.2761 - loss: 1.5862 - val_accuracy: 0.2368 - val_loss: 1.7277
Epoch 7/30
[1m10/10[0m [32m━━━━━━━━━━

# Traditional Models

## Preprocess Data

In [37]:
# Load all API calls
with open(PATH + "/data/APICalls.txt", "r") as f:
    api_calls = [line.strip() for line in f]

# Create mapping of API call to index
api_index_map = {api: idx for idx, api in enumerate(api_calls)}

# Malware family encoding
encoded_malware_family = {
    "adload": 0,
    "bancos": 1,
    "onlinegames": 2,
    "vbinject": 3,
    "vundo": 4,
    "winwebsec": 5,
    "zwangi": 6
}

# Root directory containing malware families
root_dir = PATH + "/data/malwares"

# List for feature vectors and labels
feature_vectors = []
labels = []

# Go through each malware family directory
for family_name in os.listdir(root_dir):

    # Loop through each malware file
    family_path = os.path.join(root_dir, family_name)
    for file_name in os.listdir(family_path):
        file_path = os.path.join(family_path, file_name)

        # Initialize zero vector for this file
        vector = np.zeros(len(api_calls), dtype=int)

        # Read file and count API calls
        with open(file_path, "r") as f:
            for line in f:
                api = line.strip()
                if api in api_index_map:
                    vector[api_index_map[api]] += 1

        # Append vector and label
        feature_vectors.append(vector)
        labels.append(encoded_malware_family[family_name])

# Convert to DataFrame
df = pd.DataFrame(feature_vectors, columns=api_calls)
df["label"] = labels

# Save to CSV
df.to_csv(PATH + "/compiled/api_call_counts.csv", index=False)

## Read Processed Data

In [38]:
# Load the CSV
df = pd.read_csv(PATH + "/compiled/api_call_counts.csv")

# Separate features and labels
X = df.drop("label", axis=1)
y = df["label"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


## Support Vector Machine

In [45]:
# SVM pipeline
svm_pipeline = make_pipeline(
    StandardScaler(),
    SVC(random_state=42)
)

# Grid search parameters
svm_param_grid = {
    'svc__kernel': ['linear'],
    'svc__C': list(np.arange(5, 10, 0.01)),
    'svc__class_weight': [None, 'balanced'],
}

# Grid search with 5-fold cross-validation
svm_grid_search = GridSearchCV(svm_pipeline, svm_param_grid, cv=5, n_jobs=-1, verbose=2)
svm_grid_search.fit(X_train, y_train)

# Best model
best_svm_model = svm_grid_search.best_estimator_

print("Best parameters found:", svm_grid_search.best_params_)
print(f"Training Accuracy: {best_svm_model.score(X_train, y_train):.4f}")
print(f"Testing Accuracy:  {best_svm_model.score(X_test, y_test):.4f}")

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Best parameters found: {'svc__C': 7.549999999999946, 'svc__class_weight': None, 'svc__kernel': 'linear'}
Training Accuracy: 0.9822
Testing Accuracy:  0.8990


## Random Forest Classifier

In [40]:
# RFC pipeline
rfc_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=42)
)

# Grid search parameters
rfc_param_grid = {
    'randomforestclassifier__n_estimators': list(np.arange(250, 271, 2)),
    'randomforestclassifier__max_depth': list(np.arange(10, 21, 1)),
    'randomforestclassifier__min_samples_split': [2],
    'randomforestclassifier__min_samples_leaf': [1],
    'randomforestclassifier__class_weight': ['balanced']
}

# Grid search with 5-fold cross-validation
rfc_grid_search = GridSearchCV(rfc_pipeline, rfc_param_grid, cv=5, n_jobs=-1, verbose=2)
rfc_grid_search.fit(X_train, y_train)

# Best model
best_rfc_model = rfc_grid_search.best_estimator_

print("Best parameters found:", rfc_grid_search.best_params_)
print(f"Training Accuracy: {best_rfc_model.score(X_train, y_train):.4f}")
print(f"Testing Accuracy:  {best_rfc_model.score(X_test, y_test):.4f}")

Fitting 5 folds for each of 121 candidates, totalling 605 fits
Best parameters found: {'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__max_depth': 16, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__min_samples_split': 2, 'randomforestclassifier__n_estimators': 258}
Training Accuracy: 0.9975
Testing Accuracy:  0.9495


## Logistic Regression

In [41]:
# Logistic Regression pipeline
logreg_pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, random_state=42)
)

# Grid search parameters
logreg_param_grid = {
    'logisticregression__C': list(np.arange(0.1, 3.1, 0.2)),
    'logisticregression__penalty': ['l2'],
    'logisticregression__class_weight': ['balanced'],
    'logisticregression__solver': ['lbfgs', 'newton-cg']
}

# Grid search with 5-fold cross-validation
logreg_grid_search = GridSearchCV(logreg_pipeline, logreg_param_grid, cv=5, n_jobs=-1, verbose=2)
logreg_grid_search.fit(X_train, y_train)

# Best model
best_logreg_model = logreg_grid_search.best_estimator_

# Accuracy
print("Best parameters found:", logreg_grid_search.best_params_)
print(f"Training Accuracy: {best_logreg_model.score(X_train, y_train):.4f}")
print(f"Testing Accuracy:  {best_logreg_model.score(X_test, y_test):.4f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters found: {'logisticregression__C': 1.1000000000000003, 'logisticregression__class_weight': 'balanced', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'lbfgs'}
Training Accuracy: 0.9517
Testing Accuracy:  0.9091
