# LLM Notebook

# Dependencies

In [None]:
pip install transformers

In [None]:
pip install datasets

In [None]:
pip install --upgrade pandas

In [None]:
pip install evaluate

In [None]:
pip install torch==2.2.1

# CSV FILE SETUP

In [None]:
# Notebook-wide settings: input file, column names, checkpoint and split size.
csv = "Fileless-ResultsNew2.csv"  # Path of the CSV file that holds the dataset
text_column_name = "Name"  # Text column "Name"
text_column_name_1 = "pslist"  # Text column "pslist"
text_column_name_2 = "pstree"  # Text column "pstree"
text_column_name_3 = "psxview"  # Text column "psxview"
text_column_name_4 = "dlllist"  # Text column "dlllist"
text_column_name_5 = "handles"  # Text column "handles"
text_column_name_6 = "ldrmodules"  # Text column "ldrmodules"
text_column_name_7 = "svcscan"  # Text column "svcscan"
text_column_name_8 = "cmdline"  # Text column "cmdline"
label = "Category"  # Column holding the class name that gets label-encoded
model_name = "ehsanaghaei/SecureBERT"  # Hugging Face checkpoint for tokenizer and model
test_size = 0.2  # Fraction of rows held out for testing (20% test / 80% train)
num_labels = 2  # Number of classes: Fileless and Benign

# Load and Preview the CSV File

In [None]:
import pandas as pd

In [None]:
# Read the CSV file named by `csv` into a DataFrame
df = pd.read_csv(csv)

In [None]:
# Show the first rows as a quick sanity check of the loaded data
df.head()

# Label Encoding

In [None]:
pip install scikit-learn

In [None]:
from sklearn import preprocessing

In [None]:
# Turn every distinct class name in the `label` column into an integer id and
# store the ids in a new 'label' column; `le` keeps the fitted mapping.
le = preprocessing.LabelEncoder()
df['label'] = le.fit_transform(df[label].tolist())

In [None]:
# Preview again to confirm the integer 'label' column was added
df.head()

# Split Data Train and Test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out `test_size` of the rows for testing. `stratify` keeps the Category
# proportions the same in both splits. The fixed `random_state` makes the
# split, and therefore every metric computed from it, repeatable between runs.
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    stratify=df['Category'],
    random_state=42,
)

In [None]:
# Drop only the raw Category string. The encoded 'label' column has to stay:
# this frame becomes the Trainer's eval_dataset, and without labels the
# per-epoch evaluation cannot compute a loss or call compute_metrics.
test_features = df_test.drop(columns=['Category'])

In [None]:
from datasets import Dataset

In [None]:
# Wrap both pandas splits as Hugging Face Datasets (train first, then test)
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, test_features)
)

# Tokenize All Columns

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer published with the `model_name` checkpoint; it is the
# tokenizer that preprocess_function calls.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Join the text columns of a batch into one string per row and tokenize it.

    Written for ``Dataset.map(..., batched=True)``: ``examples`` maps each
    column name to a list holding one value per row of the batch.

    Args:
        examples: Batch mapping that contains the columns "Name", "pslist",
            "pstree", "psxview", "dlllist", "handles", "ldrmodules",
            "svcscan" and "cmdline".

    Returns:
        A dict with the keys ``input_ids``, ``attention_mask`` and
        ``token_type_ids``, each holding one entry per row.
    """
    columns = (
        "Name", "pslist", "pstree", "psxview", "dlllist",
        "handles", "ldrmodules", "svcscan", "cmdline",
    )

    def as_text(value):
        # A missing cell may arrive as None (or as a float NaN, the only value
        # that is not equal to itself). Formatting it directly would feed the
        # literal word "None"/"nan" to the model, so use an empty field instead.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    # Same layout as before: fields separated by single spaces, followed by
    # two trailing spaces.
    combined_texts = [
        " ".join(as_text(value) for value in row) + "  "
        for row in zip(*(examples[column] for column in columns))
    ]

    encoding = tokenizer(
        combined_texts,
        padding='max_length',
        truncation=True,
        return_tensors=None,  # keep plain Python lists so map() can store them
        return_attention_mask=True,
        return_token_type_ids=True,
    )

    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        # Fall back to a placeholder per row if the tokenizer returns no type ids.
        'token_type_ids': encoding.get('token_type_ids', [None] * len(combined_texts)),
    }

In [None]:
# Tokenize the training split, passing rows to preprocess_function in batches
tokenized_train = train_dataset.map(preprocess_function, batched=True)

In [None]:
# Tokenize the test split the same way as the training split
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Label Mapping and Model Setup

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Readable class names for the two class ids, plus the reverse lookup
id2label = dict(enumerate(("Benign", "Fileless")))
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# Load the pretrained checkpoint for sequence classification. The number of
# classes comes from the `num_labels` setting instead of a second hard-coded
# 2, so the configuration cell stays the single place to change it.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Training Model

In [None]:
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

In [None]:
# Collator that uses `tokenizer` to pad the examples of each batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Accuracy metric from the `evaluate` library; compute_metrics calls it
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Pair of ``(logits, labels)`` handed over by the Trainer.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
pip install accelerate -U

In [None]:
# Fine-tuning hyperparameters; the Trainer writes its outputs under ./result.
training_args = TrainingArguments(
    output_dir="./result",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and log once per epoch.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy = "epoch",
    logging_strategy="epoch",
)

# Bundle the model, both tokenized splits, the tokenizer, the padding collator
# and the metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics

)


In [None]:
# Run fine-tuning with the settings given to the Trainer
trainer.train()

In [None]:
# Save the fine-tuned model under the local 'Fileless' directory
trainer.save_model('Fileless')

In [None]:
# Peek at the first 11 rows of the (untokenized) test dataset
print(test_dataset[:11])

In [None]:
# Run the trained model over the tokenized test split
predictions = trainer.predict(test_dataset=tokenized_test)

# Print the raw per-class model outputs; they are turned into class ids with
# argmax in the results cell
print(predictions.predictions)


# Prediction Results

In [None]:
# The predicted class of each row is the index of its highest score.
predicted_class_ids = np.argmax(predictions.predictions, axis=-1)
id2label = {0: "Benign", 1: "Fileless"}
predicted_labels = [id2label[class_id] for class_id in predicted_class_ids]
# Print every test row next to its predicted class name. The loop variable is
# deliberately not called `label`: notebook loop variables are globals, and
# reusing that name would overwrite the `label = "Category"` setting that the
# label-encoding cell reads, breaking any re-run of that cell.
for row, predicted_label in zip(test_dataset, predicted_labels):
    print(f"Text: {row}\nPredicted Label: {predicted_label}\n")
