In [1]:
from elasticsearch import Elasticsearch

# 1. Connect to your Ubuntu Server
es = Elasticsearch("http://10.0.2.5:9200")

# 2. Test the connection
try:
    if es.info():
        print("‚úÖ Connected to Elasticsearch!")
        print(f"Server Version: {es.info()['version']['number']}")
except Exception as e:
    print("‚ùå Error:", e)

# 3. List the Indices
try:
    indices = es.indices.get_alias(index="winlogbeat-*")
    print("Found Indices:", list(indices.keys()))
except Exception as e:
    print("Could not list indices (yet):", e)

‚úÖ Connected to Elasticsearch!
Server Version: 8.19.9
Found Indices: []


In [3]:
import pandas as pd             # <--- We added this line!
from elasticsearch import Elasticsearch

# 1. Re-connect (just to be safe)
es = Elasticsearch("http://10.0.2.5:9200")

# 2. Define the search query
query_body = {
    "query": {"match_all": {}},
    "size": 1000,
    "sort": [{"@timestamp": "desc"}]
}

# 3. Execute the search
response = es.search(index="winlogbeat-*", body=query_body)
print(f"üì• Found {len(response['hits']['hits'])} logs.")

# 4. Extract the clean data
data = []
for hit in response['hits']['hits']:
    source = hit['_source']
    
    row = {
        "timestamp": source.get("@timestamp"),
        "event_id": source.get("winlogbeat", {}).get("event_id"),
        "process_name": source.get("process", {}).get("name"),
        "command_line": source.get("process", {}).get("command_line"),
        "parent_process": source.get("process", {}).get("parent", {}).get("name")
    }
    data.append(row)

# 5. Convert to DataFrame
df = pd.DataFrame(data)

# 6. Display the table
print("‚úÖ Data Loaded Successfully!")
df.head()

üì• Found 1000 logs.
‚úÖ Data Loaded Successfully!


Unnamed: 0,timestamp,event_id,process_name,command_line,parent_process
0,2026-01-02T10:28:33.722Z,,,,
1,2026-01-02T10:28:26.078Z,,,,
2,2026-01-02T10:27:52.241Z,,,,
3,2026-01-02T10:27:18.879Z,,,,
4,2026-01-02T10:27:00.633Z,,,,


In [5]:
# 1. Smarter Query: Only get logs that HAVE a command line
query_body = {
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": "process.command_line"}}
            ]
        }
    },
    "size": 1000,  # Get the last 1000 *useful* logs
    "sort": [{"@timestamp": "desc"}]
}

# 2. Execute Search
response = es.search(index="winlogbeat-*", body=query_body)
print(f"üì• Found {len(response['hits']['hits'])} USEFUL logs.")

# 3. Extract Data (Same as before)
data = []
for hit in response['hits']['hits']:
    source = hit['_source']
    row = {
        "timestamp": source.get("@timestamp"),
        "process_name": source.get("process", {}).get("name"),
        "command_line": source.get("process", {}).get("command_line"),
        "user": source.get("user", {}).get("name")
    }
    data.append(row)

# 4. Create DataFrame & Label it immediately
df = pd.DataFrame(data)

# Re-apply the labeling function
def label_log(cmd):
    cmd = str(cmd).lower()
    if "powershell" in cmd and "hidden" in cmd: return 1
    if "reg add" in cmd and "calc.exe" in cmd: return 1
    if "whoami" in cmd or "net user" in cmd: return 1
    if "reg save" in cmd: return 1
    if "encodedcommand" in cmd: return 1
    if "certutil" in cmd and "urlcache" in cmd: return 1
    return 0

df['label'] = df['command_line'].apply(label_log)

# 5. Show the Scoreboard
print("\nüìä Dataset Balance:")
print(df['label'].value_counts())

üì• Found 1000 USEFUL logs.

üìä Dataset Balance:
label
0    997
1      3
Name: count, dtype: int64


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. Oversampling (The "Photocopy" Trick)
# We separate the bad logs and copy them 50 times so the AI notices them.
df_normal = df[df['label'] == 0]
df_attack = df[df['label'] == 1]

# We duplicate the attack rows to boost their numbers
df_attack_boosted = pd.concat([df_attack] * 50, ignore_index=True)

# Combine them back together
df_balanced = pd.concat([df_normal, df_attack_boosted])

print(f"üìä New Balance -> Normal: {len(df_normal)}, Attack: {len(df_attack_boosted)}")

# 2. Feature Extraction (Text -> Math)
# We use TF-IDF (Term Frequency-Inverse Document Frequency)
# It turns words like "hidden" or "powershell" into important numbers.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df_balanced['command_line'])
y = df_balanced['label']

# 3. Split Data (80% for Training, 20% for Testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Train the Brain (Random Forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 5. Test the Results
y_pred = model.predict(X_test)

print("\nüèÜ Model Training Complete!")
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nDetailed Report:")
print(classification_report(y_test, y_pred))

üìä New Balance -> Normal: 997, Attack: 150

üèÜ Model Training Complete!
Accuracy: 100.00%

Detailed Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       204
           1       1.00      1.00      1.00        26

    accuracy                           1.00       230
   macro avg       1.00      1.00      1.00       230
weighted avg       1.00      1.00      1.00       230



In [7]:
import joblib

# 1. Save the files to your folder
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(model, 'random_forest_model.pkl')

print("‚úÖ AI Model saved successfully to disk!")

‚úÖ AI Model saved successfully to disk!


In [8]:
# 1. Load the saved brain (simulating a new day)
loaded_vec = joblib.load('tfidf_vectorizer.pkl')
loaded_model = joblib.load('random_forest_model.pkl')

# 2. Simulate a suspicious command
# (A hacker trying to hide powershell, similar to Attack #1)
new_command = ["powershell.exe -windowstyle hidden -command 'Invoke-WebRequest...'"]

# 3. Translate it to math
new_command_vector = loaded_vec.transform(new_command)

# 4. Ask the AI: "Is this safe?"
prediction = loaded_model.predict(new_command_vector)

if prediction[0] == 1:
    print("üö® ALERT: Malicious Command Detected!")
else:
    print("‚úÖ Safe Command.")

‚úÖ Safe Command.
