In [None]:
# Import libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, download
import re

# Download NLTK data (run once)
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# newer NLTK releases look up for word_tokenize / pos_tag; the older names are
# still fetched so the notebook also runs on earlier NLTK versions.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    download(resource)

# Sample security log data (simulated AD/Sitecore log lines).
# One column, 'log_text', holding one raw log message per row.
data = dict(
    log_text=[
        "Unauthorized access attempt from IP 192.168.1.1 on Active Directory server",
        "Successful login for user admin on Sitecore portal",
        "Failed authentication from unknown IP 8.8.8.8 with multiple retries",
        "Normal traffic from internal network to firewall",
        "Suspicious packet drop in UDP connection from remote server",
        "User query on Sitecore CMS: search for sensitive documents",
        "Brute force attack detected on AD domain controller",
    ]
)
df = pd.DataFrame(data)

# Show the untouched input before any preprocessing is applied.
print("Raw Logs:", df, sep="\n")

# Function for tokenization
def tokenize_text(text):
    """Lowercase a log line, strip non-letters and split it into word tokens.

    Every character that is not an ASCII letter or whitespace is replaced by
    a space rather than deleted, so words joined only by punctuation or
    digits (e.g. "AD/Sitecore", "user=admin") stay separate tokens instead of
    fusing into "adsitecore" / "useradmin". Digits are dropped as well, so
    IP addresses and timestamps do not survive tokenization.

    Args:
        text: Raw log line (str).

    Returns:
        list: Tokens produced by ``word_tokenize`` on the cleaned text.
    """
    # Substitute a space (not '') so the removed character still acts as a word boundary.
    cleaned = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
    return word_tokenize(cleaned)

# Apply tokenization: one token list per log line, stored in df['tokens']
df['tokens'] = df['log_text'].apply(tokenize_text)
print("\nTokenized Logs:")
print(df[['log_text', 'tokens']])
# Explanation: Tokenization splits text into words, e.g.,
# "Unauthorized access" -> ['unauthorized', 'access']. Useful for parsing security logs.
# (Written as comments on purpose: a bare string literal on the last line of a
# cell is an expression, and Jupyter would echo it back as the cell's output.)

In [None]:
# Stopwords removal
# NLTK's English stopword list, held in a set for membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Filter every token list and keep the result in df['filtered_tokens']
df['filtered_tokens'] = df['tokens'].map(remove_stopwords)
print("Logs After Stopword Removal:", df[['log_text', 'filtered_tokens']], sep="\n")
# Explanation: Stopword removal discards common words such as "from" and "on" so that
# key terms like "unauthorized" and "failed" remain. In security logs this brings
# threat words such as "attack" to the foreground.

# Stemming (Porter Stemmer)
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Run ``stemmer.stem`` over every token and return the stems as a list."""
    return list(map(stemmer.stem, tokens))

# Stem the stopword-filtered tokens and keep the result in df['stemmed']
df['stemmed'] = df['filtered_tokens'].map(stem_tokens)
print("\nStemmed Logs:", df[['log_text', 'stemmed']], sep="\n")
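# Explanation: Stemming reduces words to roots, e.g., "attempts" -> "attempt". Good for matching inflected variants in logs (e.g., "failed" and "failing" both become "fail"). Note that Porter does not merge "failure" with them (it becomes "failur").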

# Lemmatization (more accurate than stemming)
lemmatizer = WordNetLemmatizer()

def _wordnet_pos(tag):
    """Map a Penn Treebank tag from ``pos_tag`` to a WordNet POS code ('n' by default)."""
    if tag.startswith('V'):
        return 'v'  # verbs: VB, VBD, VBG, ...
    if tag.startswith('J'):
        return 'a'  # adjectives: JJ, JJR, JJS
    if tag.startswith('RB'):
        return 'r'  # adverbs: RB, RBR, RBS ('RP' particles are deliberately excluded)
    return 'n'

def lemmatize_tokens(tokens):
    """Lemmatize tokens using their part of speech.

    The tokens are tagged with ``pos_tag`` and each one is lemmatized as a
    verb, adjective, adverb or (by default) noun. Adjectives and adverbs get
    their own WordNet POS instead of being treated as nouns.

    Args:
        tokens: List of word strings for a single log line.

    Returns:
        list: One lemma per input token, in the same order.
    """
    # POS tagging for better lemmatization
    tagged = pos_tag(tokens)
    return [lemmatizer.lemmatize(word, pos=_wordnet_pos(tag)) for word, tag in tagged]

# Lemmatize the stopword-filtered tokens and keep the result in df['lemmatized']
df['lemmatized'] = df['filtered_tokens'].map(lemmatize_tokens)
print("\nLemmatized Logs:", df[['log_text', 'lemmatized']], sep="\n")
# Explanation: Lemmatization maps words to their base forms, e.g., "accesses" -> "access".
# Better suited to domain-specific text like AD logs, where context matters.

In [None]:
# Vectorization with TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Prepare text (use lemmatized tokens joined back to strings)
df['processed_text'] = df['lemmatized'].apply(' '.join)

# TF-IDF Vectorizer (max_features=50 to keep simple)
vectorizer = TfidfVectorizer(max_features=50, stop_words='english')
X_tfidf = vectorizer.fit_transform(df['processed_text'])
feature_names = vectorizer.get_feature_names_out()
# get_feature_names_out() lists the vocabulary in alphabetical order, so this
# slice is simply the first 10 names, not the 10 highest-scoring terms.
print("TF-IDF Features (first 10, alphabetical):", feature_names[:10])
print("TF-IDF Matrix Shape:", X_tfidf.shape)
# Explanation: TF-IDF converts text to sparse matrix; high scores for rare terms like "brute" in logs. Useful for ML input in security analysis.

# Display TF-IDF scores for first log
# Row 0 of the matrix as a flat array, aligned index-for-index with feature_names.
first_log_scores = X_tfidf[0].toarray().flatten()

# Pair each term with its score, order by score (highest first), keep five.
scored_terms = list(zip(feature_names, first_log_scores))
scored_terms.sort(key=lambda pair: pair[1], reverse=True)
top_terms = scored_terms[:5]
print("\nTop Terms for First Log:", top_terms)

In [None]:
# Sentiment Analysis with VADER (rule-based lexicon tuned for social-media text, not a trained model)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
download('vader_lexicon')

sia = SentimentIntensityAnalyzer()
# Score the raw log line rather than 'processed_text': VADER's rules rely on
# negations ("not", "no"), punctuation and capitalization, all of which the
# earlier lowercasing / non-letter stripping / stopword removal take away.
df['sentiment'] = df['log_text'].apply(lambda x: sia.polarity_scores(x)['compound'])
print("Sentiment Scores:")
print(df[['log_text', 'sentiment']])
# Explanation: VADER scores text from -1 (negative) to 1 (positive). E.g.,
# "failed authentication" = negative, flagging suspicious AD logs.

# Topic Modeling with LDA (Gensim)
from gensim import corpora, models
from gensim.utils import simple_preprocess

# Prepare corpus from lemmatized tokens:
# one token list per log line -> id/term dictionary -> bag-of-words per document.
tokenized_docs = list(df['lemmatized'])
dictionary = corpora.Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# LDA model (num_topics=2 for simplicity)
# random_state fixes the model's random initialisation so that repeated runs
# of this cell give the same topics instead of a different split each time.
lda_model = models.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10, random_state=42)
topics = lda_model.print_topics(num_words=3)
print("LDA Topics:")
for topic in topics:
    print(topic)
# Explanation: LDA discovers topics (e.g., Topic 0: "access unauthorized ip" for threats).
# Useful for grouping Sitecore logs into "normal" vs "suspicious".

In [None]:
# Install Ollama client (run in terminal: pip install ollama)
import ollama

# Test Ollama with Mistral (or Llama2)
# A single user turn, sent as a one-element message list.
hello_messages = [{'role': 'user', 'content': 'Hello, what is NLP?'}]
response = ollama.chat(model='mistral', messages=hello_messages)
print("Basic Response:", response['message']['content'])
# Explanation: Ollama runs Mistral locally. 'chat' sends the prompt and
# 'messages' carries the conversation turns. Safe for domain-sensitive data (no cloud).

# Prompt Engineering Basics
# Three prompts of increasing specificity, sent one at a time.
prompts = [
    "What is NLP?",  # Basic prompt
    "Explain NLP in 3 sentences for a security analyst.",  # Specific role
    # Domain-specific
    "As a network security expert, explain how NLP can analyze intrusion logs in Active Directory.",
]

for i, prompt in enumerate(prompts, 1):
    response = ollama.chat(model='mistral', messages=[{'role': 'user', 'content': prompt}])
    print(f"\nPrompt {i}: {prompt}")
    # Show at most the first 200 characters; mark truncated replies with "...".
    reply_preview = response['message']['content']
    if len(reply_preview) > 200:
        reply_preview = reply_preview[:200] + "..."
    print("Response:", reply_preview)
# Explanation: Better prompts (specific, role-based) yield relevant outputs. For security,
# add "confidential" to prompts for ethical handling.

In [None]:
# Domain-Specific Prompt: Security Log Analysis
# The log line is embedded in a role-framed instruction and sent as one user turn.
log_example = "Failed login from IP 192.168.1.1 on AD server at 2025-09-18 10:00:00"
prompt = f"As a cybersecurity analyst, analyze this Active Directory log and suggest 3 mitigation steps: {log_example}"
analysis_messages = [{'role': 'user', 'content': prompt}]
response = ollama.chat(model='mistral', messages=analysis_messages)
print("Security Log Analysis:", response['message']['content'], sep="\n")
# Explanation: Prompt engineering tailors Gen AI for domain tasks like log triage in Sitecore/AD. Use 'system' role for consistent personas.

# Chain Prompts (Conversation)
# Turn 1: ask the opening question and keep the model's actual answer, so the
# assistant turn in the history really is a reply to the question above it.
messages = [
    {'role': 'system', 'content': 'You are a network security expert.'},
    {'role': 'user', 'content': 'What is a common threat in Sitecore deployments?'},
]
first_answer = ollama.chat(model='mistral', messages=messages)

# Turn 2: replay that answer as the assistant turn, then add a new user turn.
# The history must end on a user message for the model to have something to answer.
messages.append({'role': 'assistant', 'content': first_answer['message']['content']})
messages.append({'role': 'user', 'content': 'Suggest 2 mitigations for that threat.'})
chain_response = ollama.chat(model='mistral', messages=messages)
print("\nChained Prompt Response:")
print(chain_response['message']['content'])
# Explanation: Chain prompts for multi-turn interactions, e.g., refining threat analysis in security audits.

In [None]:
# Chaining Prompts for Multi-Step Task
# Example: Security Workflow - Log Analysis Chain
# Step 1 starts the history with a system persona and the first user request.
messages = [
    {'role': 'system', 'content': 'You are a Gen AI assistant for network security.'},
    {'role': 'user', 'content': 'Step 1: Analyze this Sitecore log: "Error in user authentication from IP 10.0.0.5".'},
]
response1 = ollama.chat(model='mistral', messages=messages)
print("Step 1 Response:", response1['message']['content'])

# Chain Step 2: feed the step-1 answer back as the assistant turn, then ask the follow-up.
messages.extend([
    {'role': 'assistant', 'content': response1['message']['content']},
    {'role': 'user', 'content': 'Step 2: Based on analysis, suggest 2 fixes.'},
])
response2 = ollama.chat(model='mistral', messages=messages)
print("\nStep 2 Response:", response2['message']['content'])
# Explanation: Chaining builds workflows, e.g., log triage in AD/Sitecore. Append to 'messages' for context.

# Multi-Step Chain for Security Report
# A system persona plus one user request that asks for the whole report at once.
full_chain = [
    {'role': 'system', 'content': 'You are a security analyst using Gen AI.'},
    {'role': 'user', 'content': 'Generate a threat report from this AD log: "Multiple failed logins from external IP". Include risk level and mitigation.'},
]
report_response = ollama.chat(model='mistral', messages=full_chain)
print("\nFull Chain Report:", report_response['message']['content'], sep="\n")
# Explanation: Single prompt for chained reasoning, useful for automated security reports.

In [None]:
# Simulate Fine-Tuning with Few-Shot Prompting (No Actual Training - Ollama Local)
# Worked examples inside the prompt steer the model's output for the domain
# (e.g., security classification); no model weights are changed.
few_shot_prompt = """
You are a fine-tuned Gen AI for classifying network threats. Examples:
Input: Failed login from IP 192.168.1.1
Output: Threat: Brute Force | Risk: Medium | Mitigation: Block IP

Input: Normal traffic on port 80
Output: Threat: None | Risk: Low | Mitigation: Monitor

Classify this: Suspicious UDP packets to Sitecore server.
"""
# The whole few-shot block is sent as a single user turn.
few_shot_messages = [{'role': 'user', 'content': few_shot_prompt}]
response_fs = ollama.chat(model='mistral', messages=few_shot_messages)
print("Few-Shot Classification:", response_fs['message']['content'], sep="\n")
# Explanation: Few-shot prompting simulates fine-tuning by providing examples. For local Ollama, this adapts Mistral to domain without training data/credits.

# Hands-On Extension: Create Your Own Few-Shot
# Add 2 more examples (e.g., AD log) to few_shot_prompt and re-run. Discuss: How does this "fine-tune" for Sitecore threats?