In [5]:
# Dark Web Content Analysis Framework - For Legitimate Research Purposes
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import networkx as nx
from sentence_transformers import SentenceTransformer
import joblib
import re
from datetime import datetime

# IMPORTANT: This notebook assumes you have legally obtained data
# through proper channels (law enforcement, academic research with IRB approval)

# 1. Load pre-collected dataset (this avoids direct crawling code)
def load_dataset(file_path):
    """Read the listings CSV at ``file_path`` into a DataFrame.

    The number of rows loaded is printed as a quick sanity check before
    the frame is returned. The file is expected to be a pre-existing,
    legally obtained dataset of listings.
    """
    # For demonstration purposes only
    listings = pd.read_csv(file_path)
    print(f"Loaded {len(listings)} listings for analysis")
    return listings

# 2. Text preprocessing function
def preprocess_text(text):
    """Normalize a raw text value for downstream vectorization.

    Any non-string value yields an empty string. Strings are lowercased,
    then stripped of characters that are neither word characters nor
    whitespace, then of digit runs, and finally whitespace runs are
    collapsed to a single space and the ends are trimmed.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.lower()
    # Order matters: punctuation first, digits second, whitespace last so
    # that gaps left by the earlier removals are collapsed too.
    substitutions = (
        (r'[^\w\s]', ''),
        (r'\d+', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# 3. Feature extraction
def extract_features(df, text_column):
    """Build TF-IDF features from one text column of ``df``.

    Side effect: the cleaned text is stored on ``df`` in a
    ``processed_text`` column.

    Returns a ``(matrix, vectorizer)`` pair: the TF-IDF matrix for the
    cleaned text and the fitted ``TfidfVectorizer`` that produced it.
    """
    # Clean every entry of the chosen column and keep the result on df
    cleaned = df[text_column].apply(preprocess_text)
    df['processed_text'] = cleaned

    # Vocabulary is capped at 1000 terms; English stop words are dropped
    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    matrix = tfidf.fit_transform(df['processed_text'])

    return matrix, tfidf

# 4. Train a classifier
def train_classifier(X, y):
    """Fit a random forest on an 80/20 split and print its test report.

    ``X`` and ``y`` are split with a fixed seed (20% held out), a
    100-tree ``RandomForestClassifier`` is fitted on the training part,
    and the classification report for the held-out part is printed.

    Returns the fitted model.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    features_train, features_test, labels_train, labels_test = split

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(features_train, labels_train)

    # Evaluate on the held-out portion only
    predictions = forest.predict(features_test)
    print(classification_report(labels_test, predictions))

    return forest

# 5. Vendor network analysis
def analyze_vendor_network(df, vendor_col='vendor_name'):
    """Build an undirected vendor graph linked by shared categories.

    Every distinct value in ``vendor_col`` becomes a node. Two vendors are
    joined by an edge when both appear under the same ``category`` value;
    the edge's ``weight`` counts how many categories the pair shares.

    Returns the resulting ``networkx.Graph``.
    """
    network = nx.Graph()
    network.add_nodes_from(df[vendor_col].unique())

    for category in df['category'].unique():
        in_category = df['category'] == category
        members = df[in_category][vendor_col].unique()
        # Pair each vendor with every vendor listed after it, so each
        # unordered pair is visited exactly once per category.
        for offset, first in enumerate(members, start=1):
            for second in members[offset:]:
                if not network.has_edge(first, second):
                    network.add_edge(first, second, weight=1)
                else:
                    network[first][second]['weight'] += 1

    return network

# 6. Find suspicious patterns (for legitimate analysis)
def identify_suspicious_patterns(df):
    """Flag large price jumps and unusually active vendors.

    Side effect: a ``price_diff`` column (row-to-row price change within
    each ``item_name`` group, in the frame's current row order) is added
    to ``df``.

    Returns a ``(rows, vendors)`` pair: the rows whose absolute price
    change exceeds half of the overall mean price, and a list of vendor
    names whose listing count is more than two standard deviations above
    the mean listing count.
    """
    # Sudden price changes, measured against half of the overall mean price
    df['price_diff'] = df.groupby('item_name')['price'].diff()
    jump_threshold = df['price'].mean() * 0.5
    price_jumps = df[df['price_diff'].abs() > jump_threshold]

    # Unusual posting frequency: count above mean + 2 standard deviations
    listings_per_vendor = df.groupby('vendor_name').size()
    frequency_cutoff = listings_per_vendor.mean() + 2*listings_per_vendor.std()
    heavy_posters = listings_per_vendor[listings_per_vendor > frequency_cutoff]

    return price_jumps, heavy_posters.index.tolist()

# Display results
def visualize_results(G, suspicious_vendors):
    """Visualize the vendor network with suspicious vendors highlighted.

    Args:
        G: networkx graph whose edges each carry a ``weight`` attribute;
            the weight is used directly as the drawn line width.
        suspicious_vendors: iterable of hashable node identifiers to
            highlight. Entries that are not nodes of ``G`` are ignored.

    All nodes are drawn small and blue, flagged nodes are drawn again
    larger and in red, and only nodes whose degree is strictly above the
    90th percentile of all degrees are labelled. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(12, 12))

    # Position nodes using force-directed layout (fixed seed for
    # reproducible placement)
    pos = nx.spring_layout(G, seed=42)

    # Draw regular nodes
    nx.draw_networkx_nodes(G, pos,
                           node_color='blue',
                           node_size=50,
                           alpha=0.6)

    # Highlight suspicious vendors; a set gives constant-time membership
    # checks instead of scanning the list once per node
    flagged = set(suspicious_vendors)
    suspicious_nodes = [n for n in G.nodes() if n in flagged]
    if suspicious_nodes:
        nx.draw_networkx_nodes(G, pos,
                               nodelist=suspicious_nodes,
                               node_color='red',
                               node_size=100,
                               alpha=0.8)

    # Draw edges with varying thickness based on weight
    edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
    nx.draw_networkx_edges(G, pos, width=edge_weights, alpha=0.3)

    # Label only high-degree nodes. The percentile is computed once, not
    # once per node, and is skipped entirely for an empty graph so that
    # np.percentile is never called on an empty sequence.
    degrees = dict(G.degree())
    if degrees:
        degree_cutoff = np.percentile(list(degrees.values()), 90)
        important_nodes = [n for n, deg in degrees.items() if deg > degree_cutoff]
    else:
        important_nodes = []
    nx.draw_networkx_labels(G, pos, {n: n for n in important_nodes}, font_size=8)

    plt.title("Vendor Network Analysis")
    plt.axis('off')
    plt.show()

# 7. Save results (omitting alert system for ethical reasons)
def save_analysis_results(model, vectorizer, G, suspicious_vendors, output_dir):
    """Save the analysis results under ``output_dir`` with a timestamp.

    Args:
        model: fitted classifier, pickled with joblib.
        vectorizer: fitted vectorizer, pickled with joblib.
        G: networkx graph, written as GraphML.
        suspicious_vendors: iterable of vendor identifiers, written one
            per line to a text file.
        output_dir: target directory; created if it does not exist.

    All four files share one ``YYYYmmdd_HHMMSS`` timestamp suffix taken
    from the local clock when the function is called.
    """
    import os  # local import so the block does not depend on file-level imports

    # joblib.dump and open() do not create missing directories, so make
    # sure the target exists before writing anything.
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save the model
    joblib.dump(model, f"{output_dir}/model_{timestamp}.pkl")
    joblib.dump(vectorizer, f"{output_dir}/vectorizer_{timestamp}.pkl")

    # Save the suspicious vendors list; explicit UTF-8 so non-ASCII vendor
    # names do not depend on the platform's default encoding
    with open(f"{output_dir}/suspicious_vendors_{timestamp}.txt", "w",
              encoding="utf-8") as f:
        f.writelines(f"{vendor}\n" for vendor in suspicious_vendors)

    # Save the network as GraphML for further analysis
    nx.write_graphml(G, f"{output_dir}/vendor_network_{timestamp}.graphml")

    print(f"Analysis results saved to {output_dir}")

# Main workflow (assumes ethical usage)
if __name__ == "__main__":
    # Placeholder path: replace with your legal dataset path.
    # The steps below read these CSV columns: 'description', 'category',
    # 'vendor_name', 'item_name' and 'price'.
    data_path = "your_legal_dataset.csv"
    output_dir = "./analysis_results"
    
    # Load data
    df = load_dataset(data_path)
    
    # Extract TF-IDF features from the 'description' column
    # (this also adds a 'processed_text' column to df)
    X, vectorizer = extract_features(df, 'description')
    
    # Train classifier to predict 'category' from the text features
    model = train_classifier(X, df['category'])
    
    # Analyze vendor network (vendors linked by shared categories)
    G = analyze_vendor_network(df)
    
    # Identify suspicious patterns; only the vendor list is used here, the
    # flagged price-jump rows are discarded (this also adds a 'price_diff'
    # column to df)
    _, suspicious_vendors = identify_suspicious_patterns(df)
    
    # Visualize results
    visualize_results(G, suspicious_vendors)
    
    # Save results
    save_analysis_results(model, vectorizer, G, suspicious_vendors, output_dir)

ModuleNotFoundError: No module named 'seaborn'

In [6]:
!pip install seaborn



In [1]:
# Dark Web Content Analysis Framework - For Legitimate Research Purposes
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import networkx as nx
from sentence_transformers import SentenceTransformer
import joblib
import re
from datetime import datetime

# IMPORTANT: This notebook assumes you have legally obtained data
# through proper channels (law enforcement, academic research with IRB approval)

# 1. Load pre-collected dataset (this avoids direct crawling code)
def load_dataset(file_path):
    """Read the listings CSV at ``file_path`` into a DataFrame.

    The number of rows loaded is printed as a quick sanity check before
    the frame is returned. The file is expected to be a pre-existing,
    legally obtained dataset of listings.
    """
    # For demonstration purposes only
    listings = pd.read_csv(file_path)
    print(f"Loaded {len(listings)} listings for analysis")
    return listings

# 2. Text preprocessing function
def preprocess_text(text):
    """Normalize a raw text value for downstream vectorization.

    Any non-string value yields an empty string. Strings are lowercased,
    then stripped of characters that are neither word characters nor
    whitespace, then of digit runs, and finally whitespace runs are
    collapsed to a single space and the ends are trimmed.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.lower()
    # Order matters: punctuation first, digits second, whitespace last so
    # that gaps left by the earlier removals are collapsed too.
    substitutions = (
        (r'[^\w\s]', ''),
        (r'\d+', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# 3. Feature extraction
def extract_features(df, text_column):
    """Build TF-IDF features from one text column of ``df``.

    Side effect: the cleaned text is stored on ``df`` in a
    ``processed_text`` column.

    Returns a ``(matrix, vectorizer)`` pair: the TF-IDF matrix for the
    cleaned text and the fitted ``TfidfVectorizer`` that produced it.
    """
    # Clean every entry of the chosen column and keep the result on df
    cleaned = df[text_column].apply(preprocess_text)
    df['processed_text'] = cleaned

    # Vocabulary is capped at 1000 terms; English stop words are dropped
    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    matrix = tfidf.fit_transform(df['processed_text'])

    return matrix, tfidf

# 4. Train a classifier
def train_classifier(X, y):
    """Fit a random forest on an 80/20 split and print its test report.

    ``X`` and ``y`` are split with a fixed seed (20% held out), a
    100-tree ``RandomForestClassifier`` is fitted on the training part,
    and the classification report for the held-out part is printed.

    Returns the fitted model.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    features_train, features_test, labels_train, labels_test = split

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(features_train, labels_train)

    # Evaluate on the held-out portion only
    predictions = forest.predict(features_test)
    print(classification_report(labels_test, predictions))

    return forest

# 5. Vendor network analysis
def analyze_vendor_network(df, vendor_col='vendor_name'):
    """Build an undirected vendor graph linked by shared categories.

    Every distinct value in ``vendor_col`` becomes a node. Two vendors are
    joined by an edge when both appear under the same ``category`` value;
    the edge's ``weight`` counts how many categories the pair shares.

    Returns the resulting ``networkx.Graph``.
    """
    network = nx.Graph()
    network.add_nodes_from(df[vendor_col].unique())

    for category in df['category'].unique():
        in_category = df['category'] == category
        members = df[in_category][vendor_col].unique()
        # Pair each vendor with every vendor listed after it, so each
        # unordered pair is visited exactly once per category.
        for offset, first in enumerate(members, start=1):
            for second in members[offset:]:
                if not network.has_edge(first, second):
                    network.add_edge(first, second, weight=1)
                else:
                    network[first][second]['weight'] += 1

    return network

# 6. Find suspicious patterns (for legitimate analysis)
def identify_suspicious_patterns(df):
    """Flag large price jumps and unusually active vendors.

    Side effect: a ``price_diff`` column (row-to-row price change within
    each ``item_name`` group, in the frame's current row order) is added
    to ``df``.

    Returns a ``(rows, vendors)`` pair: the rows whose absolute price
    change exceeds half of the overall mean price, and a list of vendor
    names whose listing count is more than two standard deviations above
    the mean listing count.
    """
    # Sudden price changes, measured against half of the overall mean price
    df['price_diff'] = df.groupby('item_name')['price'].diff()
    jump_threshold = df['price'].mean() * 0.5
    price_jumps = df[df['price_diff'].abs() > jump_threshold]

    # Unusual posting frequency: count above mean + 2 standard deviations
    listings_per_vendor = df.groupby('vendor_name').size()
    frequency_cutoff = listings_per_vendor.mean() + 2*listings_per_vendor.std()
    heavy_posters = listings_per_vendor[listings_per_vendor > frequency_cutoff]

    return price_jumps, heavy_posters.index.tolist()

# Display results
def visualize_results(G, suspicious_vendors):
    """Visualize the vendor network with suspicious vendors highlighted.

    Args:
        G: networkx graph whose edges each carry a ``weight`` attribute;
            the weight is used directly as the drawn line width.
        suspicious_vendors: iterable of hashable node identifiers to
            highlight. Entries that are not nodes of ``G`` are ignored.

    All nodes are drawn small and blue, flagged nodes are drawn again
    larger and in red, and only nodes whose degree is strictly above the
    90th percentile of all degrees are labelled. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(12, 12))

    # Position nodes using force-directed layout (fixed seed for
    # reproducible placement)
    pos = nx.spring_layout(G, seed=42)

    # Draw regular nodes
    nx.draw_networkx_nodes(G, pos,
                           node_color='blue',
                           node_size=50,
                           alpha=0.6)

    # Highlight suspicious vendors; a set gives constant-time membership
    # checks instead of scanning the list once per node
    flagged = set(suspicious_vendors)
    suspicious_nodes = [n for n in G.nodes() if n in flagged]
    if suspicious_nodes:
        nx.draw_networkx_nodes(G, pos,
                               nodelist=suspicious_nodes,
                               node_color='red',
                               node_size=100,
                               alpha=0.8)

    # Draw edges with varying thickness based on weight
    edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
    nx.draw_networkx_edges(G, pos, width=edge_weights, alpha=0.3)

    # Label only high-degree nodes. The percentile is computed once, not
    # once per node, and is skipped entirely for an empty graph so that
    # np.percentile is never called on an empty sequence.
    degrees = dict(G.degree())
    if degrees:
        degree_cutoff = np.percentile(list(degrees.values()), 90)
        important_nodes = [n for n, deg in degrees.items() if deg > degree_cutoff]
    else:
        important_nodes = []
    nx.draw_networkx_labels(G, pos, {n: n for n in important_nodes}, font_size=8)

    plt.title("Vendor Network Analysis")
    plt.axis('off')
    plt.show()

# 7. Save results (omitting alert system for ethical reasons)
def save_analysis_results(model, vectorizer, G, suspicious_vendors, output_dir):
    """Save the analysis results under ``output_dir`` with a timestamp.

    Args:
        model: fitted classifier, pickled with joblib.
        vectorizer: fitted vectorizer, pickled with joblib.
        G: networkx graph, written as GraphML.
        suspicious_vendors: iterable of vendor identifiers, written one
            per line to a text file.
        output_dir: target directory; created if it does not exist.

    All four files share one ``YYYYmmdd_HHMMSS`` timestamp suffix taken
    from the local clock when the function is called.
    """
    import os  # local import so the block does not depend on file-level imports

    # joblib.dump and open() do not create missing directories, so make
    # sure the target exists before writing anything.
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save the model
    joblib.dump(model, f"{output_dir}/model_{timestamp}.pkl")
    joblib.dump(vectorizer, f"{output_dir}/vectorizer_{timestamp}.pkl")

    # Save the suspicious vendors list; explicit UTF-8 so non-ASCII vendor
    # names do not depend on the platform's default encoding
    with open(f"{output_dir}/suspicious_vendors_{timestamp}.txt", "w",
              encoding="utf-8") as f:
        f.writelines(f"{vendor}\n" for vendor in suspicious_vendors)

    # Save the network as GraphML for further analysis
    nx.write_graphml(G, f"{output_dir}/vendor_network_{timestamp}.graphml")

    print(f"Analysis results saved to {output_dir}")

# Main workflow (assumes ethical usage)
if __name__ == "__main__":
    # Placeholder path: replace with your legal dataset path.
    # The steps below read these CSV columns: 'description', 'category',
    # 'vendor_name', 'item_name' and 'price'.
    data_path = "your_legal_dataset.csv"
    output_dir = "./analysis_results"
    
    # Load data
    df = load_dataset(data_path)
    
    # Extract TF-IDF features from the 'description' column
    # (this also adds a 'processed_text' column to df)
    X, vectorizer = extract_features(df, 'description')
    
    # Train classifier to predict 'category' from the text features
    model = train_classifier(X, df['category'])
    
    # Analyze vendor network (vendors linked by shared categories)
    G = analyze_vendor_network(df)
    
    # Identify suspicious patterns; only the vendor list is used here, the
    # flagged price-jump rows are discarded (this also adds a 'price_diff'
    # column to df)
    _, suspicious_vendors = identify_suspicious_patterns(df)
    
    # Visualize results
    visualize_results(G, suspicious_vendors)
    
    # Save results
    save_analysis_results(model, vectorizer, G, suspicious_vendors, output_dir)

ModuleNotFoundError: No module named 'seaborn'