In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Load and preprocess packet data
df = pd.read_csv('packet_data.csv')  # Replace with actual dataset path
# Normalize the payload text: lowercase it, then collapse every run of
# non-word characters into a single space. Missing payloads (which pandas
# reads as NaN) become empty strings and non-string cells are stringified
# first, so a blank or numeric cell no longer raises AttributeError on
# `.lower()`.
df['clean_content'] = (
    df['packet_content']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace(r'\W+', ' ', regex=True)
)

# Model stages, in execution order:
#   tfidf      - turns cleaned text into TF-IDF vectors (max_features=1000)
#   svd        - reduces those vectors to 100 components
#   classifier - random forest with 100 trees, fitted on the reduced vectors
# The SVD and the forest share a fixed seed so repeated runs are reproducible.
model_steps = [
    ('tfidf', TfidfVectorizer(max_features=1000)),
    ('svd', TruncatedSVD(n_components=100, random_state=42)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
]
pipeline = Pipeline(steps=model_steps)

# Train-test split: hold out 30% of the rows for evaluation. The fixed
# random_state keeps the partition identical between runs.
# (train_test_split comes from sklearn.model_selection.)
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_content'],
    df['label'],
    test_size=0.3,
    random_state=42,
)

# Train on the training split, then score the held-out split.
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

# Each metric is computed and printed in turn under its own heading.
print("DPI Model Performance:")
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, predictions))

# Visualize keyword importance
# The classifier is fitted on the SVD output, so `feature_importances_` holds
# one value per SVD component, not one per TF-IDF term. Indexing the
# vocabulary with component indices would label the bars with unrelated
# terms. Instead, spread each component's importance over the terms in
# proportion to the magnitude of their loadings in `svd.components_`
# (shape: n_components x n_terms). This yields an approximate per-term score,
# not an exact attribution.
tfidf = pipeline.named_steps['tfidf']
svd = pipeline.named_steps['svd']
feature_names = tfidf.get_feature_names_out()
importances = pipeline.named_steps['classifier'].feature_importances_
term_importances = importances @ np.abs(svd.components_)

# argsort is ascending, so the last ten entries are the highest-scoring terms
# and the largest bar ends up at the top of the horizontal chart.
indices = np.argsort(term_importances)[-10:]
positions = range(len(indices))
plt.figure(figsize=(10, 6))
plt.barh(positions, term_importances[indices], align='center')
plt.yticks(positions, [feature_names[i] for i in indices])
plt.xlabel('Feature Importance')
plt.title('Top 10 Important Features in DPI')
plt.show()