In [None]:
import pandas as pd


In [None]:
# Load the dataset CSV into `df`. The path is relative, so the file must be
# present in the notebook's working directory.
df = pd.read_csv("Wednesday-workingHours.pcap_ISCX.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Wednesday-workingHours.pcap_ISCX.csv'

In [None]:
# Preview the first rows of the loaded frame.
df.head()


In [None]:
# List the column names as read from the CSV (surrounding whitespace in the
# names is only stripped in a later cell).
df.columns


In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Print a concise structural summary of the frame.
df.info()

In [None]:
# Descriptive statistics for the frame's columns.
df.describe()

In [None]:
# Count missing (null) values per column.
df.isnull().sum()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Visual overview of missing data: every entry of `df` is drawn as one
# heatmap cell coloured by whether it is null; the colour bar is suppressed.
null_mask = df.isnull()
ax = sns.heatmap(null_mask, cbar=False)
ax.set_title("Missing Values Heatmap")
plt.show()


In [None]:
# Report how many rows are flagged as duplicates of an earlier row.
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

In [None]:
# Remove duplicated rows, keeping the first occurrence of each. `df` is
# rebound to the de-duplicated frame instead of being mutated in place.
df = df.drop_duplicates()

In [None]:
# Number of distinct values in each column. A column whose count is 1 is
# constant and carries no information.
print(df.nunique())

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Dropping irrelevant features
# Column names are stripped of surrounding whitespace before dropping by name.
df.columns = df.columns.str.strip()
irrelevant_cols = [
    'Flow Bytes/s',
    'Fwd Avg Bytes/Bulk',
    'Fwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate',
    'Bwd Avg Bytes/Bulk',
    'Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate',
]
# Raises KeyError if any listed column is absent (e.g. when this cell is
# re-run after the columns were already removed).
df = df.drop(irrelevant_cols, axis="columns")

In [None]:
# (rows, columns) after the column drop above.
df.shape

In [None]:
# Saving Cleaned Data
# Writes the frame as it stands here, without the index column.
# NOTE(review): the inf/NaN handling happens in later cells, so this file
# still contains any inf/NaN values present in the data - confirm that is intended.
df.to_csv("Wednesday_cleaned.csv", index=False)

In [None]:
# Correlation
import seaborn as sns
import matplotlib.pyplot as plt

# Only int64/float64 columns take part in the correlation matrix.
numeric_df = df.select_dtypes(include=['int64', 'float64'])

# Pairwise correlation of the numeric columns, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Correlation of every int64/float64 column with 'Flow Duration', sorted from
# most positive to most negative. 'Flow Duration' itself is included.
numeric_df = df.select_dtypes(include=['int64', 'float64'])
corr_table = numeric_df.corr()
target_corr = corr_table['Flow Duration'].sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 6))
target_corr.plot.bar(color='skyblue', ax=ax)
ax.set_title("Correlation with Flow Duration")
ax.set_ylabel("Correlation Coefficient")
plt.show()

In [None]:
# Box plot of 'Flow Duration' to eyeball the spread and extreme values.
ax = sns.boxplot(data=df, x='Flow Duration')
ax.set_title("Outlier Detection - Flow Duration")
plt.show()

In [None]:
from scipy import stats
import numpy as np

# Z-score outlier screen over the numeric columns.
# A single +/-inf makes a column's mean/std non-finite, which turns every
# z-score of that column into NaN and silently removes it from the count.
# Infinities are therefore treated as missing and skipped when the per-column
# mean/std are computed (nan_policy='omit').
finite_numeric = numeric_df.replace([np.inf, -np.inf], np.nan)
z_scores = np.abs(
    stats.zscore(finite_numeric.to_numpy(dtype=float), nan_policy='omit')
)

# NaN z-scores compare False against the threshold, so missing cells are
# never reported as outliers.
outliers = np.where(z_scores > 3)

# This counts individual (row, column) cells beyond 3 standard deviations,
# not distinct rows.
print("Outliers:", len(outliers[0]))


In [None]:
# Partition the column names by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

for heading, names in (
    ("Categorical:", categorical_features),
    ("Numerical:", numerical_features),
):
    print(heading, names)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct 'Label' value to an integer code. The fitted encoder is
# kept in `le` so codes can be translated back with le.inverse_transform.
le = LabelEncoder().fit(df['Label'])
df['Label'] = le.transform(df['Label'])
print(df['Label'].unique())

In [None]:
# Replace inf/-inf with NaN
df[numerical_features] = df[numerical_features].replace([np.inf, -np.inf], np.nan)

# Drop every row that has a NaN in any numeric feature.
# Assigning `df[cols].dropna()` back into `df[cols]` does NOT remove rows:
# the assignment aligns on the index, so the dropped rows are reinserted with
# NaN in every numeric column. Rebinding `df` to the filtered frame is what
# actually removes them.
df = df.dropna(subset=list(numerical_features))

In [None]:
# Drop rows with a NaN in any numeric feature. `df` is rebound to the filtered
# frame because assigning `df[cols].dropna()` back into `df[cols]` realigns on
# the index and leaves the rows in place.
df = df.dropna(subset=list(numerical_features))

# One histogram per numeric feature. Non-finite values are filtered per column
# as well, so the plots do not depend on the clean-up above having run.
for col in numerical_features:
    col_data = df[col].replace([np.inf, -np.inf], np.nan).dropna()
    plt.figure(figsize=(8,4))
    plt.hist(col_data, bins=30)
    plt.title(f"Histogram - {col}")
    plt.show()

In [None]:
# Hand-picked subset of columns of interest, ending with the target 'Label'.
# 'Flow Bytes/s' is intentionally not listed: that column is removed from `df`
# by the "Dropping irrelevant features" cell, so selecting it would raise a
# KeyError.
important_features = [
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Fwd Packet Length Mean',
    'Bwd Packet Length Mean',
    'Fwd IAT Mean',
    'Bwd IAT Mean',
    'Label'
]


In [None]:
#WEEK 3
# === Feature Engineering and Selection ===

# Correlation threshold method: look only at the strict upper triangle of the
# absolute correlation matrix so each feature pair is examined once, and mark
# a column for removal when it correlates above 0.9 with any earlier column.
corr_matrix = df[numerical_features].corr().abs()
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

exceeds_threshold = (upper > 0.9).any()
to_drop = upper.columns[exceeds_threshold.to_numpy()].tolist()
print("Highly correlated features to drop:", to_drop)

df_reduced = df.drop(columns=to_drop)

# Feature importance using Random Forest
# NOTE: X is every remaining column except 'Label'. Later cells add
# 'Anomaly_IF', 'Cluster' and 'Prediction_RF' to `df`, so re-running this cell
# after them would feed those columns into X.
from sklearn.ensemble import RandomForestClassifier
y = df_reduced['Label']
X = df_reduced.drop(columns='Label')

rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X, y)

importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10,6))
importances.head(15).plot(kind='bar')
plt.title("Top 15 Important Features")
plt.show()


In [None]:
#WEEK 4
# === Supervised Model Training ===
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 70/30 hold-out split with a fixed seed so the reports are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest
# Tree ensembles are insensitive to feature scale, so the raw features are used.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Report:\n", classification_report(y_test, y_pred_rf))

# SVM
# The RBF kernel is distance based, so features are standardised first. The
# scaler lives inside the pipeline and is fit on the training split only,
# which keeps test-set statistics out of training.
svm = make_pipeline(StandardScaler(), SVC(kernel='rbf', gamma='scale'))
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM Report:\n", classification_report(y_test, y_pred_svm))

# Logistic Regression
# Standardising also helps the solver converge within max_iter.
lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("Logistic Regression Report:\n", classification_report(y_test, y_pred_lr))


In [None]:
#WEEK 5
# === Anomaly Detection ===
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Isolation Forest
# contamination=0.02 tells the model to flag about 2% of rows as anomalies.
# fit_predict returns -1 for anomalies and 1 for inliers.
iso_forest = IsolationForest(contamination=0.02, random_state=42)
anomalies = iso_forest.fit_predict(X)
df['Anomaly_IF'] = anomalies
print(df['Anomaly_IF'].value_counts())

# K-Means clustering
# K-Means uses Euclidean distance, so features are standardised first;
# otherwise the largest-magnitude columns decide the clusters on their own.
# n_init is set explicitly because its default differs between scikit-learn
# releases. `kmeans` is fit on the scaled matrix, so any later
# kmeans.predict(...) call must be given scaled input as well.
X_scaled = StandardScaler().fit_transform(X)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
df['Cluster'] = clusters
# The scatter shows the first two columns of X in their original units.
sns.scatterplot(x=X.iloc[:,0], y=X.iloc[:,1], hue=df['Cluster'], palette='viridis')
plt.title("K-Means Clustering")
plt.show()


In [None]:
# WEEK 6
from sklearn.model_selection import GridSearchCV

# Small grid over forest size and tree depth, scored by 3-fold CV accuracy on
# the training split.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20, None]
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy')
grid.fit(X_train, y_train)
print("Best Parameters:", grid.best_params_)
print("Best Score:", grid.best_score_)

# ROC Curve
# roc_curve only accepts a binary target; passing a multiclass y_test raises
# ValueError. Each class is therefore scored one-vs-rest against its own
# probability column. With exactly two classes only the second (positive)
# class is drawn, which reproduces a single standard ROC curve.
# NOTE: the curves use `rf` from the training cell, not grid.best_estimator_.
from sklearn.metrics import roc_curve, roc_auc_score
class_probs = rf.predict_proba(X_test)
is_binary = len(rf.classes_) == 2
for col_idx, cls in enumerate(rf.classes_):
    if is_binary and col_idx == 0:
        continue
    y_true_bin = (y_test == cls)
    if y_true_bin.nunique() < 2:
        # ROC is undefined unless the test split has both positives and negatives.
        continue
    y_prob = class_probs[:, col_idx]
    fpr, tpr, _ = roc_curve(y_true_bin, y_prob)
    auc = roc_auc_score(y_true_bin, y_prob)
    plt.plot(fpr, tpr, label=f"class {cls} (AUC = {auc:.3f})")
plt.title("ROC Curve - Random Forest")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()


In [None]:
#WEEK 7
# === Alert Generation ===
import datetime

# Predict a label for every row of X with the trained random forest.
df['Prediction_RF'] = rf.predict(X)

# An "alert" here is any row whose predicted label differs from its true label.
# .copy() makes `alerts` an independent frame, so adding the Timestamp column
# does not write into a slice of `df` (avoids SettingWithCopyWarning).
alerts = df[df['Prediction_RF'] != df['Label']].copy()
# All alerts share the time at which this cell ran, not a per-flow capture time.
alerts['Timestamp'] = datetime.datetime.now()

alerts[['Timestamp', 'Flow Duration', 'Label', 'Prediction_RF']].to_csv('alerts_log.csv', index=False)
print(f"Alerts generated: {len(alerts)}")
print("Saved as alerts_log.csv")
