# Exploration

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [None]:
# Show the raw feature-description file shipped with the dataset.
with open('Data/kddcup.names.txt', 'r') as file:
    file_content = file.read()

# The handle is already closed here; only the text that was read is printed.
print(file_content)

In [None]:
# Column names of the dataset, written as one comma-separated block of text.
cols="""duration,
protocol_type,
service,
flag,
src_bytes,
dst_bytes,
land,
wrong_fragment,
urgent,
hot,
num_failed_logins,
logged_in,
num_compromised,
root_shell,
su_attempted,
num_root,
num_file_creations,
num_shells,
num_access_files,
num_outbound_cmds,
is_host_login,
is_guest_login,
count,
srv_count,
serror_rate,
srv_serror_rate,
rerror_rate,
srv_rerror_rate,
same_srv_rate,
diff_srv_rate,
srv_diff_host_rate,
dst_host_count,
dst_host_srv_count,
dst_host_same_srv_rate,
dst_host_diff_srv_rate,
dst_host_same_src_port_rate,
dst_host_srv_diff_host_rate,
dst_host_serror_rate,
dst_host_srv_serror_rate,
dst_host_rerror_rate,
dst_host_srv_rerror_rate"""

# Split on commas, trim the surrounding whitespace/newlines and skip empty pieces.
columns = [piece.strip() for piece in cols.split(',') if piece.strip()]

# The label column is not part of the text block above, so it is appended last.
columns.append('target')
print(columns)

In [None]:
# Show the raw attack-name / attack-category listing shipped with the dataset.
with open("Data/training_attack_types.txt",'r') as f:
    attack_listing = f.read()
print(attack_listing)

In [None]:
# Lookup table: specific label name -> coarse category.
# The values used are 'normal', 'dos', 'u2r', 'r2l' and 'probe'.
attacks_types = {
'normal': 'normal',
'back': 'dos',
'buffer_overflow': 'u2r',
'ftp_write': 'r2l',
'guess_passwd': 'r2l',
'imap': 'r2l',
'ipsweep': 'probe',
'land': 'dos',
'loadmodule': 'u2r',
'multihop': 'r2l',
'neptune': 'dos',
'nmap': 'probe',
'perl': 'u2r',
'phf': 'r2l',
'pod': 'dos',
'portsweep': 'probe',
'rootkit': 'u2r',
'satan': 'probe',
'smurf': 'dos',
'spy': 'r2l',
'teardrop': 'dos',
'warezclient': 'r2l',
'warezmaster': 'r2l',
}

# Building the dataframe

In [None]:
# Load the records; the column headers are supplied through `columns`.
df = pd.read_csv("Data/kddcup.data.corrected.csv", names=columns)

# Adding Attack Type column: each raw label, minus its last character, is looked
# up in attacks_types (presumably the labels end with a '.' - confirm against the
# data). An unknown label raises KeyError.
df['Attack Type'] = [attacks_types[label[:-1]] for label in df['target']]
df.head()

In [None]:
df.shape

In [None]:
df['target'].value_counts()

In [None]:
df['Attack Type'].value_counts()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.describe(include='object')

# Pre-processing

In [None]:
df.isnull().sum()

In [None]:
# Number of duplicate rows
df.duplicated().sum()

In [None]:
# Keep a single copy of every fully identical row.
df = df.drop_duplicates()
df

In [None]:
df.shape

In [None]:
# Finding categorical features
# Numeric (and boolean) columns are selected through the public dtype API
# instead of the private DataFrame._get_numeric_data().
num_cols = df.select_dtypes(include=['number', 'bool']).columns

# Walk the columns in frame order so the result is the same on every run
# (a set difference yields an arbitrary order), and leave out the two label
# columns, which are targets rather than features.
label_cols = ('target', 'Attack Type')
cate_cols = [name for name in df.columns
             if name not in num_cols and name not in label_cols]

cate_cols

In [None]:
# Define a function to detect outliers using standard deviation
def detect_outliers_std(data, threshold=3):
    """Return the values of ``data`` that lie far from their column mean.

    A value is an outlier when the absolute distance to its column mean,
    divided by the column's standard deviation, is greater than ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan. Only its numeric columns are examined; other columns
        (e.g. strings) are skipped because a mean/std is meaningless for them.
    threshold : float, default 3
        Number of standard deviations beyond which a value is flagged.

    Returns
    -------
    pandas.DataFrame
        One column per numeric column of ``data`` and one row for every row of
        ``data`` that holds at least one outlier. Outlier cells keep their
        value, every other cell is NaN. The original index labels are kept.
    """
    # Work on the argument, not on a module-level frame.
    numeric = data.select_dtypes(include=[np.number])

    # Column-wise z-scores. A constant column gives 0/0 = NaN, which never
    # compares greater than the threshold, so it produces no outliers.
    z_scores = (numeric - numeric.mean()) / numeric.std()
    is_outlier = z_scores.abs() > threshold

    # Masking the whole frame at once keeps the outliers of every column;
    # assigning column by column into an empty frame would align each later
    # column to the first column's outlier rows and lose the rest.
    return numeric.where(is_outlier).dropna(how='all')

# Flag every value lying more than 3 standard deviations from its column mean.
outliers_std = detect_outliers_std(df, threshold=3)

# Heading first, then the flagged values on the following lines.
print("Outliers using Standard Deviation Method:", outliers_std, sep="\n")

# Visualisation

In [None]:
%matplotlib inline
sns.set(style="whitegrid")

# One bar per protocol type; the bar height is the number of records.
ax = sns.countplot(data=df, x="protocol_type")

# With x="protocol_type" the categories are on the x-axis and the counts on
# the y-axis, so the labels follow that orientation.
plt.xlabel("Protocol Type")
plt.ylabel("Count")
plt.title("Distribution of Protocol Types")

# Write each bar's share of all records (a percentage, not the raw count)
# just above the bar.
total = len(df["protocol_type"])
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y), ha='center', va='bottom')

In [None]:
# Wide, short figure: there is one bar per distinct service value.
plt.figure(figsize=(15, 3))
sns.set(style="whitegrid")

# One bar per service; the bar height is the number of records.
ax = sns.countplot(data=df, x="service")

# Categories are on the x-axis and counts on the y-axis.
plt.xlabel("Service")
plt.ylabel("Count")
plt.title("Distribution of Services")
plt.xticks(rotation=90)

# Write each bar's share of all records above it, but only when the share
# (rounded to one decimal) exceeds 0.5 %, to keep the figure readable.
total = len(df["service"])
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    if float(percentage[:-1]) > 0.5:
        ax.annotate(percentage, (x, y), ha='center', va='bottom')

In [None]:
sns.set(style="whitegrid")

# One bar per flag value; the bar height is the number of records.
ax = sns.countplot(data=df, x="flag")

# Categories are on the x-axis and counts on the y-axis.
plt.xlabel("Flag")
plt.ylabel("Count")
plt.title("Distribution of Flags")
plt.xticks(rotation=90)

# Write each bar's share of all records above it. The denominator is taken
# from the plotted column itself rather than from an unrelated one.
total = len(df["flag"])
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y), ha='center', va='bottom')

In [None]:
sns.set(style="whitegrid")

# One bar per logged_in value, drawn with the viridis palette.
ax = sns.countplot(data=df, x="logged_in", palette="viridis")

plt.title("Distribution of logged_in")
plt.xlabel("Logged In")
plt.ylabel("Count")

# Write each bar's share of all records (as a percentage) just above the bar.
total = len(df["logged_in"])
for bar in ax.patches:
    height = bar.get_height()
    center = bar.get_x() + bar.get_width() / 2
    share = '{:.1f}%'.format(100 * height / total)
    ax.annotate(share, (center, height), ha='center', va='bottom')

In [None]:
sns.set(style="whitegrid")

# One bar per raw target label, drawn with the viridis palette.
ax = sns.countplot(data=df, x="target", palette="viridis")

plt.title("Distribution of target")
plt.xlabel("target")
plt.ylabel("Count")
plt.xticks(rotation=90)

# Write each bar's share of all records above it, but only when the share
# (rounded to one decimal) exceeds 0.3 %.
total = len(df["target"])
for bar in ax.patches:
    height = bar.get_height()
    share = '{:.1f}%'.format(100 * height / total)
    if float(share[:-1]) > 0.3:
        center = bar.get_x() + bar.get_width() / 2
        ax.annotate(share, (center, height), ha='center', va='bottom')

In [None]:
sns.set(style="whitegrid")

# One bar per attack category, drawn with the viridis palette.
ax = sns.countplot(data=df, x="Attack Type", palette="viridis")

plt.title("Distribution of Attack Type")
plt.xlabel("Attack Type")
plt.ylabel("Count")
plt.xticks(rotation=90)

# Write each bar's share of all records (as a percentage) just above the bar.
total = len(df["Attack Type"])
for bar in ax.patches:
    height = bar.get_height()
    center = bar.get_x() + bar.get_width() / 2
    share = '{:.1f}%'.format(100 * height / total)
    ax.annotate(share, (center, height), ha='center', va='bottom')

In [None]:
df['target'].value_counts()

In [None]:
df['Attack Type'].value_counts()

In [None]:
df.columns

# DATA CORRELATION

In [None]:
# Names of all object-dtype columns of the frame, shown as the cell output.
categorical_columns = df.select_dtypes(include=['object']).columns
categorical_columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode protocol_type into a sibling "<name>_encoded" column.
# The fitted encoder stays in le_protocol so the codes can be listed later.
col = 'protocol_type'
le_protocol = LabelEncoder()
df[f'{col}_encoded'] = le_protocol.fit_transform(df[col])
df

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode service into a sibling "<name>_encoded" column.
# The fitted encoder stays in le_service so the codes can be listed later.
col = 'service'
le_service = LabelEncoder()
df[f'{col}_encoded'] = le_service.fit_transform(df[col])
df

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode flag into a sibling "<name>_encoded" column.
# The fitted encoder stays in le_flag so the codes can be listed later.
col = 'flag'
le_flag = LabelEncoder()
df[f'{col}_encoded'] = le_flag.fit_transform(df[col])
df

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode target into a sibling "<name>_encoded" column.
# The fitted encoder stays in le_target so the codes can be listed later.
col = 'target'
le_target = LabelEncoder()
df[f'{col}_encoded'] = le_target.fit_transform(df[col])
df

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode Attack Type into a sibling "<name>_encoded" column.
# The fitted encoder stays in le_Attack_type so the codes can be listed later.
col = 'Attack Type'
le_Attack_type = LabelEncoder()
df[f'{col}_encoded'] = le_Attack_type.fit_transform(df[col])
df

In [None]:
# Remove every object-dtype column from the frame; the "_encoded" columns
# added earlier are numeric and therefore stay.
categorical_columns = list(df.select_dtypes(include=['object']).columns)
df = df.drop(columns=categorical_columns)
df

In [None]:
# Drop every column that contains a NaN.
df = df.dropna(axis='columns')

# Keep only the columns that take more than one distinct value.
varying_columns = [name for name in df.columns if df[name].nunique() > 1]
df = df[varying_columns]

# Pairwise correlation of the remaining columns, drawn as a heatmap.
corr = df.corr()

plt.figure(figsize=(15,12))
sns.heatmap(corr)

In [None]:
# correlation coefficient between 'num_root'  and the 'num_compromised' 
df['num_root'].corr(df['num_compromised'])

In [None]:
# correlation coefficient between 'srv_serror_rate' and 'serror_rate' 
df['srv_serror_rate'].corr(df['serror_rate'])

In [None]:
# Correlation of every unordered pair of columns, keyed "left---right" where
# `left` comes before `right` in the frame.
correlation_dict = {}

column_names = list(df.columns)
for position, left in enumerate(column_names):
    # Only the columns to the right of `left`: each pair is visited once.
    for right in column_names[position + 1:]:
        correlation_dict[f'{left}---{right}'] = df[left].corr(df[right])

# Same pairs, ordered from the highest correlation value to the lowest.
sorted_correlation_dict = dict(
    sorted(correlation_dict.items(), key=lambda pair: pair[1], reverse=True)
)
sorted_correlation_dict

In [None]:
# These variables are highly correlated with another feature and should be
# ignored for analysis. Each entry carries the correlation value and partner
# feature that were noted for it.
redundant_features = [
    'num_root',                  # (Correlation = 0.9938277978738366 with num_compromised)
    'srv_serror_rate',           # (Correlation = 0.9983615072725952 with serror_rate)
    'srv_rerror_rate',           # (Correlation = 0.9947309539817937 with rerror_rate)
    'dst_host_srv_serror_rate',  # (Correlation = 0.9993041091850098 with srv_serror_rate)
    'dst_host_serror_rate',      # (Correlation = 0.9869947924956001 with rerror_rate)
    'dst_host_rerror_rate',      # (Correlation = 0.9821663427308375 with srv_rerror_rate)
    'dst_host_srv_rerror_rate',  # (Correlation = 0.9851995540751249 with rerror_rate)
    'dst_host_same_srv_rate',    # (Correlation = 0.9736854572953938 with dst_host_srv_count)
]

# Remove all of them from the frame in place.
df.drop(redundant_features, axis=1, inplace=True)

In [None]:
df

In [None]:
df.shape

In [None]:
# Standard deviation of every column, listed from the smallest to the largest.
df_std = df.std().sort_values(ascending=True)
df_std

# Feature Mapping

In [None]:
df.columns

In [None]:
# How often each protocol code occurs, followed by the code -> label table
# read from the fitted encoder (a label's position in classes_ is its code).
print(df['protocol_type_encoded'].value_counts())
print("\n")
protocol_labels = le_protocol.classes_
for code in range(len(protocol_labels)):
    print(f"Integer code: {code}, Corresponding class: {protocol_labels[code]}")

In [None]:
# How often each service code occurs, followed by the code -> label table
# read from the fitted encoder (a label's position in classes_ is its code).
print(df['service_encoded'].value_counts())
print("\n")
service_labels = le_service.classes_
for code in range(len(service_labels)):
    print(f"Integer code: {code}, Corresponding class: {service_labels[code]}")

In [None]:
# How often each flag code occurs, followed by the code -> label table
# read from the fitted encoder (a label's position in classes_ is its code).
print(df['flag_encoded'].value_counts())
print("\n")
flag_labels = le_flag.classes_
for code in range(len(flag_labels)):
    print(f"Integer code: {code}, Corresponding class: {flag_labels[code]}")

In [None]:
# How often each target code occurs, followed by the code -> label table
# read from the fitted encoder (a label's position in classes_ is its code).
print(df['target_encoded'].value_counts())
print("\n")
target_labels = le_target.classes_
for code in range(len(target_labels)):
    print(f"Integer code: {code}, Corresponding class: {target_labels[code]}")

In [None]:
# How often each attack-category code occurs, followed by the code -> label
# table read from the fitted encoder (a label's position in classes_ is its code).
print(df['Attack Type_encoded'].value_counts())
print("\n")
attack_type_labels = le_Attack_type.classes_
for code in range(len(attack_type_labels)):
    print(f"Integer code: {code}, Corresponding class: {attack_type_labels[code]}")

# Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [None]:
df

In [None]:
# Remove the encoded raw label; the next cell uses 'Attack Type_encoded' as
# the target and everything else in the frame as features.
df = df.drop(columns=['target_encoded'])
df.shape

In [None]:
# Target: the encoded attack category, kept as a one-column frame.
# Features: every other column.
target_column = 'Attack Type_encoded'
y = df[[target_column]]
X = df.drop(columns=[target_column])

# Hold out 20 % of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

#### GAUSSIAN NAIVE BAYES

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the training split; the one-column target frame
# is flattened to a 1-D array first. fit() returns the estimator itself.
model_GNB = GaussianNB().fit(X_train, y_train.values.ravel())

# Predictions for the held-out rows.
y_test_pred__GNB = model_GNB.predict(X_test)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

# Text report comparing the Gaussian NB predictions with the test labels.
report_GNB = classification_report(y_test, y_test_pred__GNB)
print(report_GNB)

# Confusion matrix for the same predictions, shown as the cell output.
cm1 = confusion_matrix(y_test, y_test_pred__GNB)
cm1

In [None]:
# Draw cm1 (the Gaussian NB confusion matrix) as a figure.
disp = ConfusionMatrixDisplay(confusion_matrix=cm1)
disp.plot()

In [None]:
from sklearn.model_selection import learning_curve

# 20 training-set fractions from 10 % to 100 %, each scored with 5-fold CV.
size_fractions = np.linspace(0.1, 1.0, 20)
N, train_score, val_score = learning_curve(
    model_GNB, X_train, y_train.values.ravel(), train_sizes=size_fractions, cv=5
)

print(N)
# One curve per score set, averaged over the folds.
for curve_label, fold_scores in (('train', train_score), ('validation', val_score)):
    plt.plot(N, fold_scores.mean(axis=1), label=curve_label)
plt.xlabel('train_sizes')
plt.legend()

#### SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Linear SVM (hinge loss) trained with stochastic gradient descent.
# SGD is sensitive to feature scale, and the frame mixes columns such as
# src_bytes with *_rate columns (presumably on very different scales - confirm
# with df.describe()). The scaler is part of the pipeline so that it is fitted
# on the training rows only and re-applied automatically in predict() and in
# any cross-validation that receives this estimator.
sgd_classifier = make_pipeline(
    MinMaxScaler(),
    SGDClassifier(loss='hinge', alpha=0.0001, max_iter=1200000, random_state=42),
)

sgd_classifier.fit(X_train, y_train.values.ravel())

# Make predictions on the test set
y_test_pred_SGD = sgd_classifier.predict(X_test)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

# Text report comparing the SGD predictions with the test labels.
report_SGD = classification_report(y_test, y_test_pred_SGD)
print(report_SGD)

# Confusion matrix for the same predictions, shown as the cell output.
cm4 = confusion_matrix(y_test, y_test_pred_SGD)
cm4

In [None]:
# Draw cm4 (the SGD classifier confusion matrix) as a figure.
disp = ConfusionMatrixDisplay(confusion_matrix=cm4)
disp.plot()

In [None]:
from sklearn.model_selection import learning_curve

# 20 training-set fractions from 10 % to 100 %, each scored with 5-fold CV.
size_fractions = np.linspace(0.1, 1.0, 20)
N, train_score, val_score = learning_curve(
    sgd_classifier, X_train, y_train.values.ravel(), train_sizes=size_fractions, cv=5
)

print(N)
# One curve per score set, averaged over the folds.
for curve_label, fold_scores in (('train', train_score), ('validation', val_score)):
    plt.plot(N, fold_scores.mean(axis=1), label=curve_label)
plt.xlabel('train_sizes')
plt.legend()

#### LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# One-vs-rest logistic regression.
# The features are rescaled to [0, 1] first: the solver converges poorly on
# columns with very different magnitudes (which is presumably why max_iter was
# set so high). Keeping the scaler inside the pipeline means it is fitted on
# the training rows only and re-applied automatically in predict().
# NOTE(review): the `multi_class` argument is deprecated in recent scikit-learn
# releases - confirm against the installed version before upgrading.
model_LR = make_pipeline(
    MinMaxScaler(),
    LogisticRegression(max_iter=1200000, multi_class='ovr'),
)

model_LR.fit(X_train, y_train.values.ravel())

# Predictions for the held-out rows.
y_test_pred_LR = model_LR.predict(X_test)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

# Text report comparing the logistic-regression predictions with the test labels.
report_LR = classification_report(y_test, y_test_pred_LR)
print(report_LR)

# Confusion matrix for the same predictions, shown as the cell output.
cm2 = confusion_matrix(y_test, y_test_pred_LR)
cm2

In [None]:
# Draw cm2 (the logistic-regression confusion matrix) as a figure.
disp = ConfusionMatrixDisplay(confusion_matrix=cm2)
disp.plot()

#### DECISION TREE

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Shallow tree (at most 4 levels) split on information gain.
# random_state is fixed, like for the train/test split and the SGD model, so
# that a fresh "Restart & Run All" rebuilds exactly the same tree.
model_DT = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=42)

model_DT.fit(X_train, y_train.values.ravel())

# Predictions for the held-out rows.
y_test_pred_DT = model_DT.predict(X_test)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

# Text report comparing the decision-tree predictions with the test labels.
report_DT = classification_report(y_test, y_test_pred_DT)
print(report_DT)

# Confusion matrix for the same predictions, shown as the cell output.
cm3 = confusion_matrix(y_test, y_test_pred_DT)
cm3

In [None]:
# Draw cm3 (the decision-tree confusion matrix) as a figure.
disp = ConfusionMatrixDisplay(confusion_matrix=cm3)
disp.plot()

In [None]:
from sklearn.model_selection import learning_curve

# 20 training-set fractions from 10 % to 100 %, each scored with 5-fold CV.
# The target is flattened to a 1-D array, exactly as in the fit() call and in
# the other learning-curve cells, instead of passing the one-column frame.
N, train_score, val_score = learning_curve(model_DT, X_train, y_train.values.ravel(),
                                           train_sizes=np.linspace(0.1, 1.0, 20), cv=5)

print(N)
# Scores are averaged over the folds for every training-set size.
plt.plot(N, train_score.mean(axis=1), label='train')
plt.plot(N, val_score.mean(axis=1), label='validation')
plt.xlabel('train_sizes')
plt.ylabel('score')
plt.legend()