In [1]:
import numpy as np
import pandas as pd

In [46]:
data = {
    'age': ['<=30', '<=30', '31...40', '>40', '>40', '>40', '31...40','<=30', '<=30','>40' ,'<=30', '31...40', '31...40', '>40'],
    'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low','medium','medium','medium','high', 'medium'],
    'student': ['no', 'no', 'no','no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
    'credit_rating': ['fair', 'excellent', 'fair', 'fair','fair', 'excellent' ,'excellent', 'fair', 'fair','fair','excellent','excellent', 'fair','excellent'],
    'buys_computer': ['no', 'no', 'yes', 'yes', 'yes','no', 'yes','no', 'yes','yes', 'yes','yes', 'yes','no']
}         

df = pd.DataFrame(data)
df

Unnamed: 0,age,income,student,credit_rating,buys_computer
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,no
2,31...40,high,no,fair,yes
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes
5,>40,low,yes,excellent,no
6,31...40,low,yes,excellent,yes
7,<=30,medium,no,fair,no
8,<=30,low,yes,fair,yes
9,>40,medium,yes,fair,yes


# A1

In [47]:
print("Prior probabilties for each features w.r.t 'buys_computer'\n")
for column in df.columns:
    if column != 'buys_computer':  
        print(f"\nPrior Probabilities for {column.upper()}:")
        prior_probabilities = df.groupby(column)['buys_computer'].value_counts(normalize=True)
        print(prior_probabilities)
        print()


Prior probabilties for each features w.r.t 'buys_computer'


Prior Probabilities for AGE:
age      buys_computer
31...40  yes              1.0
<=30     no               0.6
         yes              0.4
>40      yes              0.6
         no               0.4
Name: buys_computer, dtype: float64


Prior Probabilities for INCOME:
income  buys_computer
high    no               0.500000
        yes              0.500000
low     yes              0.750000
        no               0.250000
medium  yes              0.666667
        no               0.333333
Name: buys_computer, dtype: float64


Prior Probabilities for STUDENT:
student  buys_computer
no       no               0.571429
         yes              0.428571
yes      yes              0.857143
         no               0.142857
Name: buys_computer, dtype: float64


Prior Probabilities for CREDIT_RATING:
credit_rating  buys_computer
excellent      no               0.50
               yes              0.50
fair           yes         

In [48]:
print("Prior probabilities for the 'buys_computer' classes\n")
total_instances = len(df)
buys_computer_counts = df['buys_computer'].value_counts()
prior_prob_yes = buys_computer_counts['yes'] / total_instances
prior_prob_no = buys_computer_counts['no'] / total_instances
print(f"Prior probability for buys_computer = 'yes': {prior_prob_yes}")
print(f"Prior probability for buys_computer = 'no': {prior_prob_no}")

Prior probabilities for the 'buys_computer' classes

Prior probability for buys_computer = 'yes': 0.6428571428571429
Prior probability for buys_computer = 'no': 0.35714285714285715


# A2

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame(data)

df_encoded = pd.get_dummies(df, columns=['age', 'income', 'student', 'credit_rating'])

X = df_encoded.drop('buys_computer', axis=1)
y = df_encoded['buys_computer']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = GaussianNB()
model.fit(X_train, y_train)
class_conditional_densities = model.theta_ 

print("Class Conditional Densities:")
print(class_conditional_densities)


Class Conditional Densities:
[[0.         0.5        0.5        0.25       0.25       0.5
  0.75       0.25       0.75       0.25      ]
 [0.42857143 0.28571429 0.28571429 0.28571429 0.42857143 0.28571429
  0.28571429 0.71428571 0.28571429 0.71428571]]


# A3

In [50]:
import pandas as pd
from scipy.stats import chi2_contingency

df = pd.DataFrame(data)

for feature in df.columns[:-1]:  
    for target_feature in df.columns[:-1]:
        if feature != target_feature:
            contingency_table = pd.crosstab(df[feature], df[target_feature])
            chi2, p, _, _ = chi2_contingency(contingency_table)

            alpha = 0.05
            print(f"\nChi-square test for independence between '{feature}' and '{target_feature}':")
            print(f"p-value: {p}")
            if p < alpha:
                print("dependent.")
            else:
                print("independent.")



Chi-square test for independence between 'age' and 'income':
p-value: 0.5049810026322079
independent.

Chi-square test for independence between 'age' and 'student':
p-value: 0.8187307530779818
independent.

Chi-square test for independence between 'age' and 'credit_rating':
p-value: 0.9433354498734922
independent.

Chi-square test for independence between 'income' and 'age':
p-value: 0.5049810026322077
independent.

Chi-square test for independence between 'income' and 'student':
p-value: 0.05881647164242991
independent.

Chi-square test for independence between 'income' and 'credit_rating':
p-value: 0.6944859597510076
independent.

Chi-square test for independence between 'student' and 'age':
p-value: 0.8187307530779818
independent.

Chi-square test for independence between 'student' and 'income':
p-value: 0.05881647164242988
independent.

Chi-square test for independence between 'student' and 'credit_rating':
p-value: 1.0
independent.

Chi-square test for independence between 'credi

# A4

In [51]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder as LE
 
label_encoder = LE()
dxl_encoded = df.copy()

for col in df.columns:
    if df[col].dtype == 'object':
        dxl_encoded[col] = label_encoder.fit_transform(df[col])
X = dxl_encoded.loc[:, 'age':'credit_rating']
y = dxl_encoded['buys_computer']  
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42) 
model = GaussianNB()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
 
print(X_test)
print("\nPredicted classes for test data:")
print(predicted)

    age  income  student  credit_rating
9     2       2        1              1
11    0       2        0              0
0     1       0        0              1

Predicted classes for test data:
[1 1 1]


# A5

In [52]:

data = pd.read_csv("NID_data.csv")
data

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25187,0,tcp,exec,RSTO,0,0,0,0,0,0,...,7,0.03,0.06,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
25188,0,tcp,ftp_data,SF,334,0,0,0,0,0,...,39,1.00,0.00,1.00,0.18,0.00,0.00,0.00,0.00,anomaly
25189,0,tcp,private,REJ,0,0,0,0,0,0,...,13,0.05,0.07,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
25190,0,tcp,nnsp,S0,0,0,0,0,0,0,...,20,0.08,0.06,0.00,0.00,1.00,1.00,0.00,0.00,anomaly


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import pandas as pd

data = data.dropna()
le_protocol = LabelEncoder()
le_service = LabelEncoder()
le_flag = LabelEncoder()
le_class = LabelEncoder()
data['protocol_type'] = le_protocol.fit_transform(data['protocol_type'])
data['service'] = le_service.fit_transform(data['service'])
data['flag'] = le_flag.fit_transform(data['flag'])
data['class'] = le_class.fit_transform(data['class'])
X = data.drop('class', axis=1)
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
 

Accuracy: 0.5572534232982734
Precision: 0.6647941945496836
Recall: 0.5572534232982734
F1 Score: 0.43747403728594875
