# A1: Spam Filter Using Naive Bayes Algorithm

In [1]:
#import required packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#import dataset
spam_df = pd.read_csv('spam.csv')

In [11]:
#lets look our data
spam_df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
#inspect the data
spam_df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [4]:
#convert spam/ham into numerical data, creating new column called 'spam'
spam_df['spam'] = spam_df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

In [5]:
#create train test split
x_train, x_test, y_train, y_test = train_test_split(spam_df.Message, spam_df.spam, test_size=0.25)

In [6]:
#find word coun and store data as matrix
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

In [7]:
#train model
model = MultinomialNB()
model.fit(x_train_count, y_train)

In [8]:
#pre-test ham
email_ham = ['baseball ticket later']
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([0])

In [9]:
#pre test spam
email_spam = ['reward money click']
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([1])

In [10]:
#test_model
x_test_count = cv.transform(x_test)
model.score(x_test_count, y_test)

0.9863603732950467

# A2: Classify DDos Attack with AI

In [13]:
#import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

In [14]:
#Read dataset
data = pd.read_csv('Ddos_dataset.csv')

In [15]:
#check the dataset
data.head()

Unnamed: 0,dt,switch,src,dst,pktcount,bytecount,dur,dur_nsec,tot_dur,flows,...,pktrate,Pairflow,Protocol,port_no,tx_bytes,rx_bytes,tx_kbps,rx_kbps,tot_kbps,label
0,11425,1,10.0.0.1,10.0.0.8,45304,48294064,100,716000000,101000000000.0,3,...,451,0,UDP,3,143928631,3917,0,0.0,0.0,0
1,11605,1,10.0.0.1,10.0.0.8,126395,134737070,280,734000000,281000000000.0,2,...,451,0,UDP,4,3842,3520,0,0.0,0.0,0
2,11425,1,10.0.0.2,10.0.0.8,90333,96294978,200,744000000,201000000000.0,3,...,451,0,UDP,1,3795,1242,0,0.0,0.0,0
3,11425,1,10.0.0.2,10.0.0.8,90333,96294978,200,744000000,201000000000.0,3,...,451,0,UDP,2,3688,1492,0,0.0,0.0,0
4,11425,1,10.0.0.2,10.0.0.8,90333,96294978,200,744000000,201000000000.0,3,...,451,0,UDP,3,3413,3665,0,0.0,0.0,0


In [16]:
#check the shape of the dataset
data.shape

(104345, 23)

In [17]:
#check the information of the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104345 entries, 0 to 104344
Data columns (total 23 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   dt           104345 non-null  int64  
 1   switch       104345 non-null  int64  
 2   src          104345 non-null  object 
 3   dst          104345 non-null  object 
 4   pktcount     104345 non-null  int64  
 5   bytecount    104345 non-null  int64  
 6   dur          104345 non-null  int64  
 7   dur_nsec     104345 non-null  int64  
 8   tot_dur      104345 non-null  float64
 9   flows        104345 non-null  int64  
 10  packetins    104345 non-null  int64  
 11  pktperflow   104345 non-null  int64  
 12  byteperflow  104345 non-null  int64  
 13  pktrate      104345 non-null  int64  
 14  Pairflow     104345 non-null  int64  
 15  Protocol     104345 non-null  object 
 16  port_no      104345 non-null  int64  
 17  tx_bytes     104345 non-null  int64  
 18  rx_bytes     104345 non-

In [18]:
#we see that the label contains boolean value: 0-bening, 1-maliciuous
data.label.unique()

array([0, 1])

In [19]:
#count the total value of 0 and 1
data.label.value_counts()

0    63561
1    40784
Name: label, dtype: int64

In [21]:
#check the null values in data
data.isnull().sum()

dt               0
switch           0
src              0
dst              0
pktcount         0
bytecount        0
dur              0
dur_nsec         0
tot_dur          0
flows            0
packetins        0
pktperflow       0
byteperflow      0
pktrate          0
Pairflow         0
Protocol         0
port_no          0
tx_bytes         0
rx_bytes         0
tx_kbps          0
rx_kbps        506
tot_kbps       506
label            0
dtype: int64

# A3: Split sample data into training & testing sets.

In [22]:
#import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split

In [23]:
#read the dataset
datasets = pd.read_csv('DataSplit.csv')

In [24]:
#check the data set using head() function
datasets.head()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


iloc[] ▶ Purely integer-location based indexing for selection by position.

In [25]:
#get the location
x = datasets.iloc[:, :-1]
y = datasets.iloc[:, :-1]

In [26]:
#split tha datasets using train_test_split fucntion
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=0)

In [27]:
#check the size of train dataset
xtrain = x_train.shape
ytrain = y_train.shape
print(f'Dataset size of x_test: {xtrain}')
print(f'Dataset size of y_test: {ytrain}')

Dataset size of x_test: (393, 7)
Dataset size of y_test: (393, 7)


In [28]:
#check the test datasets shape
xtest = x_test.shape
ytest = y_test.shape
print(f'Dataset size of x_test: {xtest}')
print(f'Dataset size of y_test: {ytest}')

Dataset size of x_test: (21, 7)
Dataset size of y_test: (21, 7)


In [29]:
#check the _x_train dataset
x_train

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
37,38,2013.167,12.0,1360.13900,1,24.95204,121.54842
334,335,2012.917,30.0,1013.34100,5,24.99006,121.53460
54,55,2013.083,16.1,289.32480,5,24.98203,121.54348
145,146,2012.917,2.1,451.24380,5,24.97563,121.54694
284,285,2012.917,15.0,383.28050,7,24.96735,121.54464
...,...,...,...,...,...,...,...
323,324,2013.417,28.6,197.13380,6,24.97631,121.54436
192,193,2013.167,43.8,57.58945,7,24.96750,121.54069
117,118,2013.000,13.6,4197.34900,0,24.93885,121.50383
47,48,2013.583,35.9,640.73910,3,24.97563,121.53715


In [30]:
#check the y_train dataset
y_train

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
37,38,2013.167,12.0,1360.13900,1,24.95204,121.54842
334,335,2012.917,30.0,1013.34100,5,24.99006,121.53460
54,55,2013.083,16.1,289.32480,5,24.98203,121.54348
145,146,2012.917,2.1,451.24380,5,24.97563,121.54694
284,285,2012.917,15.0,383.28050,7,24.96735,121.54464
...,...,...,...,...,...,...,...
323,324,2013.417,28.6,197.13380,6,24.97631,121.54436
192,193,2013.167,43.8,57.58945,7,24.96750,121.54069
117,118,2013.000,13.6,4197.34900,0,24.93885,121.50383
47,48,2013.583,35.9,640.73910,3,24.97563,121.53715


In [31]:
#check the x_test dataset
x_test

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
356,357,2012.833,10.3,211.4473,1,24.97417,121.52999
170,171,2013.333,24.0,4527.687,0,24.94741,121.49628
224,225,2013.333,34.5,324.9419,6,24.97814,121.5417
331,332,2013.333,25.6,4519.69,0,24.94826,121.49587
306,307,2013.5,14.4,169.9803,1,24.97369,121.52979
325,326,2013.083,36.6,488.8193,8,24.97015,121.54494
150,151,2013.25,35.8,170.7311,7,24.96719,121.54269
10,11,2013.083,34.8,405.2134,1,24.97349,121.53372
21,22,2013.417,10.5,279.1726,7,24.97528,121.54541
268,269,2013.417,17.2,390.5684,5,24.97937,121.54245


In [32]:
#check the y_test Datasets
y_test

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
356,357,2012.833,10.3,211.4473,1,24.97417,121.52999
170,171,2013.333,24.0,4527.687,0,24.94741,121.49628
224,225,2013.333,34.5,324.9419,6,24.97814,121.5417
331,332,2013.333,25.6,4519.69,0,24.94826,121.49587
306,307,2013.5,14.4,169.9803,1,24.97369,121.52979
325,326,2013.083,36.6,488.8193,8,24.97015,121.54494
150,151,2013.25,35.8,170.7311,7,24.96719,121.54269
10,11,2013.083,34.8,405.2134,1,24.97349,121.53372
21,22,2013.417,10.5,279.1726,7,24.97528,121.54541
268,269,2013.417,17.2,390.5684,5,24.97937,121.54245


# A4: Perform Feature Engineering Operation on Raw Data

In [33]:
#import dependencies
import numpy as np
import pandas as pd

In [34]:
#create dataframes
data={
  'candy variety':['chocolate hearts','sour jelly','candy canes','sour jelly','fruit drops'],
	'Date and Time':['09-02-2020 14:05','24-10-2020 18:00','18-12-2020 20:13','25-10-2020 10:00','18-10-2020 15:46'],
	'Day':['sunday','saturday','friday','sunday','sunday'],
	'Length':[3,3.5,3.5,3.5,3],
	'Breadth':[2,2,2.5,2,3],
	'Price':[7.5,7.6,8,7.6,9]
  }
df=pd.DataFrame(data)
df.head()

Unnamed: 0,candy variety,Date and Time,Day,Length,Breadth,Price
0,chocolate hearts,09-02-2020 14:05,sunday,3.0,2.0,7.5
1,sour jelly,24-10-2020 18:00,saturday,3.5,2.0,7.6
2,candy canes,18-12-2020 20:13,friday,3.5,2.5,8.0
3,sour jelly,25-10-2020 10:00,sunday,3.5,2.0,7.6
4,fruit drops,18-10-2020 15:46,sunday,3.0,3.0,9.0


In [35]:
#change the format of 'Date and Time'
df['Date and Time']=pd.to_datetime(df['Date and Time'],format="%d-%m-%Y %H:%M")
print(df)

      candy variety       Date and Time       Day  Length  Breadth  Price
0  chocolate hearts 2020-02-09 14:05:00    sunday     3.0      2.0    7.5
1        sour jelly 2020-10-24 18:00:00  saturday     3.5      2.0    7.6
2       candy canes 2020-12-18 20:13:00    friday     3.5      2.5    8.0
3        sour jelly 2020-10-25 10:00:00    sunday     3.5      2.0    7.6
4       fruit drops 2020-10-18 15:46:00    sunday     3.0      3.0    9.0


In [36]:
# creating new feature Date from existing feature Date and Time #
df['Date']=df['Date and Time'].dt.date
print(df[['candy variety','Date']])


      candy variety        Date
0  chocolate hearts  2020-02-09
1        sour jelly  2020-10-24
2       candy canes  2020-12-18
3        sour jelly  2020-10-25
4       fruit drops  2020-10-18


In [37]:
# creating weekend from days
df['weekend']=np.where(df['Day'].isin(['saturday','sunday']),1,0)
print(df[['candy variety','Date','weekend']])

      candy variety        Date  weekend
0  chocolate hearts  2020-02-09        1
1        sour jelly  2020-10-24        1
2       candy canes  2020-12-18        0
3        sour jelly  2020-10-25        1
4       fruit drops  2020-10-18        1


In [38]:
#create a new data set
data={
  'candy variety':['chocolate hearts','sour jelly','candy canes','sour jelly','fruit drops'],
	'Date and Time':['09-02-2020 14:05','24-10-2020 18:00','18-12-2020 20:13','25-10-2020 10:00','18-10-2020 15:46'],
	'Day':['sunday','saturday','friday','sunday','sunday'],
	'Length':[3,3.5,3.5,3.5,3],
	'Breadth':[2,2,2.5,2,3],
	'Price':[7.5,7.6,8,7.6,9]
  }
df=pd.DataFrame(data)
df.head()

Unnamed: 0,candy variety,Date and Time,Day,Length,Breadth,Price
0,chocolate hearts,09-02-2020 14:05,sunday,3.0,2.0,7.5
1,sour jelly,24-10-2020 18:00,saturday,3.5,2.0,7.6
2,candy canes,18-12-2020 20:13,friday,3.5,2.5,8.0
3,sour jelly,25-10-2020 10:00,sunday,3.5,2.0,7.6
4,fruit drops,18-10-2020 15:46,sunday,3.0,3.0,9.0


In [39]:
#Appending row with missing values
df['Date and Time']=pd.to_datetime(df['Date and Time'],format="%d-%m-%Y %H:%M")
df.loc[len(df.index)]=[np.NaN,'22-10-2020 17:24','thursday',3.5,2,np.NaN]
print(df)

      candy variety        Date and Time       Day  Length  Breadth  Price
0  chocolate hearts  2020-02-09 14:05:00    sunday     3.0      2.0    7.5
1        sour jelly  2020-10-24 18:00:00  saturday     3.5      2.0    7.6
2       candy canes  2020-12-18 20:13:00    friday     3.5      2.5    8.0
3        sour jelly  2020-10-25 10:00:00    sunday     3.5      2.0    7.6
4       fruit drops  2020-10-18 15:46:00    sunday     3.0      3.0    9.0
5               NaN     22-10-2020 17:24  thursday     3.5      2.0    NaN


In [40]:
# Imputation
df['candy variety']=df['candy variety'].fillna(df['candy variety'].mode()[0])
df['Price']=df['Price'].fillna(df['Price'].mean())
print(df)

      candy variety        Date and Time       Day  Length  Breadth  Price
0  chocolate hearts  2020-02-09 14:05:00    sunday     3.0      2.0   7.50
1        sour jelly  2020-10-24 18:00:00  saturday     3.5      2.0   7.60
2       candy canes  2020-12-18 20:13:00    friday     3.5      2.5   8.00
3        sour jelly  2020-10-25 10:00:00    sunday     3.5      2.0   7.60
4       fruit drops  2020-10-18 15:46:00    sunday     3.0      3.0   9.00
5        sour jelly     22-10-2020 17:24  thursday     3.5      2.0   7.94


In [41]:
# Discretization
df['Type of Day']=np.where(df['Day'].isin(['saturday','sunday']),'weekend','weekday')
df[['candy variety','Day','Type of Day']]
print(df)

      candy variety        Date and Time       Day  Length  Breadth  Price  \
0  chocolate hearts  2020-02-09 14:05:00    sunday     3.0      2.0   7.50   
1        sour jelly  2020-10-24 18:00:00  saturday     3.5      2.0   7.60   
2       candy canes  2020-12-18 20:13:00    friday     3.5      2.5   8.00   
3        sour jelly  2020-10-25 10:00:00    sunday     3.5      2.0   7.60   
4       fruit drops  2020-10-18 15:46:00    sunday     3.0      3.0   9.00   
5        sour jelly     22-10-2020 17:24  thursday     3.5      2.0   7.94   

  Type of Day  
0     weekend  
1     weekend  
2     weekday  
3     weekend  
4     weekend  
5     weekday  


In [42]:
#Categorical Encoding
for x in df['Type of Day'].unique():df[x]=np.where(df['Type of Day']==x,1,0)
print(df[['candy variety','Day','Type of Day','weekend','weekday']])

      candy variety       Day Type of Day  weekend  weekday
0  chocolate hearts    sunday     weekend        1        0
1        sour jelly  saturday     weekend        1        0
2       candy canes    friday     weekday        0        1
3        sour jelly    sunday     weekend        1        0
4       fruit drops    sunday     weekend        1        0
5        sour jelly  thursday     weekday        0        1


In [43]:
# Feature Splitting
df['Date and Time']=pd.to_datetime(df['Date and Time'])
df['Date']=df['Date and Time'].dt.date
print(df[['candy variety','Date']])

      candy variety        Date
0  chocolate hearts  2020-02-09
1        sour jelly  2020-10-24
2       candy canes  2020-12-18
3        sour jelly  2020-10-25
4       fruit drops  2020-10-18
5        sour jelly  2020-10-22
