In [1]:
# Importing necessary libraries 
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Importing the data

malware = pd.read_csv("/Users/varunbhavnani/Documents/Applied ML/Project/Malware.csv")
defacement = pd.read_csv("/Users/varunbhavnani/Documents/Applied ML/Project/Defacement.csv")
phishing = pd.read_csv("/Users/varunbhavnani/Documents/Applied ML/Project/Phishing.csv")
spam = pd.read_csv("/Users/varunbhavnani/Documents/Applied ML/Project/Spam.csv")

# Part 1: Classification with Statistical Analysis

## 1.1. Preparing the data and performing statistical analysis

## A: Malware data

In [3]:
# NumberRate_Extension & Entropy_DirectoryName have NAs
# NumberRate_Extension has almost 60% of NA values so dropping it


malware = malware.drop(['NumberRate_Extension'], axis=1)

In [4]:
# Variables that need data imputation: Entropy_Filename, Entropy_Extension, Entropy_DirectoryName,
# NumberRate_FileName, NumberRate_DirectoryName 

malware['Entropy_DirectoryName'] = malware['Entropy_DirectoryName'].fillna(malware['Entropy_DirectoryName'].mean())
malware['Entropy_Filename'] = malware['Entropy_Filename'].fillna(malware['Entropy_Filename'].mean())
malware['Entropy_Extension'] = malware['Entropy_Extension'].fillna(malware['Entropy_Extension'].mean())
malware['NumberRate_FileName'] = malware['NumberRate_FileName'].fillna(malware['NumberRate_FileName'].mean())
malware['NumberRate_DirectoryName'] = malware['NumberRate_DirectoryName'].fillna(malware['NumberRate_DirectoryName'].mean())

In [5]:
malware['URL_Type_obf_Type'].replace(('malware', 'benign'), (1, 0), inplace=True)

In [6]:
malware.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14493 entries, 0 to 14492
Data columns (total 79 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Querylength                      14493 non-null  int64  
 1   domain_token_count               14493 non-null  int64  
 2   path_token_count                 14493 non-null  int64  
 3   avgdomaintokenlen                14493 non-null  float64
 4   longdomaintokenlen               14493 non-null  int64  
 5   avgpathtokenlen                  14482 non-null  float64
 6   tld                              14493 non-null  int64  
 7   charcompvowels                   14493 non-null  int64  
 8   charcompace                      14493 non-null  int64  
 9   ldl_url                          14493 non-null  int64  
 10  ldl_domain                       14493 non-null  int64  
 11  ldl_path                         14493 non-null  int64  
 12  ldl_filename      

In [7]:
# Multicollinearity check for each data set

# MC check for malware' explanatory variables

malware_y = malware[['URL_Type_obf_Type']]
malware_x = malware.iloc[:,:-1]

In [8]:
#Check the correlation between explanatory variables

x_corr = malware_x.corr()

x_corr

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_Directoryname,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath
Querylength,1.000000,0.116022,0.160133,-0.059424,-0.060921,-0.010820,0.116022,0.187863,0.266185,0.423439,...,-0.130282,0.612749,0.550338,0.632128,0.069114,-0.158945,0.069561,0.068770,0.179828,0.645537
domain_token_count,0.116022,1.000000,-0.151595,-0.251992,-0.039480,0.014425,1.000000,-0.178005,-0.098224,0.174948,...,0.222992,0.025927,-0.029497,-0.002208,0.095619,-0.428294,0.057390,0.087252,0.209351,0.170239
path_token_count,0.160133,-0.151595,1.000000,-0.083326,-0.118449,-0.161022,-0.151595,0.661092,0.593048,-0.052403,...,-0.014375,0.250395,0.299477,0.317373,-0.601107,0.068051,-0.242359,-0.305373,-0.211897,0.104896
avgdomaintokenlen,-0.059424,-0.251992,-0.083326,1.000000,0.929811,0.002440,-0.251992,-0.067938,-0.044913,-0.085710,...,-0.074280,0.000497,0.017159,-0.008737,-0.218024,-0.135809,0.029855,0.052798,0.033693,-0.103419
longdomaintokenlen,-0.060921,-0.039480,-0.118449,0.929811,1.000000,0.023866,-0.039480,-0.101404,-0.058332,-0.060716,...,-0.011622,-0.018956,-0.012579,-0.032725,-0.202657,-0.234330,0.032826,0.068247,0.070540,-0.089213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Entropy_Domain,-0.158945,-0.428294,0.068051,-0.135809,-0.234330,0.020866,-0.428294,0.119444,0.026652,-0.125894,...,0.042547,-0.135747,-0.095295,-0.117803,0.003348,1.000000,0.019373,-0.000334,-0.200300,-0.229231
Entropy_DirectoryName,0.069561,0.057390,-0.242359,0.029855,0.032826,0.098296,0.057390,-0.047097,-0.000402,0.053244,...,0.421847,0.193892,0.152930,0.075967,0.033088,0.019373,1.000000,0.966241,0.669481,0.064801
Entropy_Filename,0.068770,0.087252,-0.305373,0.052798,0.068247,0.068032,0.087252,-0.119360,-0.070030,0.033581,...,0.449825,0.192385,0.145440,0.067882,0.094997,-0.000334,0.966241,1.000000,0.720239,0.096336
Entropy_Extension,0.179828,0.209351,-0.211897,0.033693,0.070540,0.036295,0.209351,-0.247614,-0.160699,0.145047,...,0.264925,0.362284,0.319388,0.206697,0.157058,-0.200300,0.669481,0.720239,1.000000,0.246248


In [9]:
upper_tri = x_corr.where(np.triu(np.ones(x_corr.shape),k=1).astype(np.bool))
print(upper_tri)

                       Querylength  domain_token_count  path_token_count  \
Querylength                    NaN            0.116022          0.160133   
domain_token_count             NaN                 NaN         -0.151595   
path_token_count               NaN                 NaN               NaN   
avgdomaintokenlen              NaN                 NaN               NaN   
longdomaintokenlen             NaN                 NaN               NaN   
...                            ...                 ...               ...   
Entropy_Domain                 NaN                 NaN               NaN   
Entropy_DirectoryName          NaN                 NaN               NaN   
Entropy_Filename               NaN                 NaN               NaN   
Entropy_Extension              NaN                 NaN               NaN   
Entropy_Afterpath              NaN                 NaN               NaN   

                       avgdomaintokenlen  longdomaintokenlen  avgpathtokenlen  \
Queryl

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = x_corr.where(np.triu(np.ones(x_corr.shape),k=1).astype(np.bool))


In [10]:
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)]
print(); print(to_drop)


['longdomaintokenlen', 'tld', 'charcompace', 'ldl_path', 'dld_url', 'dld_path', 'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength', 'subDirLen', 'ArgLen', 'pathurlRatio', 'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio', 'argPathRatio', 'LongestVariableValue', 'URL_Letter_Count', 'host_letter_count', 'Query_LetterCount', 'LongestPathTokenLength', 'Domain_LongestWordLength', 'sub-Directory_LongestWordLength', 'URLQueries_variable', 'delimeter_path', 'delimeter_Count', 'NumberRate_URL', 'NumberRate_Domain', 'NumberRate_AfterPath', 'SymbolCount_URL', 'SymbolCount_Domain', 'SymbolCount_Directoryname', 'SymbolCount_FileName', 'SymbolCount_Extension', 'SymbolCount_Afterpath', 'Entropy_DirectoryName', 'Entropy_Filename', 'Entropy_Afterpath']


In [11]:
X_new = malware_x.drop(malware_x[to_drop], axis=1)

X_new['avgpathtokenlen'] = X_new['avgpathtokenlen'].fillna(X_new['avgpathtokenlen'].mean())

In [12]:
malware = malware.drop(malware[to_drop], axis=1)

In [13]:
malware['avgpathtokenlen'] = malware['avgpathtokenlen'].fillna(malware['avgpathtokenlen'].mean())

In [14]:
malware.rename(columns={'this.fileExtLen': 'this_fileExtLen'}, inplace=True)

In [15]:
malware.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14493 entries, 0 to 14492
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Querylength                  14493 non-null  int64  
 1   domain_token_count           14493 non-null  int64  
 2   path_token_count             14493 non-null  int64  
 3   avgdomaintokenlen            14493 non-null  float64
 4   avgpathtokenlen              14493 non-null  float64
 5   charcompvowels               14493 non-null  int64  
 6   ldl_url                      14493 non-null  int64  
 7   ldl_domain                   14493 non-null  int64  
 8   ldl_filename                 14493 non-null  int64  
 9   ldl_getArg                   14493 non-null  int64  
 10  dld_domain                   14493 non-null  int64  
 11  fileNameLen                  14493 non-null  int64  
 12  this_fileExtLen              14493 non-null  int64  
 13  executable      

In [16]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

model = smf.glm('URL_Type_obf_Type ~ Querylength + domain_token_count + avgdomaintokenlen + ldl_domain + this_fileExtLen + ISIpAddressInDomainName + host_DigitCount + Extension_DigitCount + Filename_LetterCount + Path_LongestWordLength + URL_sensitiveWord + NumberRate_DirectoryName + Entropy_Domain + path_token_count + avgpathtokenlen + ldl_filename + executable + CharacterContinuityRate + Directory_DigitCount + Query_DigitCount + Extension_LetterCount + Arguments_LongestWordLength + spcharUrl + NumberRate_FileName + Entropy_Extension + charcompvowels + ldl_getArg + isPortEighty + URL_DigitCount + File_name_DigitCount + Directory_LetterCount + delimeter_Domain + Entropy_URL + ldl_url + dld_domain + NumberofDotsinURL + fileNameLen',data = malware, family=sm.families.Binomial())
result = model.fit()
print(result.summary())

  t = np.exp(-z)


                 Generalized Linear Model Regression Results                  
Dep. Variable:      URL_Type_obf_Type   No. Observations:                14493
Model:                            GLM   Df Residuals:                    14457
Model Family:                Binomial   Df Model:                           35
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                    nan
Date:                Sat, 19 Nov 2022   Deviance:                   3.6261e+05
Time:                        18:02:16   Pearson chi2:                 1.77e+19
No. Iterations:                   100   Pseudo R-squ. (CS):                nan
Covariance Type:            nonrobust                                         
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +


In [17]:
# Final variables in malware are: 

malware.columns, malware.shape

(Index(['Querylength', 'domain_token_count', 'path_token_count',
        'avgdomaintokenlen', 'avgpathtokenlen', 'charcompvowels', 'ldl_url',
        'ldl_domain', 'ldl_filename', 'ldl_getArg', 'dld_domain', 'fileNameLen',
        'this_fileExtLen', 'executable', 'isPortEighty', 'NumberofDotsinURL',
        'ISIpAddressInDomainName', 'CharacterContinuityRate', 'URL_DigitCount',
        'host_DigitCount', 'Directory_DigitCount', 'File_name_DigitCount',
        'Extension_DigitCount', 'Query_DigitCount', 'Directory_LetterCount',
        'Filename_LetterCount', 'Extension_LetterCount',
        'Path_LongestWordLength', 'Arguments_LongestWordLength',
        'URL_sensitiveWord', 'spcharUrl', 'delimeter_Domain',
        'NumberRate_DirectoryName', 'NumberRate_FileName', 'Entropy_URL',
        'Entropy_Domain', 'Entropy_Extension', 'URL_Type_obf_Type'],
       dtype='object'),
 (14493, 38))

## B. Defacement data

In [18]:
# Entropy_DirectoryName & NumberRate_Extension have more than 60% NAs so droping those variables


defacement = defacement.drop(['Entropy_DirectoryName'], axis=1)
defacement = defacement.drop(['NumberRate_Extension'], axis=1)

In [19]:
# Entropy_Extension, Entropy_Filename, NumberRate_AfterPath, avgpathtokenlen need data imputation

defacement['Entropy_Extension'] = defacement['Entropy_Extension'].fillna(defacement['Entropy_Extension'].mean())
defacement['Entropy_Filename'] = defacement['Entropy_Filename'].fillna(defacement['Entropy_Filename'].mean())
defacement['NumberRate_AfterPath'] = defacement['NumberRate_AfterPath'].fillna(defacement['NumberRate_AfterPath'].mean())
defacement['avgpathtokenlen'] = defacement['avgpathtokenlen'].fillna(defacement['avgpathtokenlen'].mean())

In [20]:
# Multicollinearity check for each data set

# MC check for malware' explanatory variables

defacement_y = defacement[['URL_Type_obf_Type']]
defacement_x = defacement.iloc[:,:-1]

In [21]:
#Check the correlation between explanatory variables

x_corr = defacement_x.corr()

x_corr

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_Domain,SymbolCount_Directoryname,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_Filename,Entropy_Extension,Entropy_Afterpath
Querylength,1.000000,0.323946,0.288837,0.126162,0.276626,-0.176904,0.323946,0.569627,0.559864,0.612540,...,0.323946,-0.264508,0.649373,0.647678,0.665888,-0.302906,-0.251152,-0.043143,0.148347,0.485448
domain_token_count,0.323946,1.000000,-0.055458,-0.205814,0.177821,-0.298313,1.000000,0.149309,0.105536,0.016098,...,1.000000,-0.280923,0.389466,0.386157,0.375475,-0.034032,-0.518857,0.071184,0.225116,0.353447
path_token_count,0.288837,-0.055458,1.000000,0.014635,0.013923,0.022788,-0.055458,0.794559,0.753737,0.035372,...,-0.055458,0.117862,0.224380,0.252699,0.281230,-0.745370,0.017022,-0.353967,-0.263239,0.004194
avgdomaintokenlen,0.126162,-0.205814,0.014635,1.000000,0.890506,-0.073592,-0.205814,0.077506,0.064264,0.032953,...,-0.205814,-0.088745,0.116369,0.120837,0.120895,-0.212576,-0.212093,-0.014403,-0.007816,0.050100
longdomaintokenlen,0.276626,0.177821,0.013923,0.890506,1.000000,-0.196354,0.177821,0.163243,0.130601,0.044771,...,0.177821,-0.194082,0.285578,0.289202,0.286505,-0.243453,-0.396530,0.000185,0.072521,0.187887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Entropy_URL,-0.302906,-0.034032,-0.745370,-0.212576,-0.243453,-0.099474,-0.034032,-0.802585,-0.770918,-0.050752,...,-0.034032,-0.157708,-0.231050,-0.273497,-0.274773,1.000000,0.114448,0.230427,0.233900,0.116474
Entropy_Domain,-0.251152,-0.518857,0.017022,-0.212093,-0.396530,0.210455,-0.518857,-0.111249,-0.079399,-0.044367,...,-0.518857,0.270723,-0.317490,-0.315645,-0.303914,0.114448,1.000000,-0.020461,-0.230779,-0.295461
Entropy_Filename,-0.043143,0.071184,-0.353967,-0.014403,0.000185,-0.026002,0.071184,-0.216135,-0.204564,-0.004040,...,0.071184,0.252687,0.189778,0.153186,0.104449,0.230427,-0.020461,1.000000,0.732293,0.202037
Entropy_Extension,0.148347,0.225116,-0.263239,-0.007816,0.072521,-0.228941,0.225116,-0.157172,-0.172936,0.036325,...,0.225116,-0.111552,0.439257,0.397059,0.332978,0.233900,-0.230779,0.732293,1.000000,0.425962


In [22]:
upper_tri = x_corr.where(np.triu(np.ones(x_corr.shape),k=1).astype(np.bool))
print(upper_tri)

                    Querylength  domain_token_count  path_token_count  \
Querylength                 NaN            0.323946          0.288837   
domain_token_count          NaN                 NaN         -0.055458   
path_token_count            NaN                 NaN               NaN   
avgdomaintokenlen           NaN                 NaN               NaN   
longdomaintokenlen          NaN                 NaN               NaN   
...                         ...                 ...               ...   
Entropy_URL                 NaN                 NaN               NaN   
Entropy_Domain              NaN                 NaN               NaN   
Entropy_Filename            NaN                 NaN               NaN   
Entropy_Extension           NaN                 NaN               NaN   
Entropy_Afterpath           NaN                 NaN               NaN   

                    avgdomaintokenlen  longdomaintokenlen  avgpathtokenlen  \
Querylength                  0.126162        

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = x_corr.where(np.triu(np.ones(x_corr.shape),k=1).astype(np.bool))


In [23]:
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)]
print(); print(to_drop)


['longdomaintokenlen', 'tld', 'charcompvowels', 'charcompace', 'ldl_path', 'ldl_getArg', 'dld_path', 'dld_filename', 'urlLen', 'domainlength', 'pathLength', 'subDirLen', 'ArgLen', 'pathurlRatio', 'ArgUrlRatio', 'argDomanRatio', 'pathDomainRatio', 'argPathRatio', 'LongestVariableValue', 'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count', 'Extension_LetterCount', 'Query_LetterCount', 'LongestPathTokenLength', 'Domain_LongestWordLength', 'sub-Directory_LongestWordLength', 'Arguments_LongestWordLength', 'URLQueries_variable', 'delimeter_path', 'delimeter_Count', 'NumberRate_URL', 'NumberRate_Domain', 'NumberRate_AfterPath', 'SymbolCount_URL', 'SymbolCount_Domain', 'SymbolCount_Directoryname', 'SymbolCount_FileName', 'SymbolCount_Extension', 'SymbolCount_Afterpath', 'Entropy_Filename', 'Entropy_Afterpath']


In [24]:
defacement = defacement.drop(defacement[to_drop], axis=1)

In [25]:
defacement['URL_Type_obf_Type'].replace(('Defacement','benign'),(1,0), inplace=True)

In [5]:
defacement.rename(columns={'this.fileExtLen': 'this_fileExtLen'}, inplace=True)

In [27]:
import statsmodels.formula.api as smf

model = smf.glm('URL_Type_obf_Type ~ Querylength + domain_token_count + avgdomaintokenlen + ldl_filename + this_fileExtLen + NumberofDotsinURL + CharacterContinuityRate + Directory_DigitCount + Directory_LetterCount + Path_LongestWordLength + delimeter_Domain + Entropy_URL + path_token_count + avgpathtokenlen + dld_url + domainUrlRatio + ISIpAddressInDomainName + URL_DigitCount + File_name_DigitCount + Filename_LetterCount + URL_sensitiveWord + NumberRate_DirectoryName + Entropy_Domain + ldl_url + dld_domain + executable + host_DigitCount + Extension_DigitCount + spcharUrl + NumberRate_FileName + Entropy_Extension + ldl_domain + dld_getArg + isPortEighty + fileNameLen ',data = defacement, family=sm.families.Binomial(),method = "detect_separation")
result = model.fit()
print(result.summary())

  t = np.exp(-z)


                 Generalized Linear Model Regression Results                  
Dep. Variable:      URL_Type_obf_Type   No. Observations:                15711
Model:                            GLM   Df Residuals:                    15677
Model Family:                Binomial   Df Model:                           33
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                    nan
Date:                Sat, 19 Nov 2022   Deviance:                       92932.
Time:                        18:02:17   Pearson chi2:                 4.54e+18
No. Iterations:                    90   Pseudo R-squ. (CS):                nan
Covariance Type:            nonrobust                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               

  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +


In [28]:
# Final variables in malware are: 

defacement.columns, malware.shape

(Index(['Querylength', 'domain_token_count', 'path_token_count',
        'avgdomaintokenlen', 'avgpathtokenlen', 'ldl_url', 'ldl_domain',
        'ldl_filename', 'dld_url', 'dld_domain', 'dld_getArg', 'fileNameLen',
        'this_fileExtLen', 'domainUrlRatio', 'executable', 'isPortEighty',
        'NumberofDotsinURL', 'ISIpAddressInDomainName',
        'CharacterContinuityRate', 'URL_DigitCount', 'host_DigitCount',
        'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
        'Directory_LetterCount', 'Filename_LetterCount',
        'Path_LongestWordLength', 'URL_sensitiveWord', 'spcharUrl',
        'delimeter_Domain', 'NumberRate_DirectoryName', 'NumberRate_FileName',
        'Entropy_URL', 'Entropy_Domain', 'Entropy_Extension',
        'URL_Type_obf_Type'],
       dtype='object'),
 (14493, 38))

## C. Phishing data

In [29]:
# NumberRate_Extension have more than 60% NAs so droping those variables


phishing = phishing.drop(['NumberRate_Extension'], axis=1)

# Entropy_Afterpath,Entropy_Extension,Entropy_Filename,Entropy_DirectoryName, NumberRate_AfterPath,
# NumberRate_FileName,NumberRate_DirectoryName,avgpathtokenlen need data imputation

phishing['Entropy_Afterpath'] = phishing['Entropy_Afterpath'].fillna(phishing['Entropy_Afterpath'].mean())
phishing['Entropy_Extension'] = phishing['Entropy_Extension'].fillna(phishing['Entropy_Extension'].mean())
phishing['Entropy_Filename'] = phishing['Entropy_Filename'].fillna(phishing['Entropy_Filename'].mean())
phishing['Entropy_DirectoryName'] = phishing['Entropy_DirectoryName'].fillna(phishing['Entropy_DirectoryName'].mean())
phishing['NumberRate_AfterPath'] = phishing['NumberRate_AfterPath'].fillna(phishing['NumberRate_AfterPath'].mean())
phishing['NumberRate_FileName'] = phishing['NumberRate_FileName'].fillna(phishing['NumberRate_FileName'].mean())
phishing['NumberRate_DirectoryName'] = phishing['NumberRate_DirectoryName'].fillna(phishing['NumberRate_DirectoryName'].mean())
phishing['avgpathtokenlen'] = phishing['avgpathtokenlen'].fillna(phishing['avgpathtokenlen'].mean())

In [30]:
# Multicollinearity check for each data set

# MC check for malware' explanatory variables

phishing_y = phishing[['URL_Type_obf_Type']]
phishing_x = phishing.iloc[:,:-1]

In [31]:
#Check the correlation between explanatory variables

x_corr = phishing_x.corr()

x_corr

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_Directoryname,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath
Querylength,1.000000,-0.053468,0.218294,-0.040704,-0.056614,0.062090,-0.053468,0.273243,0.391294,0.466491,...,0.035522,0.559374,0.551512,0.603472,-0.131310,0.013178,0.044444,0.006767,0.088744,0.537776
domain_token_count,-0.053468,1.000000,-0.268521,0.087895,0.318600,0.085961,1.000000,-0.192718,-0.128956,0.152290,...,0.058779,-0.086715,-0.085419,-0.075700,-0.011964,-0.516759,-0.002941,0.045168,-0.040654,-0.093018
path_token_count,0.218294,-0.268521,1.000000,-0.249478,-0.289509,-0.135474,-0.268521,0.794364,0.684431,0.019036,...,0.358003,0.211533,0.217322,0.321338,-0.655448,0.203486,-0.103114,-0.158715,-0.115744,0.248414
avgdomaintokenlen,-0.040704,0.087895,-0.249478,1.000000,0.905642,0.034148,0.087895,-0.204113,-0.135983,0.100424,...,-0.092574,-0.080799,-0.067849,-0.105775,-0.045007,-0.353929,-0.066031,-0.049793,-0.050162,-0.125393
longdomaintokenlen,-0.056614,0.318600,-0.289509,0.905642,1.000000,0.069998,0.318600,-0.221183,-0.140228,0.144452,...,-0.042098,-0.105841,-0.091734,-0.127645,-0.051726,-0.470505,-0.062975,-0.035881,-0.059928,-0.152927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Entropy_Domain,0.013178,-0.516759,0.203486,-0.353929,-0.470505,-0.109671,-0.516759,0.132582,0.055677,-0.145276,...,-0.007350,0.006042,-0.004827,0.016169,0.124993,1.000000,0.025630,-0.005277,-0.005178,0.057863
Entropy_DirectoryName,0.044444,-0.002941,-0.103114,-0.066031,-0.062975,0.040116,-0.002941,-0.029059,-0.000931,0.063844,...,0.301158,0.237079,0.191355,0.074831,0.043326,0.025630,1.000000,0.957331,0.654404,0.071164
Entropy_Filename,0.006767,0.045168,-0.158715,-0.049793,-0.035881,0.014595,0.045168,-0.096295,-0.066303,0.033490,...,0.344161,0.204951,0.144956,0.050805,0.101533,-0.005277,0.957331,1.000000,0.690181,0.071305
Entropy_Extension,0.088744,-0.040654,-0.115744,-0.050162,-0.059928,-0.093789,-0.040654,-0.140374,-0.113768,0.028084,...,0.104700,0.375711,0.269593,0.101573,0.124216,-0.005178,0.654404,0.690181,1.000000,0.104704


In [32]:
upper_tri = x_corr.where(np.triu(np.ones(x_corr.shape),k=1).astype(np.bool))
print(upper_tri)

                       Querylength  domain_token_count  path_token_count  \
Querylength                    NaN           -0.053468          0.218294   
domain_token_count             NaN                 NaN         -0.268521   
path_token_count               NaN                 NaN               NaN   
avgdomaintokenlen              NaN                 NaN               NaN   
longdomaintokenlen             NaN                 NaN               NaN   
...                            ...                 ...               ...   
Entropy_Domain                 NaN                 NaN               NaN   
Entropy_DirectoryName          NaN                 NaN               NaN   
Entropy_Filename               NaN                 NaN               NaN   
Entropy_Extension              NaN                 NaN               NaN   
Entropy_Afterpath              NaN                 NaN               NaN   

                       avgdomaintokenlen  longdomaintokenlen  avgpathtokenlen  \
Queryl

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = x_corr.where(np.triu(np.ones(x_corr.shape),k=1).astype(np.bool))


In [33]:
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)]
print(); print(to_drop)


['longdomaintokenlen', 'tld', 'charcompvowels', 'charcompace', 'ldl_path', 'dld_url', 'dld_domain', 'dld_path', 'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength', 'subDirLen', 'ArgLen', 'pathurlRatio', 'ArgUrlRatio', 'argDomanRatio', 'pathDomainRatio', 'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount', 'Extension_DigitCount', 'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count', 'Query_LetterCount', 'LongestPathTokenLength', 'URLQueries_variable', 'delimeter_path', 'delimeter_Count', 'NumberRate_URL', 'NumberRate_AfterPath', 'SymbolCount_Domain', 'SymbolCount_Directoryname', 'SymbolCount_FileName', 'SymbolCount_Extension', 'SymbolCount_Afterpath', 'Entropy_DirectoryName', 'Entropy_Filename', 'Entropy_Afterpath']


In [34]:
phishing = phishing.drop(phishing[to_drop], axis=1)

In [35]:
phishing['URL_Type_obf_Type'].replace(('phishing','benign'),(1,0), inplace=True)

In [6]:
phishing.rename(columns={'this.fileExtLen': 'this_fileExtLen'}, inplace=True)
phishing.rename(columns={'sub-Directory_LongestWordLength': 'sub_Directory_LongestWordLength'}, inplace=True)

In [37]:
phishing.dropna()

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,avgpathtokenlen,ldl_url,ldl_domain,ldl_filename,ldl_getArg,fileNameLen,...,spcharUrl,delimeter_Domain,NumberRate_Domain,NumberRate_DirectoryName,NumberRate_FileName,SymbolCount_URL,Entropy_URL,Entropy_Domain,Entropy_Extension,URL_Type_obf_Type
0,0,2,12,5.500000,4.083334,0,0,0,0,2,...,5,0,0.0,-1.000000,-1.000000,6,0.676804,0.860529,-1.00000,0
1,0,3,12,5.000000,3.583333,2,0,2,0,40,...,4,0,0.0,0.666667,0.044444,7,0.715629,0.776796,1.00000,0
2,2,2,11,4.000000,4.750000,0,0,0,0,2,...,6,0,0.0,0.000000,0.000000,8,0.677701,1.000000,0.00000,0
3,0,2,7,4.500000,5.714286,0,0,0,0,2,...,4,0,0.0,0.000000,0.000000,5,0.696067,0.879588,0.00000,0
4,19,2,10,6.000000,2.250000,0,0,0,0,2,...,3,0,0.0,0.000000,0.139535,9,0.747202,0.833700,0.83615,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15362,0,2,3,8.000000,3.333333,0,0,0,0,2,...,3,0,0.0,0.000000,0.000000,4,0.797046,0.884870,0.00000,1
15363,0,3,0,9.000000,5.289936,0,0,0,0,2,...,1,0,0.0,-1.000000,-1.000000,3,0.797564,0.813569,-1.00000,1
15364,0,3,2,6.666666,3.000000,0,0,0,0,2,...,2,0,0.0,0.000000,0.000000,4,0.791104,0.801139,0.00000,1
15365,0,2,3,8.000000,3.333333,0,0,0,0,2,...,3,0,0.0,0.000000,0.000000,4,0.716580,0.787659,0.00000,1


In [38]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

model = smf.glm('URL_Type_obf_Type ~ Querylength + domain_token_count + avgdomaintokenlen + ldl_filename + domainUrlRatio + NumberofDotsinURL + CharacterContinuityRate + File_name_DigitCount + Extension_LetterCount + Path_LongestWordLength + Arguments_LongestWordLength + delimeter_Domain + NumberRate_FileName + Entropy_Domain + path_token_count + avgpathtokenlen + ldl_getArg + argPathRatio + ISIpAddressInDomainName + Directory_DigitCount + Directory_LetterCount + Domain_LongestWordLength + sub_Directory_LongestWordLength + URL_sensitiveWord + NumberRate_Domain + SymbolCount_URL + Entropy_Extension + ldl_url + fileNameLen + executable + Filename_LetterCount + spcharUrl + NumberRate_DirectoryName + Entropy_URL + ldl_domain + this_fileExtLen + isPortEighty',data = phishing, family=sm.families.Binomial())
result = model.fit()
print(result.summary())

MissingDataError: exog contains inf or nans

## D. Spam data

In [39]:
#  more than 60% NAs so droping those variables


spam = spam.drop(['NumberRate_Extension'], axis=1)

#  need data imputation

spam['Entropy_Afterpath'] = spam['Entropy_Afterpath'].fillna(spam['Entropy_Afterpath'].mean())
spam['Entropy_Extension'] = spam['Entropy_Extension'].fillna(spam['Entropy_Extension'].mean())
spam['Entropy_Filename'] = spam['Entropy_Filename'].fillna(spam['Entropy_Filename'].mean())
spam['Entropy_DirectoryName'] = spam['Entropy_DirectoryName'].fillna(spam['Entropy_DirectoryName'].mean())
spam['NumberRate_AfterPath'] = spam['NumberRate_AfterPath'].fillna(spam['NumberRate_AfterPath'].mean())
spam['avgpathtokenlen'] = spam['avgpathtokenlen'].fillna(spam['avgpathtokenlen'].mean())

In [40]:
# Multicollinearity check for each data set

# MC check for malware' explanatory variables

spam_y = spam[['URL_Type_obf_Type']]
spam_x = spam.iloc[:,:-1]

In [None]:
#Check the correlation between explanatory variables

x_corr = spam_x.corr()

x_corr

In [None]:
upper_tri = x_corr.where(np.triu(np.ones(x_corr.shape),k=1).astype(np.bool))
print(upper_tri)

In [None]:
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)]
print(); print(to_drop)

In [None]:
spam = spam.drop(spam[to_drop], axis=1)

In [None]:
spam['URL_Type_obf_Type'].replace(('spam','benign'),(1,0), inplace=True)

In [7]:
spam.rename(columns={'this.fileExtLen': 'this_fileExtLen'}, inplace=True)
spam.rename(columns={'sub-Directory_LongestWordLength': 'sub_Directory_LongestWordLength'}, inplace=True)

In [None]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

model = smf.glm('URL_Type_obf_Type ~ Querylength + domain_token_count + avgdomaintokenlen + dld_domain + pathurlRatio + isPortEighty + CharacterContinuityRate + File_name_DigitCount + Path_LongestWordLength + URL_sensitiveWord + NumberRate_URL + SymbolCount_URL + path_token_count + avgpathtokenlen + domainlength + ArgUrlRatio + NumberofDotsinURL + host_DigitCount + Directory_LetterCount + Arguments_LongestWordLength + spcharUrl + NumberRate_DirectoryName + Entropy_URL + ldl_domain + fileNameLen + domainUrlRatio + ISIpAddressInDomainName + Directory_DigitCount + Filename_LetterCount + delimeter_Domain + NumberRate_FileName + Entropy_Domain + ldl_filename + this_fileExtLen + executable + delimeter_path + Entropy_Extension',data = spam, family=sm.families.Binomial())
result = model.fit()
print(result.summary())

In [None]:
spam.columns, spam.shape

### Common Columns are : 

Querylength
domain_token_count
avgdomaintokenlen
ldl_domain
this_fileExtLen
ISIpAddressInDomainName
Filename_LetterCount
Path_LongestWordLength
URL_sensitiveWord
NumberRate_DirectoryName
Entropy_Domain
path_token_count
avgpathtokenlen
ldl_filename
executable
CharacterContinuityRate
Directory_DigitCount
spcharUrl
NumberRate_FileName
Entropy_Extension
isPortEighty
File_name_DigitCount
Directory_LetterCount
delimeter_Domain
Entropy_URL
NumberofDotsinURL
fileNameLen

## 1.2. Stacking the final data with common significant columns

In [9]:
malware1 = pd.read_csv("/Users/varunbhavnani/Documents/Applied ML/Project/Malware.csv")
defacement1 = pd.read_csv("/Users/varunbhavnani/Documents/Applied ML/Project/Defacement.csv")
phishing1 = pd.read_csv("/Users/varunbhavnani/Documents/Applied ML/Project/Phishing.csv")
spam1 = pd.read_csv("/Users/varunbhavnani/Documents/Applied ML/Project/Spam.csv")

In [10]:
df = pd.concat([malware1, defacement1, phishing1,spam1], ignore_index=True,axis=0)

In [11]:
df['URL_Type_obf_Type'].value_counts()

benign        31124
Defacement     7930
phishing       7586
malware        6712
spam           6698
Name: URL_Type_obf_Type, dtype: int64

In [12]:
df['URL_Type_obf_Type'].replace(('benign','Defacement','phishing','malware','spam'), (0,1,2,3,4), inplace=True)

In [13]:
df_new = df[['Querylength','domain_token_count','avgdomaintokenlen',
'ldl_domain',
'this.fileExtLen',
'ISIpAddressInDomainName',
'Filename_LetterCount',
'Path_LongestWordLength',
'URL_sensitiveWord',
'NumberRate_DirectoryName',
'Entropy_Domain',
'path_token_count',
'avgpathtokenlen',
'ldl_filename',
'executable','CharacterContinuityRate',
'Directory_DigitCount',
'spcharUrl',
'NumberRate_FileName',
'Entropy_Extension',
'isPortEighty',
'File_name_DigitCount',
'Directory_LetterCount',
'delimeter_Domain',
'Entropy_URL',
'NumberofDotsinURL',
'fileNameLen','URL_Type_obf_Type']]

In [14]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60050 entries, 0 to 60049
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Querylength               60050 non-null  int64  
 1   domain_token_count        60050 non-null  int64  
 2   avgdomaintokenlen         60050 non-null  float64
 3   ldl_domain                60050 non-null  int64  
 4   this.fileExtLen           60050 non-null  int64  
 5   ISIpAddressInDomainName   60050 non-null  int64  
 6   Filename_LetterCount      60050 non-null  int64  
 7   Path_LongestWordLength    60050 non-null  int64  
 8   URL_sensitiveWord         60050 non-null  int64  
 9   NumberRate_DirectoryName  60040 non-null  float64
 10  Entropy_Domain            60050 non-null  float64
 11  path_token_count          60050 non-null  int64  
 12  avgpathtokenlen           59749 non-null  float64
 13  ldl_filename              60050 non-null  int64  
 14  execut

In [15]:
df_new['NumberRate_DirectoryName'] = df['NumberRate_DirectoryName'].fillna(df['NumberRate_DirectoryName'].mean())
df_new['avgpathtokenlen'] = df['avgpathtokenlen'].fillna(df['avgpathtokenlen'].mean())
df_new['NumberRate_FileName'] = df['NumberRate_FileName'].fillna(df['NumberRate_FileName'].mean())
df_new['Entropy_Extension'] = df['Entropy_Extension'].fillna(df['Entropy_Extension'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['NumberRate_DirectoryName'] = df['NumberRate_DirectoryName'].fillna(df['NumberRate_DirectoryName'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['avgpathtokenlen'] = df['avgpathtokenlen'].fillna(df['avgpathtokenlen'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_n

In [16]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60050 entries, 0 to 60049
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Querylength               60050 non-null  int64  
 1   domain_token_count        60050 non-null  int64  
 2   avgdomaintokenlen         60050 non-null  float64
 3   ldl_domain                60050 non-null  int64  
 4   this.fileExtLen           60050 non-null  int64  
 5   ISIpAddressInDomainName   60050 non-null  int64  
 6   Filename_LetterCount      60050 non-null  int64  
 7   Path_LongestWordLength    60050 non-null  int64  
 8   URL_sensitiveWord         60050 non-null  int64  
 9   NumberRate_DirectoryName  60050 non-null  float64
 10  Entropy_Domain            60050 non-null  float64
 11  path_token_count          60050 non-null  int64  
 12  avgpathtokenlen           60050 non-null  float64
 13  ldl_filename              60050 non-null  int64  
 14  execut

In [17]:
df_new['URL_Type_obf_Type'].value_counts()

0    31124
1     7930
2     7586
3     6712
4     6698
Name: URL_Type_obf_Type, dtype: int64

## 1.3. KNN Model

In [46]:
y = df_new[['URL_Type_obf_Type']]
x = df_new.iloc[:,:-1]

In [47]:
# Data Splitting
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=42)

In [79]:
# Train a k-NN model with k = 3
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

  return self._fit(X, y)


KNeighborsClassifier()

In [80]:
knn.score(X_test, y_test)

0.9395856923999201

In [81]:
y_pred=knn.predict(X_test)

In [52]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print("Accuracy:",result2)

Confusion Matrix:
[[7702   14   40   40   22]
 [  42 1821   65   14   15]
 [ 201   94 1528   53   30]
 [  62   17   46 1505   10]
 [  40   45   48    9 1550]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      7818
           1       0.91      0.93      0.92      1957
           2       0.88      0.80      0.84      1906
           3       0.93      0.92      0.92      1640
           4       0.95      0.92      0.93      1692

    accuracy                           0.94     15013
   macro avg       0.93      0.91      0.92     15013
weighted avg       0.94      0.94      0.94     15013

Accuracy: 0.9395856923999201


In [53]:
knn = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
k_range = list(range(3,10,2))
param_grid = dict(n_neighbors=k_range)
  
# defining parameter range
grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', return_train_score=False,verbose=1)
  
# fitting the model for grid search
grid_search=grid.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [74]:
print(grid_search.best_params_)

{'n_neighbors': 3}


In [55]:
accuracy = grid_search.best_score_ *100
print("Accuracy for our training dataset with tuning is : {:.2f}%".format(accuracy) )

Accuracy for our training dataset with tuning is : 94.59%


In [76]:
y_pred=grid_search.predict(X_test)

In [77]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print("Accuracy:",result2)

Confusion Matrix:
[[7775    0   22    6   15]
 [  25 1839   59   16   18]
 [ 166   83 1574   60   23]
 [  34   14   38 1546    8]
 [  34   41   45    8 1564]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      7818
           1       0.93      0.94      0.93      1957
           2       0.91      0.83      0.86      1906
           3       0.94      0.94      0.94      1640
           4       0.96      0.92      0.94      1692

    accuracy                           0.95     15013
   macro avg       0.94      0.93      0.93     15013
weighted avg       0.95      0.95      0.95     15013

Accuracy: 0.9523746086724838


In [None]:
import mlxtend
from mlextend.plotting import plot_decision_regions

plot_decision_regions(X_test.values, y_test.values, clf = knn, legend = 5)
plt.title("Decision boundary using KNN Classification (Test)")
plt.xlabel("mean_perimeter")
plt.ylabel("mean_texture")

## 1.4. Decision Tree

In [58]:
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn import tree # for decision tree models

In [None]:
df_new.head()

In [None]:
def fitting(X, y, criterion, splitter, mdepth, clweight, minleaf):

    # Create training and testing samples
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Fit the model
    model = tree.DecisionTreeClassifier(criterion=criterion, 
                                        splitter=splitter, 
                                        max_depth=mdepth,
                                        class_weight=clweight,
                                        min_samples_leaf=minleaf, 
                                        random_state=0, 
                                  )
    clf = model.fit(X_train, y_train)

    # Predict class labels on training data
    pred_labels_tr = model.predict(X_train)
    # Predict class labels on a test data
    pred_labels_te = model.predict(X_test)

    # Tree summary and model evaluation metrics
    print('*************** Tree Summary ***************')
    print('Classes: ', clf.classes_)
    print('Tree Depth: ', clf.tree_.max_depth)
    print('No. of leaves: ', clf.tree_.n_leaves)
    print('No. of features: ', clf.n_features_in_)
    print('--------------------------------------------------------')
    print("")
    
    print('*************** Evaluation on Test Data ***************')
    score_te = model.score(X_test, y_test)
    print('Accuracy Score: ', score_te)
    # Look at classification report to evaluate the model
    print(classification_report(y_test, pred_labels_te))
    print('--------------------------------------------------------')
    print("")
    
    print('*************** Evaluation on Training Data ***************')
    score_tr = model.score(X_train, y_train)
    print('Accuracy Score: ', score_tr)
    # Look at classification report to evaluate the model
    print(classification_report(y_train, pred_labels_tr))
    print('--------------------------------------------------------')
    

    
    # Return relevant data for chart plotting
    return X_train, X_test, y_train, y_test, clf

In [59]:
# Fit the model and display results
X_train, X_test, y_train, y_test, clf = fitting(x, y, 'gini', 'best', 
                                                       mdepth=3, 
                                                       clweight=None,
                                                       minleaf=1000)

NameError: name 'fitting' is not defined

In [60]:
from sklearn.tree import DecisionTreeClassifier

# GridSearch + CV
from sklearn.model_selection import GridSearchCV

opt_tree = DecisionTreeClassifier(random_state = 0, criterion="gini")

dt_params = {'max_depth':  range(1,10),
             'min_samples_split':   range(2,11),
             'max_leaf_nodes':    range(2,11)   }

grid_tree = GridSearchCV(opt_tree, dt_params)
grid_tree.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
             param_grid={'max_depth': range(1, 10),
                         'max_leaf_nodes': range(2, 11),
                         'min_samples_split': range(2, 11)})

In [61]:
# Report the best hyperparameters chosen

grid_tree.best_params_

{'max_depth': 5, 'max_leaf_nodes': 10, 'min_samples_split': 2}

In [62]:
grid_tree.best_score_

0.7420564496806412

In [63]:
print(classification_report(y_test, grid_tree.predict(X_test)))

              precision    recall  f1-score   support

           0       0.85      0.90      0.87      7818
           1       0.76      0.51      0.61      1957
           2       0.49      0.76      0.59      1906
           3       0.54      0.30      0.39      1640
           4       0.79      0.69      0.74      1692

    accuracy                           0.74     15013
   macro avg       0.69      0.63      0.64     15013
weighted avg       0.75      0.74      0.74     15013



## 1.5. Random Forest

In [19]:
y = df_new[['URL_Type_obf_Type']]
x = df_new.iloc[:,:-1]

In [61]:
# Create training and testing samples

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [63]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=200, max_samples=100, random_state=42)

rnd_clf.fit(X_train, y_train.values.ravel())

print(rnd_clf.score(X_test, y_test))

0.830391340549542


In [68]:
from sklearn.model_selection import GridSearchCV

rfc=RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [69]:
CV_rfc.best_params_

{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 500}

In [70]:
CV_rfc.best_score_

0.9156952539550375

In [72]:
from sklearn.metrics import classification_report

print(classification_report(y_test, CV_rfc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.92      0.99      0.96      6190
           1       0.95      0.86      0.90      1608
           2       0.81      0.83      0.82      1523
           3       0.97      0.73      0.84      1301
           4       0.95      0.93      0.94      1388

    accuracy                           0.92     12010
   macro avg       0.92      0.87      0.89     12010
weighted avg       0.92      0.92      0.92     12010



## 1.6. Multinomial Logistics Regression Model

In [79]:
mul_lr = LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train,y_train.values.ravel())



In [80]:
print(mul_lr.score(X_test, y_test))

0.8108243130724396


In [81]:
print(classification_report(y_test, mul_lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      6190
           1       0.74      0.74      0.74      1608
           2       0.70      0.72      0.71      1523
           3       0.63      0.37      0.47      1301
           4       0.80      0.74      0.77      1388

    accuracy                           0.81     12010
   macro avg       0.75      0.71      0.72     12010
weighted avg       0.80      0.81      0.80     12010



In [65]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression(multi_class='multinomial', solver='newton-cg')
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(X_train,y_train.values.ravel())

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

70 fits failed out of a total of 140.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.80431659 

tuned hpyerparameters :(best parameters)  {'C': 0.1, 'penalty': 'l2'}
accuracy : 0.8052047352288133


In [66]:
print(logreg.score(X_test, y_test))

NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [67]:
print(classification_report(y_test, logreg.predict(X_test)))

NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [68]:
logreg=LogisticRegression(multi_class='multinomial', solver='newton-cg', regularization='l2', C = 0.1)

TypeError: __init__() got an unexpected keyword argument 'regularization'

In [70]:
regularized_lr=LogisticRegression(penalty='l2',solver='newton-cg',max_iter=100, C = 0.1)

regularized_lr.fit(X_train,y_train.values.ravel())

LogisticRegression(C=0.1, solver='newton-cg')

In [71]:
print(regularized_lr.score(X_test, y_test))

0.8040365016985279


In [72]:
print(classification_report(y_test,regularized_lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      7818
           1       0.71      0.72      0.71      1957
           2       0.68      0.69      0.68      1906
           3       0.62      0.36      0.45      1640
           4       0.78      0.75      0.77      1692

    accuracy                           0.80     15013
   macro avg       0.73      0.69      0.71     15013
weighted avg       0.79      0.80      0.79     15013



## 1.7. Naive Bayes Classification

In [20]:
# Create training and testing samples

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [22]:
from sklearn.naive_bayes import GaussianNB

gaussian_nb = GaussianNB()
gaussian_nb.fit(X_train, y_train.values.ravel())

GaussianNB()

In [23]:
Y_preds = gaussian_nb.predict(X_test)

print(Y_preds[:15])
print(y_test[:15])

print('\nTest Accuracy : {:.3f}'.format(gaussian_nb.score(X_test, y_test))) ## Score method also evaluates accuracy for classification models.
print('Training Accuracy : {:.3f}'.format(gaussian_nb.score(X_train, y_train)))

[1 1 0 1 0 0 0 1 0 2 0 1 1 2 1]
       URL_Type_obf_Type
55263                  4
11085                  3
7838                   3
33638                  0
72                     0
49717                  0
53852                  4
55262                  4
34871                  0
43015                  2
31749                  0
20273                  1
12514                  3
45100                  2
58040                  4

Test Accuracy : 0.652
Training Accuracy : 0.656


In [25]:
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy Score : {:.3f}".format(accuracy_score(y_test, Y_preds)))
print("\nClassification Report :")
print(classification_report(y_test, Y_preds))

Accuracy Score : 0.652

Classification Report :
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      6190
           1       0.42      0.85      0.56      1608
           2       0.67      0.23      0.34      1523
           3       0.66      0.16      0.25      1301
           4       0.42      0.61      0.50      1388

    accuracy                           0.65     12010
   macro avg       0.61      0.53      0.50     12010
weighted avg       0.70      0.65      0.64     12010



In [28]:
n_features, n_classes = x.shape[1], np.unique(y)

n_features, n_classes

(27, array([0, 1, 2, 3, 4]))

In [31]:
from sklearn.model_selection import GridSearchCV

params = {
            'priors': [None, [0.1,]* len(n_classes),],
            'var_smoothing': [1e-9, 1e-6, 1e-12],
         }

gaussian_nb_grid = GridSearchCV(GaussianNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
gaussian_nb_grid.fit(X_train,y_train.values.ravel())

print('Best Accuracy Through Grid Search : {:.3f}'.format(gaussian_nb_grid.best_score_))
print('Best Parameters : {}\n'.format(gaussian_nb_grid.best_params_))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Accuracy Through Grid Search : 0.652
Best Parameters : {'priors': None, 'var_smoothing': 1e-09}



15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/naive_bayes.py", line 245, in fit
    return self._partial_fit(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/naive_bayes.py", line 431, in _partial_fit
    raise ValueError("The sum of the priors should be 1.")
ValueError: The sum of the priors should be 1.



In [32]:
from sklearn.metrics import classification_report, accuracy_score

Y_preds = gaussian_nb_grid.best_estimator_.predict(X_test)
Y_preds_train = gaussian_nb_grid.best_estimator_.predict(X_train)

print("Test Accuracy Score : {:.3f}".format(accuracy_score(y_test, Y_preds)))
print("Train Accuracy Score : {:.3f}".format(accuracy_score(y_train, Y_preds_train)))
print("\nClassification Report :")
print(classification_report(y_test, Y_preds))

Test Accuracy Score : 0.652
Train Accuracy Score : 0.656

Classification Report :
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      6190
           1       0.42      0.85      0.56      1608
           2       0.67      0.23      0.34      1523
           3       0.66      0.16      0.25      1301
           4       0.42      0.61      0.50      1388

    accuracy                           0.65     12010
   macro avg       0.61      0.53      0.50     12010
weighted avg       0.70      0.65      0.64     12010

[CV 3/5] END ..priors=None, var_smoothing=1e-09;, score=0.657 total time=   0.1s
[CV 4/5] END ..priors=None, var_smoothing=1e-06;, score=0.640 total time=   0.1s
[CV 3/5] END ..priors=None, var_smoothing=1e-12;, score=0.656 total time=   0.1s
[CV 1/5] END priors=[0.1, 0.1, 0.1, 0.1, 0.1], var_smoothing=1e-06;, score=nan total time=   0.0s
[CV 2/5] END priors=[0.1, 0.1, 0.1, 0.1, 0.1], var_smoothing=1e-12;, score=nan total tim

## 1.8. SVM Classification

In [33]:
#Importing the necessary packages and libaries
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm, datasets
import matplotlib.pyplot as plt
import numpy as np

In [35]:
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state = 0)

In [37]:
linear = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo').fit(X_train, y_train.values.ravel())
rbf = svm.SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo').fit(X_train, y_train.values.ravel())
poly = svm.SVC(kernel='poly', degree=3, C=1, decision_function_shape='ovo').fit(X_train, y_train.values.ravel())
sig = svm.SVC(kernel='sigmoid', C=1, decision_function_shape='ovo').fit(X_train, y_train.values.ravel())

In [38]:
linear_pred = linear.predict(X_test)
poly_pred = poly.predict(X_test)
rbf_pred = rbf.predict(X_test)
sig_pred = sig.predict(X_test)

In [40]:
# retrieve the accuracy and print it for all 4 kernel functions
accuracy_lin = linear.score(X_test, y_test)
accuracy_poly = poly.score(X_test, y_test)
accuracy_rbf = rbf.score(X_test, y_test)
accuracy_sig = sig.score(X_test, y_test)
print("Accuracy Linear Kernel:", accuracy_lin)
print("Accuracy Polynomial Kernel:", accuracy_poly)
print("Accuracy Radial Basis Kernel:", accuracy_rbf)
print("Accuracy Sigmoid Kernel:", accuracy_sig)

Accuracy Linear Kernel: 0.8388009991673605
Accuracy Polynomial Kernel: 0.5895087427144047
Accuracy Radial Basis Kernel: 0.8810990840965862
Accuracy Sigmoid Kernel: 0.48209825145711904


In [41]:
print(classification_report(y_test,linear_pred))
print(classification_report(y_test,poly_pred))
print(classification_report(y_test,rbf_pred))
print(classification_report(y_test,sig_pred))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      6190
           1       0.77      0.76      0.77      1608
           2       0.72      0.76      0.74      1523
           3       0.80      0.45      0.58      1301
           4       0.89      0.78      0.83      1388

    accuracy                           0.84     12010
   macro avg       0.81      0.75      0.77     12010
weighted avg       0.84      0.84      0.83     12010

              precision    recall  f1-score   support

           0       0.57      0.98      0.72      6190
           1       0.67      0.28      0.39      1608
           2       0.93      0.03      0.06      1523
           3       0.86      0.07      0.14      1301
           4       0.73      0.29      0.42      1388

    accuracy                           0.59     12010
   macro avg       0.75      0.33      0.35     12010
weighted avg       0.68      0.59      0.50     12010

              precisio

In [45]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
  
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train, y_train.values.ravel())




Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.709 total time= 6.0min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.718 total time= 6.0min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.715 total time= 6.0min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.715 total time= 5.9min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.712 total time= 6.0min
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.826 total time= 3.9min
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.827 total time= 3.9min
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.827 total time= 3.9min
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.828 total time= 3.9min
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.819 total time= 4.2min
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.861 total time= 1.9min
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

KeyboardInterrupt: 

In [None]:
# Best parameter: C=0.1, gamma=0.01, kernel=rbf;, score=0.871

## 1.9. Voting Classifier

In [82]:
# S2: Apply Voting Classifier

from sklearn.ensemble import VotingClassifier

# define voting classifier
voting_clf = VotingClassifier(estimators=[('lr1', regularized_lr), ('svc', rbf), ('knn', knn )] , voting='hard')

# train the model
voting_clf.fit(X_train, y_train)

# Performance Measure
print("Test score for voting classifier is:", voting_clf.score(X_test, y_test))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test score for voting classifier is: 0.9230666755478585


In [83]:
voting_clf.predict(X_test)

array([1, 1, 2, ..., 0, 0, 0])

In [84]:
print(classification_report(y_test,voting_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94      7818
           1       0.95      0.88      0.92      1957
           2       0.93      0.71      0.81      1906
           3       0.99      0.90      0.94      1640
           4       0.99      0.89      0.94      1692

    accuracy                           0.92     15013
   macro avg       0.95      0.88      0.91     15013
weighted avg       0.93      0.92      0.92     15013

