In [1]:
import pandas as pd
import numpy as np
import sys
import joblib
sys.modules['sklearn.externals.joblib'] = joblib
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV , train_test_split
from sklearn.metrics import accuracy_score, recall_score,confusion_matrix
from skelm import ELMClassifier
from itertools import product

In [2]:
# Now we import the dataset
df=pd.read_csv('Group_6_data_cleaned.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,SFH,popUpWidnow,SSLfinal_State,Request_URL,URL_of_Anchor,web_traffic,URL_Length,age_of_domain,having_IP_Address,Result
0,0,1,-1,1,-1,-1,1,1,1,0,0
1,1,-1,-1,-1,-1,-1,0,1,1,1,1
2,2,1,-1,0,0,-1,0,-1,1,0,1
3,3,1,0,1,-1,-1,0,1,1,0,0
4,4,-1,-1,1,-1,0,0,-1,1,0,1


In [3]:
df = df.drop(['Unnamed: 0'],axis=1)

In [4]:
X = df.drop(["Result"], axis = 1)
y = df["Result"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 42,test_size = 0.2)

In [6]:
# Oversampling and transform the dataset
import imblearn
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X_train,y_train= oversample.fit_resample(X_train, y_train)

# Bi-directional Elmination with wrapper method

# Best Model from Part 1: Random Forest

In [7]:
rf_ = RandomForestClassifier()

In [8]:
sffs = SFS(rf_,forward=True, floating=True, k_features=5, scoring='accuracy', cv=10)

In [9]:
X_train.shape,y_train.shape

((1710, 9), (1710,))

In [10]:
sffs.fit(X_train,y_train)

SequentialFeatureSelector(cv=10, estimator=RandomForestClassifier(),
                          floating=True, k_features=(5, 5), scoring='accuracy')

In [11]:
sffs.k_feature_names_

('SFH', 'SSLfinal_State', 'Request_URL', 'URL_of_Anchor', 'URL_Length')

In [12]:
x_train_alt = X_train[list(sffs.k_feature_names_)]
x_test_alt=X_test[list(sffs.k_feature_names_)]

print(x_train_alt)

      SFH  SSLfinal_State  Request_URL  URL_of_Anchor  URL_Length
0      -1              -1           -1              1          -1
1       1               1            1              1           0
2      -1               0           -1             -1           1
3       0               1            0              1           0
4      -1               1           -1             -1          -1
...   ...             ...          ...            ...         ...
1705    1               0           -1             -1           0
1706    1               0           -1             -1          -1
1707    1               0           -1             -1          -1
1708   -1              -1            0             -1           0
1709   -1              -1           -1             -1           0

[1710 rows x 5 columns]


In [13]:
print(x_test_alt)

      SFH  SSLfinal_State  Request_URL  URL_of_Anchor  URL_Length
49      1              -1           -1              1           1
638    -1               0           -1             -1          -1
1033    1               1           -1              1           0
746     1               1            1              1           0
918    -1               1           -1             -1           0
...   ...             ...          ...            ...         ...
731     1               1            1              0           0
1283    1               1            0             -1           0
1306   -1              -1           -1              1          -1
724    -1              -1           -1              1           0
54      1               0           -1             -1          -1

[271 rows x 5 columns]


In [14]:
params_grid={
    'max_depth': [10,50,90],
    'max_features':['auto','log2'],
    'n_estimators':[10,55,100]
}

In [15]:
grid_rf=GridSearchCV(rf_, params_grid, scoring='accuracy', refit=True, n_jobs=1)

In [16]:
grid_search = grid_rf.fit(X_train,y_train)
print(grid_search)

GridSearchCV(estimator=RandomForestClassifier(), n_jobs=1,
             param_grid={'max_depth': [10, 50, 90],
                         'max_features': ['auto', 'log2'],
                         'n_estimators': [10, 55, 100]},
             scoring='accuracy')


In [17]:
print(grid_search.best_params_)
accuracy=grid_search.best_score_*100
print("Accuracy for our training dataset",accuracy)

{'max_depth': 50, 'max_features': 'log2', 'n_estimators': 100}
Accuracy for our training dataset 94.09356725146199


In [18]:
rf_clf_sffs = RandomForestClassifier(n_estimators=100, max_depth = 10, max_features = 'log2')
rf_clf_sffs.fit(x_train_alt, y_train)

RandomForestClassifier(max_depth=10, max_features='log2')

In [19]:
rf_preds = rf_clf_sffs.predict(x_train_alt)

In [20]:
accuracy_score(y_true = y_train, y_pred = rf_preds)
print("Accuracy for our training dataset",accuracy)

Accuracy for our training dataset 94.09356725146199


In [21]:
recall = recall_score(y_true = y_train, y_pred = rf_preds,average='weighted')
print("Recall for our training dataset",recall)

Recall for our training dataset 0.9146198830409357


In [22]:
cm= confusion_matrix(y_train,rf_preds)
cm

array([[538,   7,  25],
       [  0, 560,  10],
       [ 48,  56, 466]])

In [23]:
rf_preds = rf_clf_sffs.predict(x_train_alt)
accuracy = accuracy_score(y_true = y_train, y_pred = rf_preds)
print("Accuracy for training dataset",accuracy)
recall = recall_score(y_true = y_train, y_pred = rf_preds,average='weighted')
print("Recall for training dataset",recall)
cm= confusion_matrix(y_train,rf_preds)
cm

Accuracy for training dataset 0.9146198830409357
Recall for training dataset 0.9146198830409357


array([[538,   7,  25],
       [  0, 560,  10],
       [ 48,  56, 466]])

In [24]:
rf_preds = rf_clf_sffs.predict(x_test_alt)
accuracy = accuracy_score(y_true = y_test, y_pred = rf_preds)
print("Accuracy for testing dataset",accuracy)
recall = recall_score(y_true = y_test, y_pred = rf_preds,average='weighted')
print("Recall for testing dataset",recall)
cm= confusion_matrix(y_test,rf_preds)
cm

Accuracy for testing dataset 0.8265682656826568
Recall for testing dataset 0.8265682656826568


array([[120,   3,   9],
       [  0,  21,   4],
       [ 21,  10,  83]])

# Best Model from Part-2 : ELM

In [26]:
# Add the required tags for compatibility with scikit-learn 1.0 and later
ELMClassifier._get_tags = lambda self: {'binary_only': True, 'multiclass': True, 'multioutput': False, 'pairwise': False, 'poor_score': False, 'no_validation': False, 'no_score': False}


In [27]:
# Define ELM classifier with default parameters
clf = ELMClassifier()

In [28]:
sffs = SFS(clf,
          k_features=5,
          forward=True,
          floating=True,
          scoring='accuracy',
          cv=5)

In [29]:
# Fit SFS on training data
sffs.fit(X_train, y_train)

SequentialFeatureSelector(estimator=ELMClassifier(), floating=True,
                          k_features=(5, 5), scoring='accuracy')

In [30]:
selected_features = list(sffs.k_feature_idx_)
selected_features

[0, 2, 3, 4, 5]

In [31]:
sffs.k_feature_names_

('SFH', 'SSLfinal_State', 'Request_URL', 'URL_of_Anchor', 'web_traffic')

In [32]:
x_train_alt = X_train[list(sffs.k_feature_names_)]
x_test_alt=X_test[list(sffs.k_feature_names_)]


In [33]:
from itertools import product

class ELMGridSearch:
    def __init__(self, param_grid,cv=5):
        self.param_grid = param_grid
        self.cv = cv
        
    def fit(self, X_train, y_train):
        best_score = 0
        best_params = None
        
        n_samples = X_train.shape[0]
        fold_size = n_samples // self.cv
        
        for params in product(*self.param_grid.values()):
            params = dict(zip(self.param_grid.keys(), params))
            clf = ELMClassifier(**params)
            clf.fit(X_train, y_train)
            # Perform manual cross-validation
            scores = []
            for fold in range(self.cv):
                start = fold * fold_size
                end = start + fold_size
                
                # Split data into training and validation folds
                X_val = X_train[start:end]
                y_val = y_train[start:end]
                X_tr = np.concatenate((X_train[:start], X_train[end:]))
                y_tr = np.concatenate((y_train[:start], y_train[end:]))
                
                # Train the model on the training fold
                clf.fit(X_tr, y_tr)
                
                # Evaluate the model on the validation fold
                score = clf.score(X_val, y_val)
                scores.append(score)
                
            # Compute the average performance across all folds
            score = np.mean(scores)
            
            
            if score > best_score:
                best_score = score
                best_params = params
        
        self.best_params_ = best_params
        self.best_score_ = best_score
        self.best_estimator_ = ELMClassifier(**best_params)
        self.best_estimator_.fit(X_train, y_train)
        
        return self
        
    def predict(self, X_test):
        return self.best_estimator_.predict(X_test)
param_grid = {
    'n_neurons': [100, 200, 300,400,512,1000],
    'ufunc': [ 'relu','tanh'],
    'alpha': [0.001, 0.01, 0.1]
}

search = ELMGridSearch(param_grid,cv=5)
search.fit(X_train, y_train)

print(search.best_params_)
print(search.best_score_)

{'n_neurons': 400, 'ufunc': 'relu', 'alpha': 0.1}
0.9321637426900585


In [34]:
model_2 = ELMClassifier(n_neurons = 400,ufunc = 'relu',alpha = 0.1)
model_2.fit(x_train_alt,y_train)

ELMClassifier(alpha=0.1, n_neurons=400, ufunc='relu')

In [35]:
elm_preds = model_2.predict(x_train_alt)

In [36]:
accuracy_score(y_true = y_train, y_pred = elm_preds)
print("Accuracy for our training dataset",search.best_score_*100)

Accuracy for our training dataset 93.21637426900585


In [37]:
recall = recall_score(y_true = y_train, y_pred = elm_preds,average='weighted')
print("Recall for our training dataset",recall)

Recall for our training dataset 0.895906432748538


In [38]:
cm= confusion_matrix(y_train,elm_preds)
cm

array([[494,  46,  30],
       [  1, 528,  41],
       [ 37,  23, 510]])

In [None]:
elm_preds = model_2.predict(x_test_alt)
accuracy = accuracy_score(y_true = y_test, y_pred = elm_preds)
print("Accuracy for testing dataset",accuracy)
recall = recall_score(y_true = y_test, y_pred = elm_preds,average='weighted')
print("Recall for testing dataset",recall)
cm= confusion_matrix(y_test,elm_preds)
cm