In [1]:
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV,train_test_split,cross_val_score
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,RobustScaler
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix,roc_auc_score,roc_curve,auc
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

### Data Processing

In [2]:
data = pd.read_csv('C:/Users/ADMIN/Desktop/Code/Jupyter N_book/Week 5/Train_Data.csv')
data.describe()

Unnamed: 0,duration,srcbytes,dstbytes,land,wrongfragment,urgent,hot,numfailedlogins,loggedin,numcompromised,...,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,lastflag,attack
count,86845.0,86845.0,86845.0,86845.0,86845.0,86845.0,86845.0,86845.0,86845.0,86845.0,...,86845.0,86845.0,86845.0,86845.0,86845.0,86845.0,86845.0,86845.0,86845.0,86845.0
mean,104.166872,8455.707,2732.095,4.6e-05,0.0,3.5e-05,0.144971,0.000771,0.441142,0.320583,...,0.526457,0.050186,0.07581,0.016246,0.324666,0.318934,0.092391,0.091045,20.060522,0.379964
std,1038.273538,358214.1,55430.41,0.006787,0.0,0.007588,1.837958,0.037009,0.496527,27.993474,...,0.447353,0.103323,0.208388,0.055271,0.464291,0.463506,0.282768,0.281452,1.421425,0.48538
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0
50%,0.0,46.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.55,0.03,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0
75%,0.0,272.0,768.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.07,0.02,0.01,1.0,1.0,0.0,0.0,21.0,1.0
max,40504.0,89581520.0,7028652.0,1.0,0.0,2.0,77.0,4.0,1.0,7479.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,21.0,1.0


In [3]:
target = data.attack
data.drop('attack', axis = 1,inplace = True)
data.shape

(86845, 42)

In [4]:
data.dtypes

duration                    int64
protocoltype               object
service                    object
flag                       object
srcbytes                    int64
dstbytes                    int64
land                        int64
wrongfragment               int64
urgent                      int64
hot                         int64
numfailedlogins             int64
loggedin                    int64
numcompromised              int64
rootshell                   int64
suattempted                 int64
numroot                     int64
numfilecreations            int64
numshells                   int64
numaccessfiles              int64
numoutboundcmds             int64
ishostlogin                 int64
isguestlogin                int64
count                       int64
srvcount                    int64
serrorrate                float64
srvserrorrate             float64
rerrorrate                float64
srvrerrorrate             float64
samesrvrate               float64
diffsrvrate   

In [5]:
encode = LabelEncoder()
obj_cols = ['protocoltype','service','flag']
for col in obj_cols:
    data[col] = encode.fit_transform(data[col])
data[obj_cols].head()

Unnamed: 0,protocoltype,service,flag
0,1,33,1
1,1,49,8
2,1,22,8
3,1,22,8
4,1,62,1


In [6]:
scaler = RobustScaler()
data_scaled = scaler.fit_transform(data)
data_scaled = pd.DataFrame(data_scaled, columns=data.columns)
data_scaled.head()

Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,...,dsthostsrvcount,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,lastflag
0,0.0,0.0,0.478261,-1.75,-0.169118,-0.057292,0.0,0.0,0.0,0.0,...,-0.255144,-0.515789,0.428571,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,1.173913,0.0,4.386029,0.463542,0.0,0.0,0.0,0.0,...,0.160494,-0.105263,0.142857,0.0,0.0,0.11,0.0,0.02,0.0,-1.5
2,0.0,0.0,0.0,0.0,0.647059,1.173177,0.0,0.0,0.0,0.0,...,0.73251,0.473684,-0.428571,1.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.694853,1.739583,0.0,0.0,0.0,0.0,...,0.73251,0.473684,-0.428571,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.73913,-1.75,-0.169118,-0.057292,0.0,0.0,0.0,0.0,...,-0.304527,-0.568421,0.714286,0.0,0.0,0.0,0.0,1.0,1.0,-1.0


### Cross Validation

In [7]:
x_train,x_test,y_train,y_test = train_test_split(data_scaled,target, test_size=0.33)

In [20]:
log = LogisticRegression()
log_para = {
    'penalty': ['l2'],
    'solver': ['newton-cg'],
    'max_iter': [480,500,520]
}

svm = SVC()
svm_para = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'coef0': [0.0, 0.1, 0.5, 1.0],
    'max_iter': [1000, 5000, 10000]
}

gboost = GradientBoostingClassifier()
gboost_para = {
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [5, 10, 25, 50],
    'min_samples_split': [250, 500, 1000],
    'min_samples_leaf': [100, 200, 400],
    'max_features': ['auto', 'sqrt', 'log2'],
    'loss': ['deviance', 'exponential']
}

rand_for = RandomForestClassifier()
rand_for_para={
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 5, 10, 25, 50],
    'min_samples_split': [250, 500, 1000],
    'min_samples_leaf': [100, 200, 400],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

gnb = GaussianNB()
gnb_para = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

In [9]:
def score(model,parameter,title = "Default"):
    classifier = RandomizedSearchCV(model,parameter,cv = 5)
    classifier.fit(x_train,y_train)
    best_para = classifier.best_params_
    print(f'The best Para.s for {model} is {best_para}')

    best_model = model.set_params(**best_para)
    best_model.fit(x_train, y_train)
    preds = best_model.predict(x_test)
    
    print(confusion_matrix(y_test, preds))
    accuracy = round(accuracy_score(y_test, preds), 5)
    print('Accuracy for', title, ':', accuracy, '\n')
    
    cross_val = cross_val_score(best_model,x_train,y_train)
    print(f' The cross val score is for {title} is:{cross_val}')
    
    f1 = f1_score(y_test,preds)
    print(f' The F1 score is for {title} is:{f1}')

In [12]:
def score_knownpara(model,title = "Default"):
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    
    print(confusion_matrix(y_test, preds))
    accuracy = round(accuracy_score(y_test, preds), 5)
    print('Accuracy for', title, ':', accuracy, '\n')
    
    cross_val = cross_val_score(model,x_train,y_train)
    print(f' The cross val score is for {title} is:{cross_val}')
    
    f1 = f1_score(y_test,preds)
    print(f' The F1 score is for {title} is:{f1}')

In [15]:
log_know_500 = LogisticRegression(penalty= 'l2',solver = 'newton-cg',max_iter = 500)
score_knownpara(log_know_500,'Logistic Regression')

[[17673     0]
 [    2 10984]]
Accuracy for Logistic Regression : 0.99993 

 The cross val score is for Logistic Regression is:[0.99991407 1.         0.99965627 0.99991407 0.99982813]
 The F1 score is for Logistic Regression is:0.9999089667728721


In [21]:
score(rand_for,rand_for_para,'Random Forest')

The best Para.s for RandomForestClassifier() is {'n_estimators': 100, 'min_samples_split': 250, 'min_samples_leaf': 100, 'max_features': 'sqrt', 'max_depth': 50, 'criterion': 'entropy'}
[[17668     5]
 [    5 10981]]
Accuracy for Random Forest : 0.99965 

 The cross val score is for Random Forest is:[0.9996563  0.99957034 0.99982813 0.99957034 0.99982813]
 The F1 score is for Random Forest is:0.999544875295831


In [22]:
score(gnb,gnb_para,'Gaussian NB')

The best Para.s for GaussianNB() is {'var_smoothing': 1e-09}
[[17585    88]
 [   15 10971]]
Accuracy for Gaussian NB : 0.99641 

 The cross val score is for Gaussian NB is:[0.99647706 0.99561743 0.99656269 0.99725015 0.99604709]
 The F1 score is for Gaussian NB is:0.9953277387162622


In [24]:
score(gboost,gboost_para,'Gradient Boosting')

The best Para.s for GradientBoostingClassifier() is {'n_estimators': 200, 'min_samples_split': 250, 'min_samples_leaf': 100, 'max_features': 'sqrt', 'max_depth': 10, 'loss': 'exponential', 'learning_rate': 0.3}
[[17673     0]
 [    0 10986]]
Accuracy for Gradient Boosting : 1.0 

 The cross val score is for Gradient Boosting is:[1. 1. 1. 1. 1.]
 The F1 score is for Gradient Boosting is:1.0


### __Test Data Result__

In [37]:
test = pd.read_csv('C:/Users/ADMIN/Desktop/Code/Jupyter N_book/Week 5/Test_Data.csv')
test.head()

Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,...,dsthostsrvcount,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,lastflag
0,0,tcp,mtp,REJ,0,0,0,0,0,0,...,7,0.03,0.08,0.0,0.0,0.0,0.0,1.0,1.0,20
1,0,tcp,http,SF,199,1721,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21
2,0,tcp,discard,S0,0,0,0,0,0,0,...,14,0.05,0.09,0.0,0.0,1.0,1.0,0.0,0.0,18
3,0,tcp,telnet,S0,0,0,0,0,0,0,...,2,0.01,0.09,0.0,0.0,1.0,1.0,0.0,0.0,18
4,0,tcp,exec,S0,0,0,0,0,0,0,...,16,0.06,0.06,0.0,0.0,1.0,1.0,0.0,0.0,20


In [38]:
encode = LabelEncoder()
obj_cols = ['protocoltype','service','flag']
for col in obj_cols:
    test[col] = encode.fit_transform(test[col])

In [39]:
scaler = RobustScaler()
test_scaled = scaler.fit_transform(test)
test_scaled = pd.DataFrame(test_scaled, columns=test.columns)

In [40]:
test_scaled.head()

Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,...,dsthostsrvcount,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,lastflag
0,0.0,0.0,0.375,-1.75,-0.168498,-0.058728,0.0,0.0,0.0,0.0,...,-0.296296,-0.557895,0.857143,0.0,0.0,0.0,0.0,1.0,1.0,-0.5
1,0.0,0.0,0.0,0.0,0.56044,2.187276,0.0,0.0,0.0,0.0,...,0.72428,0.463158,-0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,-0.541667,-1.0,-0.168498,-0.058728,0.0,0.0,0.0,0.0,...,-0.26749,-0.536842,1.0,0.0,0.0,1.0,1.0,0.0,0.0,-1.5
3,0.0,0.0,1.375,-1.0,-0.168498,-0.058728,0.0,0.0,0.0,0.0,...,-0.316872,-0.578947,1.0,0.0,0.0,1.0,1.0,0.0,0.0,-1.5
4,0.0,0.0,-0.25,-1.0,-0.168498,-0.058728,0.0,0.0,0.0,0.0,...,-0.259259,-0.526316,0.571429,0.0,0.0,1.0,1.0,0.0,0.0,-0.5


#### Recruiting Finalised Model

In [41]:
best_paras = {'n_estimators': 200, 
              'min_samples_split': 250, 
              'min_samples_leaf': 100, 
              'max_features': 'sqrt', 
              'max_depth': 10, 
              'loss': 'exponential', 
              'learning_rate': 0.3
             }
GB_final = GradientBoostingClassifier(**best_paras)

In [42]:
GB_final.fit(data_scaled,target)

In [47]:
preds = GB_final.predict(test_scaled)
series_preds = pd.Series(preds)
dataframe_preds = pd.DataFrame({
    'attack': series_preds,
})

In [48]:
dataframe_preds

Unnamed: 0,attack
0,1
1,0
2,1
3,1
4,1
...,...
21707,0
21708,0
21709,1
21710,0


In [49]:
dataframe_preds.to_csv(f"Submission_Pranay_Sol.csv")