In [1]:
import numpy as np 
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import Normalizer

from sklearn.svm import SVC

from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score)



In [2]:
# Load the KDD train/test CSV files; neither file has a header row.
train_frame = pd.read_csv('kddtrain.csv', header=None)
test_frame = pd.read_csv('kddtest.csv', header=None)

# Column 0 is used as the class label, columns 1..41 as the features.
Y = train_frame.iloc[:, 0]
X = train_frame.iloc[:, 1:42]
C = test_frame.iloc[:, 0]
T = test_frame.iloc[:, 1:42]

# Training split: normalize the features, then keep plain numpy arrays.
scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
traindata = np.array(trainX)
trainlabel = np.array(Y)

# Test split: same treatment, with a Normalizer fitted on the test features.
scaler = Normalizer().fit(T)
testT = scaler.transform(T)
testdata = np.array(testT)
testlabel = np.array(C)

# Train and evaluate LR, NB, DT and RF on the normalized KDD data.
# (The SVM experiment is not run here.)

# Fixed seed so the tree-based models give the same scores on every run.
SEED = 42


def evaluate_classifier(model, train_x, train_y, test_x, test_y):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    train_x, train_y : training features and labels.
    test_x, test_y : test features and expected labels.

    Returns
    -------
    (predicted, proba, scores) where ``predicted`` is the output of
    ``model.predict(test_x)``, ``proba`` is the output of
    ``model.predict_proba(test_x)`` and ``scores`` is the list
    ``[accuracy, recall, precision, f1]`` (binary averaging).
    """
    model.fit(train_x, train_y)
    predicted = model.predict(test_x)
    proba = model.predict_proba(test_x)
    scores = [
        accuracy_score(test_y, predicted),
        recall_score(test_y, predicted, average="binary"),
        precision_score(test_y, predicted, average="binary"),
        f1_score(test_y, predicted, average="binary"),
    ]
    return predicted, proba, scores


classifiers = ["Logistic Regression ", "Naive Bayes", "Decision Trees", "Ensemble Model : Random Forest"]
expected = testlabel

# Logistic Regression
lr_model = LogisticRegression()
lr_predicted, lr_proba, lr_scores = evaluate_classifier(
    lr_model, traindata, trainlabel, testdata, expected)

# Naive Bayes
nb_model = GaussianNB()
nb_predicted, nb_proba, nb_scores = evaluate_classifier(
    nb_model, traindata, trainlabel, testdata, expected)

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=SEED)
dt_predicted, dt_proba, dt_scores = evaluate_classifier(
    dt_model, traindata, trainlabel, testdata, expected)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=150, random_state=SEED)
rf_predicted, rf_proba, rf_scores = evaluate_classifier(
    rf_model, traindata, trainlabel, testdata, expected)

# One [accuracy, recall, precision, f1] row per entry of `classifiers`, same order.
classifier_deets = [lr_scores, nb_scores, dt_scores, rf_scores]

# Each line is printed as a single pre-formatted string so the report looks
# the same under Python 2 (where print(a, b) would display a tuple) and Python 3.
for name, (accuracy, recall, precision, f1) in zip(classifiers, classifier_deets):
    print("{} :".format(name))
    print("Accuracy :  {}".format(accuracy))
    print("Recall :  {}".format(recall))
    print("Precision :  {}".format(precision))
    print("F1 Score :  {}".format(f1))
    print("**************************************\n")

('Logistic Regression ', ':')
('Accuracy : ', 0.8480720447289481)
('Recall : ', 0.8208444472839368)
('Precision : ', 0.988521610340649)
('F1 Score : ', 0.89691355870085)
**************************************

('Naive Bayes', ':')
('Accuracy : ', 0.9294695992978147)
('Recall : ', 0.9232418661853727)
('Precision : ', 0.9883980900024366)
('F1 Score : ', 0.9547095876439777)
**************************************

('Decision Trees', ':')
('Accuracy : ', 0.9282896450170242)
('Recall : ', 0.9122210864252743)
('Precision : ', 0.9985968685252695)
('F1 Score : ', 0.9534567308695562)
**************************************

('Ensemble Model : Random Forest', ':')
('Accuracy : ', 0.9262448196148912)
('Recall : ', 0.9095617243527289)
('Precision : ', 0.9987241206955515)
('F1 Score : ', 0.9520599188320509)
**************************************



In [3]:
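# SVM (disabled): everything below is commented out. Uncommenting it would fit a linear-kernel SVC and print the same four metrics used for the models above.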

# svm_model = SVC(kernel='linear', C=0.01, probability=True)
# svm_model.fit(traindata, trainlabel)
# expected = testlabel
# svm_predicted = svm_model.predict(testdata)
# svm_proba = svm_model.predict_proba(testdata)

# y_train1 = expected
# y_pred = svm_predicted

# accuracy = accuracy_score(y_train1, y_pred)
# recall = recall_score(y_train1, y_pred , average="binary")
# precision = precision_score(y_train1, y_pred , average="binary")
# f1 = f1_score(y_train1, y_pred, average="binary")

# print ("SVM : ")
# print ("Accuracy : ", accuracy)
# print ("Recall : ", recall)
# print ("Precision : ", precision)
# print ("F1 Score : ", f1)