In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE

In [2]:
# Read the raw capture table. The alternative input "../../data/mergeddata2.csv"
# is kept here for reference only and is not loaded.
data = pd.read_csv(filepath_or_buffer="../../data/ipmapping.csv")

In [None]:
# Display the loaded frame as the cell output.
# A random five-row preview is available by switching to the line below.
#data.sample(5)
data

In [3]:
# Modelling frame: pick the five raw CSV columns used below and give them
# snake_case names. Insertion order fixes the column order
# (four feature columns first, the label column last).
_RAW_TO_CLEAN = {
    "Source Port": "source_port",
    "Destination Port": "destination_port",
    "Source IP": "source_ip",
    "Destination IP": "destination_ip",
    "Device Type1": "device_type",
}
df = pd.DataFrame({clean: data[raw] for raw, clean in _RAW_TO_CLEAN.items()})


In [None]:
def _coerce_numeric(column):
    """Return ``column`` converted to numbers when at least one value parses.

    Non-text columns are returned untouched. For text columns, values that do
    not parse become NaN; if nothing parses at all, the original column is kept.
    """
    is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
    if not is_text:
        return column
    converted = pd.to_numeric(column, errors="coerce")
    return column if converted.isna().all() else converted


# DataFrame.convert_objects was removed in pandas 1.0; this reproduces its
# convert_numeric=True behaviour with pd.to_numeric.
# NOTE(review): `df` was already built from `data` in an earlier cell, so this
# conversion does not reach `df`.
data = data.apply(_coerce_numeric)

In [None]:
# Show the first five rows of the modelling frame.
df.head(n=5)

In [None]:
# Discard rows in which every column is missing (row-wise is dropna's default axis).
# This rebinds `data` only; `df`, built in an earlier cell, is left as it was.
data = data.dropna(how='all')

In [4]:
# Column names only (not data): the first four columns of `df` are the features,
# the fifth is the label.
X = df.columns[:4].tolist()
y = df.columns[4]

In [None]:
# Number of feature columns selected by X.
n_features = len(df[X].columns)

In [None]:
# True when at least one cell of `data` is missing.
# Iterating a DataFrame yields its column labels, so the builtin any() over
# data.isnull() only tests whether a label is truthy; reduce over the values instead.
bool(data.isnull().values.any())

In [None]:
# Number of rows per distinct value of the label column.
df.loc[:, y].value_counts()

In [None]:
# Bar chart of row counts per label value.
sns.countplot(data=df, x=y, palette='hls')

In [None]:
# Ask matplotlib to render any figure that is still open.
# NOTE(review): with the inline backend enabled in the first cell, the figure is
# presumably already rendered by the cell that created it - confirm this cell is needed.
plt.show()

In [None]:
# Per-label mean of the numeric columns.
# `y` names a column of `df` ("device_type"); `data` carries that column under its
# raw header "Device Type1", so grouping `data` by `y` fails with a KeyError.
# numeric_only=True keeps text columns from breaking the mean.
df.groupby(y).mean(numeric_only=True)

In [None]:
# Mean of the numeric columns per source port.
# X[0] is the renamed column "source_port", which exists on `df`; `data` carries it
# under the raw header "Source Port". numeric_only=True skips text columns.
df.groupby(X[0]).mean(numeric_only=True)

In [None]:
# Mean of the numeric columns per destination port.
# X[1] is the renamed column "destination_port", which exists on `df`; `data` carries
# it under the raw header "Destination Port". numeric_only=True skips text columns.
df.groupby(X[1]).mean(numeric_only=True)

In [None]:
# Mean of the numeric columns per source IP.
# X[2] is the renamed column "source_ip", which exists on `df`; `data` carries it
# under the raw header "Source IP". numeric_only=True skips text columns.
df.groupby(X[2]).mean(numeric_only=True)

In [None]:
%matplotlib inline
pd.crosstab(df[X[0]],df[y]).plot(kind='bar')
plt.title('Frequency for Source Port')
plt.xlabel('Source Port')
plt.ylabel('Packet')
plt.savefig('src-freq-packet-device')

In [None]:
%matplotlib inline
pd.crosstab(df[X[1]], df[y]).plot(kind='bar')
plt.title('Mapping of Destination Port on Label' )
plt.xlabel('Dest Port')
plt.ylabel('Packet')
plt.savefig('src-dest-packet')

In [None]:
%matplotlib inline
pd.crosstab(df[X[2]],df[y]).plot(kind='bar')
plt.title('Frequency for Source IP')
plt.xlabel('Source IP')
plt.ylabel('packet')
plt.savefig('src-ip-packet-device')

In [None]:
%matplotlib inline
pd.crosstab(data[X[1]],data[y]).plot(kind='bar')
plt.title('Frequency for Destination Port')
plt.xlabel('Destination Port')
plt.ylabel('Packet')
plt.savefig('dst-port-packet')

In [None]:
%matplotlib inline
plt.figure(figsize=(1,2))
pd.crosstab(data[X[2]], data[y]).plot(kind='bar')
plt.title('Frequency for Protocol')
plt.xlabel('Protocol')
plt.ylabel('Packet')
plt.savefig('protocol-packet')

In [None]:
# Pairwise relationship grid over the columns of the raw frame.
sns.pairplot(data=data)

In [None]:
# Ask matplotlib to render any figure that is still open.
# NOTE(review): with the inline backend enabled in the first cell, the figure is
# presumably already rendered by the cell that created it - confirm this cell is needed.
plt.show()

In [None]:
# Plot the label column against the source port (X[0]); both names are resolved
# as columns of `df` through the `data` keyword.
marker_style = dict(marker='o', color='mediumvioletred')
plt.plot(X[0], y, data=df, **marker_style)
plt.show()

In [None]:
# Regression plot of destination port against source port, coloured by label.
# X[0] and X[1] are renamed column names that exist on `df`, so `df` must be the data
# source; `df` has no "Packet" column, so the label column `y` is used as the hue.
# NOTE(review): confirm that colouring by the label is what "Packet" was meant to do.
g = sns.lmplot(x=X[0], y=X[1], hue=y, data=df, palette="Set1")

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
features, target = df[X], df[y]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=42,
)

In [6]:
# Recursive feature elimination with a logistic-regression estimator.
logreg = LogisticRegression()
# n_features_to_select is keyword-only on current scikit-learn, so passing 18
# positionally raises a TypeError. The request is also capped at the number of
# available columns: with the four features here every feature is kept, which is
# what the recorded output below shows.
rfe = RFE(logreg, n_features_to_select=min(18, X_train.shape[1]))
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)   # boolean mask of the selected features
print(rfe.ranking_)   # rank 1 marks a selected feature

[ True  True  True  True]
[1 1 1 1]


In [None]:
import statsmodels.api as sm
# Fit a statsmodels logit of the label on the four features and print its summary.
# X and y hold the renamed column names that exist on `df`; the raw `data` frame uses
# the original CSV headers, so indexing it with them raises a KeyError.
# NOTE(review): sm.Logit expects a binary 0/1 response - confirm the label encoding.
logit_model = sm.Logit(df[y], df[X])
result = logit_model.fit()
print(result.summary())

In [None]:
# Train a fresh logistic-regression classifier on the training split.
# fit() returns the estimator itself, so the chained form binds the fitted model;
# the bare name on the last line keeps the estimator repr as the cell output.
logreg = LogisticRegression().fit(X_train, y_train)
logreg

In [None]:
# Predicted labels for the held-out rows; reused by the evaluation cells below.
y_pred = logreg.predict(X_test)
# Mean accuracy of the fitted classifier on the test split.
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated accuracy of a fresh logistic regression on the training split.
# Current scikit-learn raises a ValueError when KFold gets random_state without
# shuffle=True. The folds here are not shuffled, so the seed had no effect and is
# dropped; the rows were already shuffled when train_test_split drew the training set.
kfold = model_selection.KFold(n_splits=10)
modelCV = LogisticRegression()
scoring = 'accuracy'
results = cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

In [None]:
from sklearn.metrics import confusion_matrix
# Tabulate true vs. predicted labels on the test split.
# NOTE: the result is bound to the same name as the imported function, so after this
# line `confusion_matrix` refers to the array; the import above restores the function
# each time the cell runs.
confusion_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion_matrix)

In [None]:
from sklearn.metrics import classification_report
# Text report of per-class metrics for the test-split predictions.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Probability that each test row belongs to the second class (column 1 of predict_proba).
# NOTE(review): taking column 1 assumes a two-class label - confirm.
positive_scores = logreg.predict_proba(X_test)[:, 1]
# Score the AUC on the same probabilities the curve is drawn from. Using hard
# predict() labels instead collapses the curve to a single threshold, so the number
# in the legend would not be the area under the plotted line.
logit_roc_auc = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()