In [1]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd 
import numpy as np

models = [
LinearSVC(),
KNeighborsClassifier()
]

missing_values = ["n/a", "na", "--", " ", "", "NA"]
dataset = pd.read_csv("./Dataset_Challenge2.csv", na_values = missing_values)

In [14]:
dataset

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CONTENT_LENGTH,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,APP_PACKETS,DNS_QUERY_TIMES,Type
0,16,7,263.0,7,0,2,700,9,10,9,2.0,1
1,16,6,15087.0,17,7,4,1230,17,19,17,0.0,0
2,16,6,324.0,0,0,0,0,0,0,0,0.0,0
3,17,6,162.0,31,22,3,3812,39,37,39,8.0,0
4,17,6,124140.0,57,2,5,4278,61,62,61,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1776,194,16,,0,0,0,0,0,3,0,0.0,1
1777,198,17,,0,0,0,0,0,2,0,0.0,1
1778,201,34,8904.0,83,2,6,6631,87,89,87,4.0,0
1779,234,34,,0,0,0,0,0,0,0,0.0,0


In [15]:
# count the number of nan values in each column
print(dataset.isnull().sum())

URL_LENGTH                     0
NUMBER_SPECIAL_CHARACTERS      0
CONTENT_LENGTH               812
TCP_CONVERSATION_EXCHANGE      0
DIST_REMOTE_TCP_PORT           0
REMOTE_IPS                     0
APP_BYTES                      0
SOURCE_APP_PACKETS             0
REMOTE_APP_PACKETS             0
APP_PACKETS                    0
DNS_QUERY_TIMES                1
Type                           0
dtype: int64


In [16]:
dataset = dataset.fillna(0)

In [17]:
x_train = dataset.iloc[:,:-1]
y_train = dataset.iloc[:,-1]
for model in models:
    model.seed = 42
    num_folds = 5
    scores = cross_val_score(model, x_train, y_train, cv=num_folds, scoring='neg_mean_squared_error')
    score_description = " %0.2f (+/- %0.2f)" % (np.sqrt(scores.mean()*-1), scores.std() * 2)
    print('{model:25} CV-5 RMSE: {score}'.format(model=model.__class__.__name__, score=score_description))




LinearSVC                 CV-5 RMSE:  0.42 (+/- 0.12)
KNeighborsClassifier      CV-5 RMSE:  0.32 (+/- 0.03)


In [18]:
models = [
LinearSVC(),
KNeighborsClassifier()
]

missing_values = ["n/a", "na", "--", " ", "", "NA"]
dataset = pd.read_csv("./Dataset_Challenge2.csv", na_values = missing_values)
dataset = dataset.fillna(0)

x_train = dataset.iloc[:100,:-1]
y_train = dataset.iloc[:100,-1]
x_unlabeled = dataset.iloc[100:,:-1]
for model in models:
    model.seed = 42
    num_folds = 5
    model.fit(x_train, y_train)   # Training
    y_unlabeled = model.predict(x_unlabeled)  # Creating pseudo-labeled data
    y_unlabeled = pd.DataFrame(y_unlabeled, columns = ['Type'])
    seudo_labeled_data = x_unlabeled.join(y_unlabeled)
    seudo_labeled_data = seudo_labeled_data.fillna(0)
    x_merged = x_train.append(seudo_labeled_data.iloc[:,:-1])
    y_merged = y_train.append(seudo_labeled_data.iloc[:,-1])
    scores = cross_val_score(model, x_merged, y_merged, cv=num_folds, scoring='neg_mean_squared_error')
    score_description = " %0.2f (+/- %0.2f)" % (np.sqrt(scores.mean()*-1), scores.std() * 2)
    print('{model:25} CV-5 RMSE: {score}'.format(model=model.__class__.__name__, score=score_description))



LinearSVC                 CV-5 RMSE:  0.54 (+/- 0.40)
KNeighborsClassifier      CV-5 RMSE:  0.35 (+/- 0.11)


