## Pseudo Labelling using multiple supervised ML models

In [2]:
# to handle datasets
import pandas as pd
import numpy as np

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# machine learning imports
from sklearn.svm import LinearSVC, NuSVC 
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error

### Making list of all models

In [3]:
models = [
    NuSVC(),
    LinearSVC(),
    SGDClassifier(),
    KNeighborsClassifier(),
    NearestCentroid()
]

In [4]:
# load dataset
labelled_data = pd.read_csv('dataset_labeled.csv')
unlabelled_data = pd.read_csv('dataset_unlabeled.csv')

# rows and columns of the data
print(labelled_data.shape)
print(unlabelled_data.shape)

# visualise the dataset
labelled_data.head()

(1500, 6)
(41777, 5)


Unnamed: 0,hash,size_of_data,virtual_address,entropy,virtual_size,malware
0,071e8c3f8922e186e57548cd4c703a5d,443392,4096,6.529624,442984,1
1,33f8e6d08a6aae939f25a8e0d63dd523,331264,4096,6.604314,330784,1
2,b68abd064e975e1c6d5f25e748663076,74240,4096,6.046789,73819,1
3,72049be7bd30ea61297ea624ae198067,219648,4096,6.497018,219524,1
4,c9b3700a77facf29172f32df6bc77f48,262144,4096,6.638142,261943,1


In [7]:
unlabelled_data.nunique()

hash               41660
size_of_data        2506
virtual_address      128
entropy            16671
virtual_size       10906
dtype: int64

In [4]:
unlabelled_data.head(10)

Unnamed: 0,hash,size_of_data,virtual_address,entropy,virtual_size
0,a4f7d238f59c6ea07159a83182f86538,63488,4096,6.6365,63195
1,64fe3cc06265bca6cc175cecfc16fc2e,87040,4096,6.584123,86944
2,e06686752e033aff3198ff10e47b3bd3,25600,4096,6.468836,25494
3,8be9ad3cbe9c0da179466393583c486e,733184,4096,6.665506,732799
4,c6830efb14d4f80e1ba6a9e56d05bce6,1196032,4096,6.655021,1195949
5,1325d315f2ddbc2acc79a95d07ef41f2,48128,4096,6.661043,47981
6,9522b650e9eed6134d72e1720d1d7724,244224,4096,6.564004,244109
7,38eba210348b73a609f891d67ab50557,262144,4096,6.638142,261943
8,c8408030aa41b72d7a466fabde262ebe,198144,4096,6.615477,197905
9,028e30f7b50fd2ab2cd9df91cd3fd66d,120320,4096,6.583415,120079


In [5]:
labelled_data.describe()

Unnamed: 0,size_of_data,virtual_address,entropy,virtual_size,malware
count,1500.0,1500.0,1500.0,1500.0,1500.0
mean,713205.4,5543.253,6.353871,713072.0,0.333333
std,2568605.0,31173.16,0.546324,2568588.0,0.471562
min,1024.0,4096.0,0.650292,900.0,0.0
25%,28672.0,4096.0,6.262846,28432.0,0.0
50%,103168.0,4096.0,6.497885,102840.0,0.0
75%,443392.0,4096.0,6.581666,442984.0,1.0
max,52962820.0,1159168.0,7.999813,52962520.0,1.0


In [6]:
labelled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   hash             1500 non-null   object 
 1   size_of_data     1500 non-null   int64  
 2   virtual_address  1500 non-null   int64  
 3   entropy          1500 non-null   float64
 4   virtual_size     1500 non-null   int64  
 5   malware          1500 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 70.4+ KB


In [7]:
for column in labelled_data.columns:
    print(column," ",labelled_data[column].nunique())

hash   1484
size_of_data   661
virtual_address   11
entropy   1079
virtual_size   1058
malware   2


### Encoding the Categorial data 

In [8]:
le = LabelEncoder()

In [9]:
le.fit(labelled_data['hash'])
labelled_data['hash']= le.transform(labelled_data['hash'])


le.fit(unlabelled_data['hash'])
unlabelled_data['hash']= le.transform(unlabelled_data['hash'])

labelled_data.head()

Unnamed: 0,hash,size_of_data,virtual_address,entropy,virtual_size,malware
0,34,443392,4096,6.529624,442984,1
1,293,331264,4096,6.604314,330784,1
2,1076,74240,4096,6.046789,73819,1
3,647,219648,4096,6.497018,219524,1
4,1179,262144,4096,6.638142,261943,1


In [10]:
unlabelled_data.head()

Unnamed: 0,hash,size_of_data,virtual_address,entropy,virtual_size
0,25400,63488,4096,6.6365,63195
1,15547,87040,4096,6.584123,86944
2,36607,25600,4096,6.468836,25494
3,21334,733184,4096,6.665506,732799
4,32238,1196032,4096,6.655021,1195949


### Evaulating the Performance of all the Models

Including 5-fold cross validation

In [11]:
%%time

X_train, X_test, y_train, y_test = train_test_split(labelled_data.iloc[:,:-1],
                                                    labelled_data.iloc[:,-1], test_size=0.20, random_state=42)
    
for model in models:
    model.seed = 101
    num_folds = 5
    scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='neg_mean_squared_error')
    score_description = " {:04.3f} (+/- {:04.3f})".format((np.sqrt(scores.mean()*-1)), (scores.std() * 2))
    
    print('{model:25} CV-5 RMSE: {score}'.format(model=model.__class__.__name__, score=score_description))
    

NuSVC                     CV-5 RMSE:  0.603 (+/- 0.195)
LinearSVC                 CV-5 RMSE:  0.576 (+/- 0.042)
SGDClassifier             CV-5 RMSE:  0.630 (+/- 0.262)
KNeighborsClassifier      CV-5 RMSE:  0.481 (+/- 0.025)
NearestCentroid           CV-5 RMSE:  0.757 (+/- 0.221)
CPU times: user 488 ms, sys: 21.7 ms, total: 510 ms
Wall time: 509 ms


### Pseudo Labelling 

In [12]:
%%time

x_train = labelled_data.iloc[:,:-1]
y_train = labelled_data.iloc[:,-1]
x_unlabeled = unlabelled_data

for model in models:
    model.seed = 101
    num_folds = 5
    model.fit(x_train, y_train)
    y_unlabeled = model.predict(x_unlabeled)
    y_unlabeled = pd.DataFrame(y_unlabeled, columns = ['malware'])
    seudo_labeled_data = unlabelled_data.join(y_unlabeled)
    seudo_labeled_data = seudo_labeled_data.fillna(0)
    x_merged = x_train.append(seudo_labeled_data.iloc[:,:-1])
    y_merged = y_train.append(seudo_labeled_data.iloc[:,-1])
    scores = cross_val_score(model, x_merged.sample(frac=1,random_state=101), y_merged.sample(frac=1,random_state=101), cv=num_folds, scoring='neg_mean_squared_error')
    score_description = " {:04.3f} (+/- {:04.3f})".format((np.sqrt(scores.mean()*-1)), (scores.std() * 2))
    print("{model:20} CV-5 RMSE: {score}".format(model=model.__class__.__name__, score=score_description))

NuSVC                CV-5 RMSE:  0.290 (+/- 0.006)
LinearSVC            CV-5 RMSE:  0.375 (+/- 0.096)
SGDClassifier        CV-5 RMSE:  0.152 (+/- 0.023)
KNeighborsClassifier CV-5 RMSE:  0.155 (+/- 0.003)
NearestCentroid      CV-5 RMSE:  0.243 (+/- 0.004)
CPU times: user 2min 22s, sys: 2.98 s, total: 2min 25s
Wall time: 2min 23s


In [13]:
seudo_labeled_data.head()

Unnamed: 0,hash,size_of_data,virtual_address,entropy,virtual_size,malware
0,25400,63488,4096,6.6365,63195,1
1,15547,87040,4096,6.584123,86944,1
2,36607,25600,4096,6.468836,25494,1
3,21334,733184,4096,6.665506,732799,0
4,32238,1196032,4096,6.655021,1195949,0


In [14]:
seudo_labeled_data.shape

(41777, 6)

This Psedo-Labeled Dataset can be further used to train supervised models. But we must keep in mind that the new dataset has on an average 20% incorrectly labeled rows.