## Pseudo Labelling using multiple supervised ML models

In [3]:
# to handle datasets
import pandas as pd
import numpy as np

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# for plotting
import matplotlib.pyplot as plt

# machine learning imports
from sklearn.svm import LinearSVC, NuSVC 
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error

### Making list of all models

In [4]:
models = [
    NuSVC(),
    LinearSVC(),
    SGDClassifier(),
    KNeighborsClassifier(),
    NearestCentroid()
]

In [5]:
# load dataset
labelled_data = pd.read_csv('dataset_labeled.csv')
unlabelled_data = pd.read_csv('dataset_unlabeled.csv')

# rows and columns of the data
print(labelled_data.shape)
print(unlabelled_data.shape)

# visualise the dataset
labelled_data.head()

(1500, 6)
(41777, 5)


Unnamed: 0,hash,size_of_data,virtual_address,entropy,virtual_size,malware
0,071e8c3f8922e186e57548cd4c703a5d,443392,4096,6.529624,442984,1
1,33f8e6d08a6aae939f25a8e0d63dd523,331264,4096,6.604314,330784,1
2,b68abd064e975e1c6d5f25e748663076,74240,4096,6.046789,73819,1
3,72049be7bd30ea61297ea624ae198067,219648,4096,6.497018,219524,1
4,c9b3700a77facf29172f32df6bc77f48,262144,4096,6.638142,261943,1


In [6]:
labelled_data.describe()

Unnamed: 0,size_of_data,virtual_address,entropy,virtual_size,malware
count,1500.0,1500.0,1500.0,1500.0,1500.0
mean,713205.4,5543.253,6.353871,713072.0,0.333333
std,2568605.0,31173.16,0.546324,2568588.0,0.471562
min,1024.0,4096.0,0.650292,900.0,0.0
25%,28672.0,4096.0,6.262846,28432.0,0.0
50%,103168.0,4096.0,6.497885,102840.0,0.0
75%,443392.0,4096.0,6.581666,442984.0,1.0
max,52962820.0,1159168.0,7.999813,52962520.0,1.0


In [7]:
labelled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   hash             1500 non-null   object 
 1   size_of_data     1500 non-null   int64  
 2   virtual_address  1500 non-null   int64  
 3   entropy          1500 non-null   float64
 4   virtual_size     1500 non-null   int64  
 5   malware          1500 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 70.4+ KB


In [8]:
for column in labelled_data.columns:
    print(column," ",labelled_data[column].nunique())

hash   1484
size_of_data   661
virtual_address   11
entropy   1079
virtual_size   1058
malware   2


### Encoding the Categorial data 

In [9]:
le = LabelEncoder()

In [10]:
le.fit(labelled_data['hash'])
labelled_data['hash']= le.transform(labelled_data['hash'])


le.fit(unlabelled_data['hash'])
unlabelled_data['hash']= le.transform(unlabelled_data['hash'])

labelled_data.head()

Unnamed: 0,hash,size_of_data,virtual_address,entropy,virtual_size,malware
0,34,443392,4096,6.529624,442984,1
1,293,331264,4096,6.604314,330784,1
2,1076,74240,4096,6.046789,73819,1
3,647,219648,4096,6.497018,219524,1
4,1179,262144,4096,6.638142,261943,1


In [11]:
unlabelled_data.head()

Unnamed: 0,hash,size_of_data,virtual_address,entropy,virtual_size
0,25400,63488,4096,6.6365,63195
1,15547,87040,4096,6.584123,86944
2,36607,25600,4096,6.468836,25494
3,21334,733184,4096,6.665506,732799
4,32238,1196032,4096,6.655021,1195949


### Evaulating the Performance of all the Models

Including 5-fold cross validation

In [12]:
%%time

X_train, X_test, y_train, y_test = train_test_split(labelled_data.iloc[:,:-1],
                                                    labelled_data.iloc[:,-1], test_size=0.20, random_state=42)
    
for model in models:
    model.seed = 101
    num_folds = 5
    scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='neg_mean_squared_error')
    score_description = " {:04.3f} (+/- {:04.3f})".format((np.sqrt(scores.mean()*-1)), (scores.std() * 2))
    
    print('{model:25} CV-5 RMSE: {score}'.format(model=model.__class__.__name__, score=score_description))
    

NuSVC                     CV-5 RMSE:  0.603 (+/- 0.195)
LinearSVC                 CV-5 RMSE:  0.610 (+/- 0.169)
SGDClassifier             CV-5 RMSE:  0.777 (+/- 0.270)
KNeighborsClassifier      CV-5 RMSE:  0.481 (+/- 0.025)
NearestCentroid           CV-5 RMSE:  0.757 (+/- 0.221)
CPU times: user 550 ms, sys: 86.6 ms, total: 637 ms
Wall time: 1.69 s


### Pseudo Labelling 

In [13]:
%%time

x_train = labelled_data.iloc[:,:-1]
y_train = labelled_data.iloc[:,-1]
x_unlabeled = unlabelled_data

for model in models:
    model.seed = 101
    num_folds = 5
    model.fit(x_train, y_train)
    y_unlabeled = model.predict(x_unlabeled)
    y_unlabeled = pd.DataFrame(y_unlabeled, columns = ['Type'])
    seudo_labeled_data = unlabelled_data.join(y_unlabeled)
    seudo_labeled_data = seudo_labeled_data.fillna(0)
    x_merged = x_train.append(seudo_labeled_data.iloc[:,:-1])
    y_merged = y_train.append(seudo_labeled_data.iloc[:,-1])
    scores = cross_val_score(model, x_merged.sample(frac=1,random_state=101), y_merged.sample(frac=1,random_state=101), cv=num_folds, scoring='neg_mean_squared_error')
    score_description = " {:04.3f} (+/- {:04.3f})".format((np.sqrt(scores.mean()*-1)), (scores.std() * 2))
    print("{model:25} CV-5 RMSE: {score}".format(model=model.__class__.__name__, score=score_description))

NuSVC                     CV-5 RMSE:  0.290 (+/- 0.006)
LinearSVC                 CV-5 RMSE:  0.334 (+/- 0.153)
SGDClassifier             CV-5 RMSE:  0.131 (+/- 0.005)
KNeighborsClassifier      CV-5 RMSE:  0.155 (+/- 0.003)
NearestCentroid           CV-5 RMSE:  0.243 (+/- 0.004)
CPU times: user 2min 15s, sys: 2.33 s, total: 2min 17s
Wall time: 2min 15s
