In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


# Load the raw dataset. The path is relative, so BHD1.csv must be in the
# kernel's working directory.
df = pd.read_csv('BHD1.csv')

# Print a caption followed by the first five rows as a quick check of the parse.
print("Initial DataFrame:", df.head(), sep="\n")



Initial DataFrame:
                              address  year  day  length    weight  count  \
0   111K8kZAEnJg245r2cM6y9zgJGHZtJPy6  2017   11      18  0.008333    1.0   
1  1123pJv8jzeFQaCV4w644pzQJzVWay2zcA  2016  132      44  0.000244    1.0   
2  112536im7hy6wtKbpH1qYDWtTyMRAcA2p7  2016  246       0  1.000000    1.0   
3  1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7  2016  322      72  0.003906    1.0   
4  1129TSjKtx65E35GiUo4AYVeyo48twbrGX  2016  238     144  0.072848  456.0   

   looped  neighbors       income            label  
0     0.0        2.0  100050000.0  princetonCerber  
1     0.0        1.0  100000000.0   princetonLocky  
2     0.0        2.0  200000000.0  princetonCerber  
3     0.0        2.0   71200000.0  princetonCerber  
4     0.0        1.0  200000000.0   princetonLocky  


In [2]:

# Drop every row that contains a missing value, then add a 0/1 flag for rows
# whose weight exceeds 1. Rebinding `df` to a new frame (rather than using
# inplace=True) leaves the frame produced by the loading cell unmodified, and
# `assign` adds the column on a fresh copy so no SettingWithCopyWarning fires.
df = df.dropna().assign(
    # Strictly greater than 1: a weight of exactly 1.0 is flagged as 0.
    # The vectorised comparison replaces a per-row Python lambda.
    weight_binary=lambda frame: frame['weight'].gt(1).astype('int64'),
)
print("DataFrame after adding weight_binary column:")
print(df.head())

DataFrame after adding weight_binary column:
                              address  year  day  length    weight  count  \
0   111K8kZAEnJg245r2cM6y9zgJGHZtJPy6  2017   11      18  0.008333    1.0   
1  1123pJv8jzeFQaCV4w644pzQJzVWay2zcA  2016  132      44  0.000244    1.0   
2  112536im7hy6wtKbpH1qYDWtTyMRAcA2p7  2016  246       0  1.000000    1.0   
3  1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7  2016  322      72  0.003906    1.0   
4  1129TSjKtx65E35GiUo4AYVeyo48twbrGX  2016  238     144  0.072848  456.0   

   looped  neighbors       income            label  weight_binary  
0     0.0        2.0  100050000.0  princetonCerber              0  
1     0.0        1.0  100000000.0   princetonLocky              0  
2     0.0        2.0  200000000.0  princetonCerber              0  
3     0.0        2.0   71200000.0  princetonCerber              0  
4     0.0        1.0  200000000.0   princetonLocky              0  


In [3]:

# Normalise the class names: trim surrounding whitespace and lower-case them,
# so labels differing only in case or padding map to the same class.
df['label'] = df['label'].str.strip().str.lower()

# Fit the encoder on the cleaned labels, then translate each row's label into
# its integer code.
label_encoder = LabelEncoder().fit(df['label'])
df['label_encoded'] = label_encoder.transform(df['label'])

print("DataFrame after encoding labels:", df.head(), sep="\n")


DataFrame after encoding labels:
                              address  year  day  length    weight  count  \
0   111K8kZAEnJg245r2cM6y9zgJGHZtJPy6  2017   11      18  0.008333    1.0   
1  1123pJv8jzeFQaCV4w644pzQJzVWay2zcA  2016  132      44  0.000244    1.0   
2  112536im7hy6wtKbpH1qYDWtTyMRAcA2p7  2016  246       0  1.000000    1.0   
3  1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7  2016  322      72  0.003906    1.0   
4  1129TSjKtx65E35GiUo4AYVeyo48twbrGX  2016  238     144  0.072848  456.0   

   looped  neighbors       income            label  weight_binary  \
0     0.0        2.0  100050000.0  princetoncerber              0   
1     0.0        1.0  100000000.0   princetonlocky              0   
2     0.0        2.0  200000000.0  princetoncerber              0   
3     0.0        2.0   71200000.0  princetoncerber              0   
4     0.0        1.0  200000000.0   princetonlocky              0   

   label_encoded  
0              8  
1              9  
2              8  
3              8  
4              9  

In [4]:

# Record which integer code the encoder assigned to each class name.
# A class's code is its position in `label_encoder.classes_`, so enumerating
# the array gives the same mapping as transforming it, without the extra
# round trip. The values are plain Python ints rather than NumPy scalars, so
# the printed dict looks the same on every NumPy version and can be
# serialised (e.g. to JSON) directly.
label_mapping = {name: code for code, name in enumerate(label_encoder.classes_)}
print("Label mapping:")
print(label_mapping)

# Persist the processed frame, including the derived columns. index=False
# keeps the row index out of the CSV.
df.to_csv('processed_BHD1.csv', index=False)

Label mapping:
{'montrealcryptolocker': 0, 'montrealcryptotorlocker2015': 1, 'montrealcryptxxx': 2, 'montrealdmalockerv3': 3, 'montrealflyper': 4, 'montrealsamsam': 5, 'montrealwannacry': 6, 'paduacryptowall': 7, 'princetoncerber': 8, 'princetonlocky': 9}
