[Link a la carpeta con todo el TP](https://drive.google.com/drive/u/0/folders/1do-iyf2SzQln-fh8tmu9mSfzLuim5xs5)

# Imports & Download stage

In [1]:
!pip install category_encoders



In [2]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder
from category_encoders import BinaryEncoder
import pandas as pd
import numpy as np
import math
%matplotlib inline
import pyarrow.parquet as pq
import os
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from scipy.stats import uniform, randint
from sklearn.metrics import f1_score

pd.options.display.float_format = '{:20,.10f}'.format

import gc

In [3]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [4]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [5]:
if not os.path.exists('test.parq'):
  id1="1puQpPyRVtoX_MTPJITQEtQny8a714zXk"
  downloaded1 = drive.CreateFile({'id': id1})
  downloaded1.GetContentFile('test.parq')

if not os.path.exists('train.parq'):
  id2="1Tf5pfrYBk8yM6QzC5IWebtSlja3R5BwN"
  downloaded2 = drive.CreateFile({'id': id2})
  downloaded2.GetContentFile('train.parq')

load dataframe

In [6]:
train_df = pq.read_table('train.parq').to_pandas()
test_df = pq.read_table('test.parq').to_pandas()

# Feature engineering stage

### Feature engineering, 1,2/5: split attack type into service and class

In [7]:
train_df['attack_category'] = train_df['attack_type'].apply(lambda x: x.split(':')[1]).astype('category')
train_df['attack_service'] = train_df['attack_type'].apply(lambda x: x.split(':')[0]).astype('category')

test_df['attack_category'] = test_df['attack_type'].apply(lambda x: x.split(':')[1]).astype('category')
test_df['attack_service'] = test_df['attack_type'].apply(lambda x: x.split(':')[0]).astype('category')

### Feature Engineering 3,4/5: encode time: hour as categorical and day sine-encoded

In [8]:
# Function to categorize the hour of day
def categorize_time(hour):
    if 7 <= hour < 12:
        return 'morning'
    elif 13 <= hour < 19:
        return 'afternoon'
    else:
        return 'night'

# Feature engineering for training set
train_df['attack_time_of_day'] = train_df['attack_time'].dt.hour.apply(categorize_time).astype("category")
test_df['attack_time_of_day'] = test_df['attack_time'].dt.hour.apply(categorize_time).astype("category")

# sine-encode day
train_df['attack_day_sin'] = train_df['attack_time'].dt.dayofweek.apply(lambda x: math.sin((2*x*np.pi)/7)).astype('float32')
train_df['attack_day_cos'] = train_df['attack_time'].dt.dayofweek.apply(lambda x: math.cos((2*x*np.pi)/7)).astype('float32')

test_df['attack_day_sin'] = test_df['attack_time'].dt.dayofweek.apply(lambda x: math.sin((2*x*np.pi)/7)).astype('float32')
test_df['attack_day_cos'] = test_df['attack_time'].dt.dayofweek.apply(lambda x: math.cos((2*x*np.pi)/7)).astype('float32')

# Deleting unused data to free up memory
train_df.drop('attack_time', axis=1, inplace=True)
test_df.drop('attack_time', axis=1, inplace=True)

gc.collect()

48

### Feature engineering 5/5: attacker/watcher continent

In [9]:
continents = {
    'US': 'NA', 'DE': 'EU', 'SG': 'AS', 'CN': 'AS', 'NL': 'EU', 'GI': 'EU',
    'FR': 'EU', 'IN': 'AS', 'GB': 'EU', 'RU': 'EU', 'CA': 'NA', 'GM': 'AF',
    'VN': 'AS', 'BR': 'SA', 'ID': 'AS', 'HK': 'AS', 'IT': 'EU', 'GN': 'AF',
    'AU': 'OC', 'JP': 'AS', 'KR': 'AS', 'UA': 'EU', 'SE': 'EU', 'GP': 'CA',
    'FI': 'EU', 'BG': 'EU', 'IR': 'AS', 'TR': 'AS', 'BD': 'AS', 'PH': 'AS',
    'ES': 'EU', 'SK': 'EU', 'TH': 'AS', 'PL': 'EU', 'AR': 'SA', 'GQ': 'AF',
    'LU': 'EU', 'CH': 'EU', 'PK': 'AS', 'MX': 'NA', 'RO': 'EU', 'GU': 'CA',
    'ZA': 'AF', 'BE': 'EU', 'NO': 'EU', 'RS': 'EU', 'TW': 'AS', 'GY': 'SA',
    'AT': 'EU', 'AE': 'AS', 'LV': 'EU', 'MY': 'AS', 'PT': 'EU', 'KM': 'AF',
    'CZ': 'EU', 'CO': 'SA', 'AZ': 'AS', 'DK': 'EU', 'LT': 'EU', 'LC': 'CA',
    'IL': 'AS', 'KH': 'AS', 'IE': 'EU', 'GR': 'EU', 'VE': 'SA', 'ML': 'AF',
    'PE': 'SA', 'SA': 'AS', 'KE': 'AF', 'CL': 'SA', 'BF': 'AF', 'MW': 'AF',
    'MA': 'AF', 'MD': 'EU', 'CI': 'AF', 'MN': 'AS', 'HR': 'EU', 'NA': 'AF',
    'KZ': 'AS', 'EG': 'AF', 'ET': 'AF', 'MC': 'EU', 'MU': 'AF', 'RE': 'AF',
    'TN': 'AF', 'DO': 'NA', 'NP': 'AS', 'PA': 'CA', 'BM': 'NA', 'SD': 'AF',
    'GH': 'AF', 'HU': 'EU', 'EE': 'EU', 'BY': 'EU', 'BA': 'EU', 'SX': 'CA',
    'DZ': 'AF', 'MK': 'EU', 'GE': 'AS', 'EC': 'SA', 'IQ': 'AS', 'TC': 'CA',
    'LK': 'AS', 'NG': 'AF', 'AM': 'EU', 'AL': 'EU', 'BW': 'AF', 'YE': 'AS',
    'LB': 'AS', 'AO': 'AF', 'NZ': 'OC', 'JO': 'AS', 'BO': 'SA', 'NC': 'OC',
    'TZ': 'AF', 'PY': 'SA', 'GT': 'CA', 'HN': 'CA', 'BN': 'AS', 'SM': 'EU',
    'MZ': 'AF', 'UZ': 'AS', 'QA': 'AS', 'CY': 'EU', 'LA': 'AS', 'SI': 'EU',
    'OM': 'AS', 'KW': 'AS', 'UG': 'AF', 'ME': 'EU', 'SV': 'CA', 'AS': 'NA',
    'JM': 'NA', 'BH': 'AS', 'UY': 'SA', 'BB': 'NA', 'BQ': 'SA', 'CC': 'OC',
    'RW': 'AF', 'PF': 'OC', 'CR': 'CA', 'MV': 'AS', 'PS': 'AS', 'CK': 'OC',
    'LY': 'AF', 'SN': 'AF', 'ZW': 'AF', 'TT': 'NA', 'XK': 'EU', 'FK': 'SA',
    'IS': 'EU', 'MT': 'EU', 'MM': 'AS', 'KG': 'AS', 'BI': 'AF', 'LI': 'EU',
    'WS': 'OC', 'BZ': 'CA', 'PR': 'NA', 'TG': 'AF', 'BS': 'NA', 'TF': 'SA',
    'NI': 'CA', 'SC': 'AF', 'ZM': 'AF', 'CM': 'AF', 'MR': 'AF',
    'VG': 'NA', 'CW': 'NA', 'SO': 'AF', 'MG': 'AF', 'DM': 'NA',
    'AF': 'AS', 'SY': 'AS', 'CD': 'AF', 'CV': 'AF', 'SS': 'AF', 'AG': 'NA',
    'SR': 'SA', 'TJ': 'AS', 'HT': 'NA', 'BJ': 'AF', 'AD': 'EU',
    'DJ': 'AF', 'IM': 'EU', 'MO': 'AS', 'GG': 'EU', 'GF': 'SA',
    'BT': 'AS', 'ER': 'AF', 'NE': 'AF', 'FO': 'EU', 'GD': 'NA',
    'SB': 'OC', 'SZ': 'AF', 'KN': 'NA', 'KY': 'NA', 'FM': 'OC',
    'LS': 'AF', 'VC': 'NA', 'CX': 'OC', 'GW': 'AF', 'PG': 'OC',
    'PM': 'NA', 'GL': 'NA', 'SH': 'AF', 'ST': 'AF', 'GA': 'AF',
    'VI': 'NA', 'LR': 'AF', 'TL': 'AS', 'CG': 'AF', 'MP': 'OC',
    'KP': 'AS', 'CU': 'CA', 'AI': 'NA', 'SL': 'AF', 'AW': 'NA',
    'FJ': 'OC', 'MQ': 'NA', 'JE': 'EU', 'GG': 'EU'
}

In [10]:
test_df['attacker_continent'] = test_df['attacker_country'].apply(lambda x: continents[x]).astype('category')
test_df['watcher_continent'] = test_df['watcher_country'].apply(lambda x: continents[x]).astype('category')

train_df['attacker_continent'] = train_df['attacker_country'].apply(lambda x: continents[x]).astype('category')
train_df['watcher_continent'] = train_df['watcher_country'].apply(lambda x: continents[x]).astype('category')

del continents
gc.collect()

0

### Ctrl+s

In [11]:
train_df.to_parquet("train_fe.pq")
test_df.to_parquet("test_fe.pq")

# Init stage

Load in case I don't have them

In [7]:
import pyarrow.parquet as pq
train_df = pq.read_table('train_fe.pq').to_pandas()
test_df = pq.read_table('test_fe.pq').to_pandas()

Train/test split based on IPs

In [8]:
# Create a DataFrame with unique IPs and their corresponding labels
raw_df = train_df
ip_label_df = raw_df[['attacker_ip_enum', 'label']].drop_duplicates()

# Split the unique IPs into two sets with roughly 80%-20% proportions, stratified by label
train_ips, test_ips = train_test_split(ip_label_df['attacker_ip_enum'],
                                       test_size=0.2,
                                       random_state=42,
                                       stratify=ip_label_df['label'])

# Use these sets to filter the original DataFrame
train_df = raw_df[raw_df['attacker_ip_enum'].isin(train_ips)]
val_df = raw_df[raw_df['attacker_ip_enum'].isin(test_ips)]

del raw_df
del ip_label_df
gc.collect()

4

# Encoding stage

Para el encoding traté de mantener la mayor cantidad de información posible de los dataframes. Por eso sólo usé mean cuando había demasiadas categorías.

Además, las categorías mean-encodeadas en el preceptron eran muy importantes, por lo que me pareció que beneficiaría al modelo tenerlas así

Por como me quedó armado el dataset, no tuve que tener ninguna consideración especial por usar un modelo de árboles ya que las únicas columnas que no son un mean/OHE/binary son las del día que están encodeadas con sinoidales

### Drop IP to avoid overfitting/target leak

In [9]:
train_df.drop(['attacker_ip_enum'], axis=1, inplace=True)
val_df.drop(['attacker_ip_enum'], axis=1, inplace=True)

test_ips = test_df['attacker_ip_enum'] # I'll need these for kaggle submission
test_df.drop(['attacker_ip_enum'], axis=1, inplace=True)

### Countries (binary)

In [None]:
# Initialize a BinaryEncoder for the 'watcher_country' column
binary_encoder = BinaryEncoder(cols=['watcher_country'])

# Fit and transform on the training data
train_df_encoded = binary_encoder.fit_transform(train_df[['watcher_country']])

# Transform the test data
test_df_encoded = binary_encoder.transform(test_df[['watcher_country']])
val_df_encoded = binary_encoder.transform(val_df[['watcher_country']])

# Merge the encoded columns back into the original DataFrames
train_df = pd.concat([train_df, train_df_encoded], axis=1)
test_df = pd.concat([test_df, test_df_encoded], axis=1)
val_df = pd.concat([val_df, val_df_encoded], axis=1)

# Clean up unnecessary variables
del train_df_encoded, test_df_encoded, val_df_encoded
gc.collect()

# Drop the original 'watcher_country' column
train_df.drop('watcher_country', axis=1, inplace=True)
test_df.drop('watcher_country', axis=1, inplace=True)
val_df.drop('watcher_country', axis=1, inplace=True)

# Cast to uint8 for efficiency
train_df['watcher_country_0'] = train_df['watcher_country_0'].astype('uint8')
test_df['watcher_country_0'] = test_df['watcher_country_0'].astype('uint8')
val_df['watcher_country_0'] = val_df['watcher_country_0'].astype('uint8')

train_df['watcher_country_1'] = train_df['watcher_country_1'].astype('uint8')
test_df['watcher_country_1'] = test_df['watcher_country_1'].astype('uint8')
val_df['watcher_country_1'] = val_df['watcher_country_1'].astype('uint8')

train_df['watcher_country_2'] = train_df['watcher_country_2'].astype('uint8')
test_df['watcher_country_2'] = test_df['watcher_country_2'].astype('uint8')
val_df['watcher_country_2'] = val_df['watcher_country_2'].astype('uint8')

train_df['watcher_country_3'] = train_df['watcher_country_3'].astype('uint8')
test_df['watcher_country_3'] = test_df['watcher_country_3'].astype('uint8')
val_df['watcher_country_3'] = val_df['watcher_country_3'].astype('uint8')

train_df['watcher_country_4'] = train_df['watcher_country_4'].astype('uint8')
test_df['watcher_country_4'] = test_df['watcher_country_4'].astype('uint8')
val_df['watcher_country_4'] = val_df['watcher_country_4'].astype('uint8')

In [None]:
# Initialize a BinaryEncoder for the 'attacker_country' column
binary_encoder = BinaryEncoder(cols=['attacker_country'])

# Fit and transform on the training data
train_df_encoded = binary_encoder.fit_transform(train_df[['attacker_country']])

# Transform the test data
test_df_encoded = binary_encoder.transform(test_df[['attacker_country']])
val_df_encoded = binary_encoder.transform(val_df[['attacker_country']])

# Merge the encoded columns back into the original DataFrames
train_df = pd.concat([train_df, train_df_encoded], axis=1)
test_df = pd.concat([test_df, test_df_encoded], axis=1)
val_df = pd.concat([val_df, val_df_encoded], axis=1)

# Clean up unnecessary variables
del train_df_encoded, test_df_encoded, val_df_encoded
gc.collect()

# Drop the original 'attacker_country' column
train_df.drop('attacker_country', axis=1, inplace=True)
test_df.drop('attacker_country', axis=1, inplace=True)
val_df.drop('attacker_country', axis=1, inplace=True)

# Cast to uint8 for efficiency
train_df['attacker_country_0'] = train_df['attacker_country_0'].astype('uint8')
test_df['attacker_country_0'] = test_df['attacker_country_0'].astype('uint8')
val_df['attacker_country_0'] = val_df['attacker_country_0'].astype('uint8')

train_df['attacker_country_1'] = train_df['attacker_country_1'].astype('uint8')
test_df['attacker_country_1'] = test_df['attacker_country_1'].astype('uint8')
val_df['attacker_country_1'] = val_df['attacker_country_1'].astype('uint8')

train_df['attacker_country_2'] = train_df['attacker_country_2'].astype('uint8')
test_df['attacker_country_2'] = test_df['attacker_country_2'].astype('uint8')
val_df['attacker_country_2'] = val_df['attacker_country_2'].astype('uint8')

train_df['attacker_country_3'] = train_df['attacker_country_3'].astype('uint8')
test_df['attacker_country_3'] = test_df['attacker_country_3'].astype('uint8')
val_df['attacker_country_3'] = val_df['attacker_country_3'].astype('uint8')

train_df['attacker_country_4'] = train_df['attacker_country_4'].astype('uint8')
test_df['attacker_country_4'] = test_df['attacker_country_4'].astype('uint8')
val_df['attacker_country_4'] = val_df['attacker_country_4'].astype('uint8')

### AS Names (Mean/Target)

In [10]:
target_encoder = TargetEncoder(cols=['watcher_as_name'])

train_df_encoded = target_encoder.fit_transform(train_df[['watcher_as_name']], train_df['label'])
test_df_encoded = target_encoder.transform(test_df[['watcher_as_name']])
val_df_encoded = target_encoder.transform(val_df[['watcher_as_name']])

train_df[['watcher_as_name']] = train_df_encoded.astype('float32')
test_df[['watcher_as_name']] = test_df_encoded.astype('float32')
val_df[['watcher_as_name']] = val_df_encoded.astype('float32')

del train_df_encoded, test_df_encoded, val_df_encoded, target_encoder
gc.collect()

0

In [11]:
target_encoder = TargetEncoder(cols=['attacker_as_name'])

train_df_encoded = target_encoder.fit_transform(train_df[['attacker_as_name']], train_df['label'])
test_df_encoded = target_encoder.transform(test_df[['attacker_as_name']])
val_df_encoded = target_encoder.transform(val_df[['attacker_as_name']])

train_df[['attacker_as_name']] = train_df_encoded.astype('float32')
test_df[['attacker_as_name']] = test_df_encoded.astype('float32')
val_df[['attacker_as_name']] = val_df_encoded.astype('float32')

del train_df_encoded, test_df_encoded, val_df_encoded, target_encoder
gc.collect()

0

### Watcher UUID Enums (Mean/Target)

In [12]:
target_encoder = TargetEncoder(cols=['watcher_uuid_enum'])

train_df_encoded = target_encoder.fit_transform(train_df[['watcher_uuid_enum']], train_df['label'])
test_df_encoded = target_encoder.transform(test_df[['watcher_uuid_enum']])
val_df_encoded = target_encoder.transform(val_df[['watcher_uuid_enum']])

train_df[['watcher_uuid_enum']] = train_df_encoded.astype('float32')
test_df[['watcher_uuid_enum']] = test_df_encoded.astype('float32')
val_df[['watcher_uuid_enum']] = val_df_encoded.astype('float32')

del train_df_encoded, test_df_encoded, val_df_encoded, target_encoder
gc.collect()

0

### Attack type (OHE/Bin)

In [None]:
#Not sure if I want to use this yet

train_df.drop(['attack_type'], axis=1, inplace=True)
test_df.drop(['attack_type'], axis=1, inplace=True)
val_df.drop(['attack_type'], axis=1, inplace=True)

In [None]:
# Initialize a BinaryEncoder for the 'attack_service' column
binary_encoder = BinaryEncoder(cols=['attack_service'])

# Fit and transform on the training data
train_df_encoded = binary_encoder.fit_transform(train_df[['attack_service']])

# Transform the test data
test_df_encoded = binary_encoder.transform(test_df[['attack_service']])
val_df_encoded = binary_encoder.transform(val_df[['attack_service']])

# Merge the encoded columns back into the original DataFrames
train_df = pd.concat([train_df, train_df_encoded], axis=1)
test_df = pd.concat([test_df, test_df_encoded], axis=1)
val_df = pd.concat([val_df, val_df_encoded], axis=1)

# Clean up unnecessary variables
del train_df_encoded, test_df_encoded, val_df_encoded
gc.collect()

# Drop the original 'attack_service' column
train_df.drop('attack_service', axis=1, inplace=True)
test_df.drop('attack_service', axis=1, inplace=True)
val_df.drop('attack_service', axis=1, inplace=True)

# Cast to uint8 for efficiency
train_df['attack_service_0'] = train_df['attack_service_0'].astype('uint8')
test_df['attack_service_0'] = test_df['attack_service_0'].astype('uint8')
val_df['attack_service_0'] = val_df['attack_service_0'].astype('uint8')

train_df['attack_service_1'] = train_df['attack_service_1'].astype('uint8')
test_df['attack_service_1'] = test_df['attack_service_1'].astype('uint8')
val_df['attack_service_1'] = val_df['attack_service_1'].astype('uint8')

train_df['attack_service_2'] = train_df['attack_service_2'].astype('uint8')
test_df['attack_service_2'] = test_df['attack_service_2'].astype('uint8')
val_df['attack_service_2'] = val_df['attack_service_2'].astype('uint8')

train_df['attack_service_3'] = train_df['attack_service_3'].astype('uint8')
test_df['attack_service_3'] = test_df['attack_service_3'].astype('uint8')
val_df['attack_service_3'] = val_df['attack_service_3'].astype('uint8')

In [None]:
# Create a OneHotEncoder instance
encoder = OneHotEncoder(sparse=True, drop='first')  # Use sparse=True

# Fit and transform the 'attack_category' column in both train and test DataFrames
encoded_attack_category_train = encoder.fit_transform(train_df[['attack_category']]
                                                      )
# Convert the sparse matrices to dense arrays of integers
encoded_attack_category_train = encoded_attack_category_train.todense().astype('int8')

# Create DataFrames for the one-hot encoded 'attack_category' column
encoded_attack_category_train_df = pd.DataFrame(encoded_attack_category_train, columns=encoder.get_feature_names_out(['attack_category']))

# Concatenate the one-hot encoded 'attack_category' columns to the respective DataFrames
train_df = train_df.reset_index().drop(['index', 'attack_category'],axis=1).join(encoded_attack_category_train_df)

# Run GC to save memory
del encoded_attack_category_train, encoded_attack_category_train_df
gc.collect()

# Do it again for other 2 dataframes

encoded_attack_category_test = encoder.transform(test_df[['attack_category']])
encoded_attack_category_test = encoded_attack_category_test.todense().astype('int8')
encoded_attack_category_test_df = pd.DataFrame(encoded_attack_category_test, columns=encoder.get_feature_names_out(['attack_category']))
test_df = test_df.reset_index().drop(['index', 'attack_category'],axis=1).join(encoded_attack_category_test_df)
del encoded_attack_category_test, encoded_attack_category_test_df
gc.collect()

encoded_attack_category_val = encoder.transform(val_df[['attack_category']])
encoded_attack_category_val = encoded_attack_category_val.todense().astype('int8')
encoded_attack_category_val_df = pd.DataFrame(encoded_attack_category_val, columns=encoder.get_feature_names_out(['attack_category']))
val_df = val_df.reset_index().drop(['index', 'attack_category'],axis=1).join(encoded_attack_category_val_df)
del encoded_attack_category_val, encoded_attack_category_val_df
gc.collect()

### Attack time of day (OHE)

In [None]:
# Create a OneHotEncoder instance
encoder = OneHotEncoder(sparse=True, drop='first')  # Use sparse=True

# Fit and transform the 'attack_time_of_day' column in both train and test DataFrames
encoded_attack_time_of_day_train = encoder.fit_transform(train_df[['attack_time_of_day']]
                                                      )
# Convert the sparse matrices to dense arrays of integers
encoded_attack_time_of_day_train = encoded_attack_time_of_day_train.todense().astype('int8')

# Create DataFrames for the one-hot encoded 'attack_time_of_day' column
encoded_attack_time_of_day_train_df = pd.DataFrame(encoded_attack_time_of_day_train, columns=encoder.get_feature_names_out(['attack_time_of_day']))

# Concatenate the one-hot encoded 'attack_time_of_day' columns to the respective DataFrames
train_df = train_df.reset_index().drop(['index', 'attack_time_of_day'],axis=1).join(encoded_attack_time_of_day_train_df)

# Run GC to save memory
del encoded_attack_time_of_day_train, encoded_attack_time_of_day_train_df
gc.collect()

# Do it again for other 2 dataframes

encoded_attack_time_of_day_test = encoder.transform(test_df[['attack_time_of_day']])
encoded_attack_time_of_day_test = encoded_attack_time_of_day_test.todense().astype('int8')
encoded_attack_time_of_day_test_df = pd.DataFrame(encoded_attack_time_of_day_test, columns=encoder.get_feature_names_out(['attack_time_of_day']))
test_df = test_df.reset_index().drop(['index', 'attack_time_of_day'],axis=1).join(encoded_attack_time_of_day_test_df)
del encoded_attack_time_of_day_test, encoded_attack_time_of_day_test_df
gc.collect()

encoded_attack_time_of_day_val = encoder.transform(val_df[['attack_time_of_day']])
encoded_attack_time_of_day_val = encoded_attack_time_of_day_val.todense().astype('int8')
encoded_attack_time_of_day_val_df = pd.DataFrame(encoded_attack_time_of_day_val, columns=encoder.get_feature_names_out(['attack_time_of_day']))
val_df = val_df.reset_index().drop(['index', 'attack_time_of_day'],axis=1).join(encoded_attack_time_of_day_val_df)
del encoded_attack_time_of_day_val, encoded_attack_time_of_day_val_df
gc.collect()

### Continents (OHE)

In [None]:
# Create a OneHotEncoder instance
encoder = OneHotEncoder(sparse=True, drop='first')  # Use sparse=True

# Fit and transform the 'attacker_continent' column in both train and test DataFrames
encoded_attacker_continent_train = encoder.fit_transform(train_df[['attacker_continent']]
                                                      )
# Convert the sparse matrices to dense arrays of integers
encoded_attacker_continent_train = encoded_attacker_continent_train.todense().astype('int8')

# Create DataFrames for the one-hot encoded 'attacker_continent' column
encoded_attacker_continent_train_df = pd.DataFrame(encoded_attacker_continent_train, columns=encoder.get_feature_names_out(['attacker_continent']))

# Concatenate the one-hot encoded 'attacker_continent' columns to the respective DataFrames
train_df = train_df.reset_index().drop(['index', 'attacker_continent'],axis=1).join(encoded_attacker_continent_train_df)

# Run GC to save memory
del encoded_attacker_continent_train, encoded_attacker_continent_train_df
gc.collect()

# Do it again for other 2 dataframes

encoded_attacker_continent_test = encoder.transform(test_df[['attacker_continent']])
encoded_attacker_continent_test = encoded_attacker_continent_test.todense().astype('int8')
encoded_attacker_continent_test_df = pd.DataFrame(encoded_attacker_continent_test, columns=encoder.get_feature_names_out(['attacker_continent']))
test_df = test_df.reset_index().drop(['index', 'attacker_continent'],axis=1).join(encoded_attacker_continent_test_df)
del encoded_attacker_continent_test, encoded_attacker_continent_test_df
gc.collect()

encoded_attacker_continent_val = encoder.transform(val_df[['attacker_continent']])
encoded_attacker_continent_val = encoded_attacker_continent_val.todense().astype('int8')
encoded_attacker_continent_val_df = pd.DataFrame(encoded_attacker_continent_val, columns=encoder.get_feature_names_out(['attacker_continent']))
val_df = val_df.reset_index().drop(['index', 'attacker_continent'],axis=1).join(encoded_attacker_continent_val_df)
del encoded_attacker_continent_val, encoded_attacker_continent_val_df
gc.collect()

In [None]:
# Create a OneHotEncoder instance
encoder = OneHotEncoder(sparse=True, drop='first')  # Use sparse=True

# Fit and transform the 'watcher_continent' column in both train and test DataFrames
encoded_watcher_continent_train = encoder.fit_transform(train_df[['watcher_continent']]
                                                      )
# Convert the sparse matrices to dense arrays of integers
encoded_watcher_continent_train = encoded_watcher_continent_train.todense().astype('int8')

# Create DataFrames for the one-hot encoded 'watcher_continent' column
encoded_watcher_continent_train_df = pd.DataFrame(encoded_watcher_continent_train, columns=encoder.get_feature_names_out(['watcher_continent']))

# Concatenate the one-hot encoded 'watcher_continent' columns to the respective DataFrames
train_df = train_df.reset_index().drop(['index', 'watcher_continent'],axis=1).join(encoded_watcher_continent_train_df)

# Run GC to save memory
del encoded_watcher_continent_train, encoded_watcher_continent_train_df
gc.collect()

# Do it again for other 2 dataframes

encoded_watcher_continent_test = encoder.transform(test_df[['watcher_continent']])
encoded_watcher_continent_test = encoded_watcher_continent_test.todense().astype('int8')
encoded_watcher_continent_test_df = pd.DataFrame(encoded_watcher_continent_test, columns=encoder.get_feature_names_out(['watcher_continent']))
test_df = test_df.reset_index().drop(['index', 'watcher_continent'],axis=1).join(encoded_watcher_continent_test_df)
del encoded_watcher_continent_test, encoded_watcher_continent_test_df
gc.collect()

encoded_watcher_continent_val = encoder.transform(val_df[['watcher_continent']])
encoded_watcher_continent_val = encoded_watcher_continent_val.todense().astype('int8')
encoded_watcher_continent_val_df = pd.DataFrame(encoded_watcher_continent_val, columns=encoder.get_feature_names_out(['watcher_continent']))
val_df = val_df.reset_index().drop(['index', 'watcher_continent'],axis=1).join(encoded_watcher_continent_val_df)
del encoded_watcher_continent_val, encoded_watcher_continent_val_df
gc.collect()

### Checkpoint!

In [None]:
train_df.sample(5)

# Model Stage

In [None]:
# I need all memory I can get
test_df.to_parquet('test_processed.parq')
del test_df
gc.collect()

### Hiperparam search

Metodología de la búsqueda: me guardé el score para los defaults (lo único que cambié para la 'base' fue el `n_estimators`=10), y luego puse rangos que me parecieron aceptables alrededor de esos parámetros.

Luego lo dejé corriendo y que me devolviera todos las combinaciones que superaran esa base, y repetir eso para varias combinaciones de los parámetros más 'categóricos'.

El problema de hacer este tipo de cosas con random forest es que tarda *mucho más* que SGD en entrenar (~15 minutos por fit vs. ~30 segundos), por lo que no podía probar muchas combinaciones. Este problema se ve claro en los rangos de `n_estimators`: números más altos podrían darme mejores resultados, pero el tiempo que tardaría en fitear crecería de más (con el default de 100 tarda más de una hora)

In [None]:
train0_df = train_df[train_df.label == 0]
train1_df = train_df[train_df.label == 1]

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Define hyperparameters
n_batches = 6 # ~?M batch size
n_max_iter = 10000
cut_score = 0.48

# Split the data into features (X) and target variable (y)
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']

X_batch_size = int(np.floor(X_train.shape[0]/n_batches))
y_batch_size = int(np.floor(y_train.shape[0]/n_batches))

X_test = val_df.drop('label', axis=1)
y_test = val_df['label']

penalty_opt = ['l1', 'l2', 'elasticnet']
for n_iter in range(n_max_iter):

  params = {
        'C': random.uniform(0.3, 1.0),
        'max_iter': random.randint(1000, 10000),
        'tol': (random.uniform(10e-10, 10))*(10**(-1*random.randint(2,5))),
        'n_iter_no_change': random.randint(5, 15),
        'shuffle': False, # Va MUCHO mejor que true
        'loss': 'squared_hinge' # Va mejor que hinge
        }

  # Instantiate the model
  model = PassiveAggressiveClassifier(
                C=params['C'],
                max_iter=params['max_iter'],
                tol=params['tol'],
                n_iter_no_change=params['n_iter_no_change'],
                shuffle=params['shuffle'],
                loss=params['loss'],
                random_state=42
                )

  # Train the model in batches, keeping label proportion
  for i in range(0, n_batches):
      X_batch = X_train.iloc[i*X_batch_size: (i+1)*X_batch_size]
      gc.collect()

      y_batch = y_train.iloc[i*y_batch_size:i*y_batch_size + y_batch_size]

      model.partial_fit(X_batch, y_batch, classes=[0,1])

  # Make predictions on the test set
  y_pred = model.predict(X_test)

  # Evaluate the model
  f1_accuracy = f1_score(y_test, y_pred)

  if f1_accuracy > cut_score:
    max_score = f1_accuracy
    print(f'F1 Accuracy: {f1_accuracy}')
    print(f'Best Params: {params}')


F1 Accuracy: 0.4968466743265671
Best Params: {'C': 0.7224302804776632, 'max_iter': 7997, 'tol': 0.0007947787572787601, 'n_iter_no_change': 7, 'shuffle': False, 'loss': 'squared_hinge'}
F1 Accuracy: 0.5003645674364975
Best Params: {'C': 0.5556407531322847, 'max_iter': 1087, 'tol': 0.002119759349945546, 'n_iter_no_change': 7, 'shuffle': False, 'loss': 'squared_hinge'}
F1 Accuracy: 0.49880063343536796
Best Params: {'C': 0.6373848772119706, 'max_iter': 3878, 'tol': 0.004772078636473666, 'n_iter_no_change': 6, 'shuffle': False, 'loss': 'squared_hinge'}
F1 Accuracy: 0.4937803618232888
Best Params: {'C': 0.8445998446157337, 'max_iter': 1414, 'tol': 0.05860853770160484, 'n_iter_no_change': 14, 'shuffle': False, 'loss': 'squared_hinge'}
F1 Accuracy: 0.5047416426179091
Best Params: {'C': 0.32446347324947533, 'max_iter': 7129, 'tol': 0.00037569430947009905, 'n_iter_no_change': 9, 'shuffle': False, 'loss': 'squared_hinge'}
F1 Accuracy: 0.5006547192758677
Best Params: {'C': 0.5393443739095769, 'max

### Resultados


F1 Accuracy: 0.5037195819215238

Best Params: {'C': 0.37986432749857796, 'max_iter': 6855, 'tol': 0.007251420064349369, 'n_iter_no_change': 8, 'shuffle': False, 'loss': 'squared_hinge'}


---

F1 Accuracy: 0.5044168717758521

Best Params: {'C': 0.3358956626879644, 'max_iter': 4684, 'tol': 0.009943273587276515, 'n_iter_no_change': 11, 'shuffle': False, 'loss': 'squared_hinge'}


### Kaggle submission

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Define hyperparameters
n_batches = 6 # ~?M batch size
n_max_iter = 10000
cut_score = 0.48

# Split the data into features (X) and target variable (y)
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']

X_batch_size = int(np.floor(X_train.shape[0]/n_batches))
y_batch_size = int(np.floor(y_train.shape[0]/n_batches))

X_test = val_df.drop('label', axis=1)
y_test = val_df['label']

params =  {'C': 0.37986432749857796,
  'max_iter': 6855,
  'tol': 0.007251420064349369,
  'n_iter_no_change': 8,
  'shuffle': False,
  'loss': 'squared_hinge'
  }

# Instantiate the model
model = PassiveAggressiveClassifier(
              C=params['C'],
              max_iter=params['max_iter'],
              tol=params['tol'],
              n_iter_no_change=params['n_iter_no_change'],
              shuffle=params['shuffle'],
              loss=params['loss'],
              random_state=42
              )

# Train the model in batches, keeping label proportion
for i in range(0, n_batches):
    X_batch = X_train.iloc[i*X_batch_size: (i+1)*X_batch_size]
    gc.collect()

    y_batch = y_train.iloc[i*y_batch_size:i*y_batch_size + y_batch_size]

    model.partial_fit(X_batch, y_batch, classes=[0,1])

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
f1_accuracy = f1_score(y_test, y_pred)

print(f'F1 Accuracy: {f1_accuracy}')
print(f'Best Params: {params}')


NameError: ignored

In [None]:
test_df = pq.read_table('test_processed.parq').to_pandas()

In [None]:
# in case I need them again
train_df = pq.read_table('train_encoded.parq').to_pandas()
val_df = pq.read_table('val_encoded.parq').to_pandas()

In [None]:
train_df.to_parquet('train_encoded.parq')
val_df.to_parquet('val_encoded.parq')
del train_df, val_df
gc.collect()

0

In [None]:
kaggle_pred = model.predict(test_df)
kaggle_df = pd.DataFrame({'attacker_ip_enum': test_ips, 'prediction': kaggle_pred})

In [None]:
kaggle_df = kaggle_df.groupby('attacker_ip_enum').mean().reset_index()
kaggle_df['prediction'] = kaggle_df['prediction'].round().astype(int)

In [None]:
kaggle_df.set_index('attacker_ip_enum').to_csv('rcv_submission.csv')