[Link a la carpeta con todo el TP](https://drive.google.com/drive/u/0/folders/1do-iyf2SzQln-fh8tmu9mSfzLuim5xs5)

# Imports stage

In [None]:
!pip install category_encoders



In [None]:
import math
import pandas as pd
import numpy as np
%matplotlib inline
import pyarrow.parquet as pq
import os

# Display floats with thousands separators and 10 decimals in a 20-character field.
pd.set_option('display.float_format', '{:20,.10f}'.format)

import gc

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the current user through google.colab.
auth.authenticate_user()
# Create a PyDrive auth object and give it the application-default Google credentials.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Drive client; the download cell uses it to fetch the dataset file.
drive = GoogleDrive(gauth)

In [None]:
# Google Drive file ids of the train and test datasets (passed to drive.CreateFile).
id_train = "1Tf5pfrYBk8yM6QzC5IWebtSlja3R5BwN"
id_test = "1puQpPyRVtoX_MTPJITQEtQny8a714zXk"

In [None]:
# Dataset to encode: set `name` to 'train.parq' or 'test.parq'.
name = 'test.parq'

# The Drive id is looked up from `name`, so the file name and the id can no longer get out
# of sync when switching datasets; an unknown name fails immediately with a KeyError.
# NOTE: `id` shadows the Python builtin of the same name; the variable name is kept so any
# cell that reads it keeps working.
id = {'train.parq': id_train, 'test.parq': id_test}[name]

# Download only when the file is not already present on local disk.
if not os.path.exists(name):
  downloaded2 = drive.CreateFile({'id': id})
  downloaded2.GetContentFile(name)

# Init stage

load dataframe (select which one to encode)

In [None]:
# Read the selected parquet file with pyarrow, then convert the Arrow table to pandas.
arrow_table = pq.read_table(name)
df = arrow_table.to_pandas()
del arrow_table  # drop the Arrow table reference once the DataFrame exists

# Basic encoding stage

### drop redundant columns and duplicate rows

In [None]:
# Remove the two `*_as_num` columns (the section header calls them redundant) and then
# every fully duplicated row.
df = (
    df.drop(columns=['watcher_as_num', 'attacker_as_num'])
      .drop_duplicates()
)

### Efficiency casts

In [None]:
# Store the watcher identifier as an unsigned 32-bit integer.
df['watcher_uuid_enum'] = df['watcher_uuid_enum'].astype(np.uint32)

### Cut smaller countries

In [None]:
# Keep only the `cut` most frequent watcher countries; every other value (missing values
# included) is collapsed into the single label 'other'.

# Step 1: count rows per country (value_counts sorts by descending frequency and skips NaN)
category_counts = df['watcher_country'].value_counts()

# Step 2: the first `cut` index entries are the most frequent countries
cut = 20
top_categories = category_counts.head(cut).index

# Step 3: replace everything outside the top list with 'other'.
# Done with vectorised isin/where instead of a per-row Python lambda; the cast to object
# lets 'other' be written even if the column arrives as a categorical.
is_top_country = df['watcher_country'].isin(top_categories)
df['watcher_country'] = (
    df['watcher_country'].astype(object).where(is_top_country, 'other').astype('category')
)
del is_top_country

In [None]:
# Keep only the `cut` most frequent attacker countries; every other value (missing values
# included) is collapsed into the single label 'other'.

# Step 1: count rows per country (value_counts sorts by descending frequency and skips NaN)
category_counts = df['attacker_country'].value_counts()

# Step 2: the first `cut` index entries are the most frequent countries
cut = 20
top_categories = category_counts.head(cut).index

# Step 3: replace everything outside the top list with 'other'.
# Done with vectorised isin/where instead of a per-row Python lambda; the cast to object
# lets 'other' be written even if the column arrives as a categorical.
is_top_country = df['attacker_country'].isin(top_categories)
df['attacker_country'] = (
    df['attacker_country'].astype(object).where(is_top_country, 'other').astype('category')
)
del is_top_country

# Feature engineering stage

### just for sanity checks

In [None]:
# Show three random rows to eyeball the frame after the basic encoding steps.
df.sample(n=3)

Unnamed: 0,attack_time,watcher_country,watcher_as_name,attacker_country,attacker_as_name,attack_type,watcher_uuid_enum,attacker_ip_enum
18645111,2023-08-18 19:02:28+00:00,FI,Hetzner Online GmbH,DE,Contabo GmbH,http:spam,1543,197082
18296359,2023-08-10 23:00:57+00:00,US,SERVERCENTRAL,US,Inter Connects Inc,http:spam,10202,197534
7668252,2023-08-07 20:23:59+00:00,US,DREAMHOST-AS,GB,Enix Ltd,http:exploit,2828,83027


### Feature engineering, 1,2/5: split attack type into service and class

In [None]:
# `attack_type` looks like '<service>:<category>' (e.g. 'http:spam').
# Split each distinct value once and map the pieces back onto the rows, instead of
# running a Python split twice for every row. A value without ':' still raises
# IndexError, exactly as the row-wise version did.
attack_type_parts = {t: t.split(':') for t in df['attack_type'].unique()}

# Text after the first ':' -> attack_category
df['attack_category'] = (
    df['attack_type'].map({t: parts[1] for t, parts in attack_type_parts.items()}).astype('category')
)
# Text before the first ':' -> attack_service
df['attack_service'] = (
    df['attack_type'].map({t: parts[0] for t, parts in attack_type_parts.items()}).astype('category')
)
del attack_type_parts

**TODO:** plot the distributions to decide whether to keep the joint `attack_type` column or only the split `attack_category` / `attack_service` features.


### Feature Engineering 3,4/5: encode time: hour as categorical and day sine-encoded

In [None]:
# Map an hour of the day to a coarse time-of-day label
def categorize_time(hour):
    """Return 'day' for 9 <= hour < 15, 'afternoon' for 15 <= hour < 21, else 'night'.

    Any value that falls in neither half-open interval (including NaN) yields 'night'.
    """
    # (label, first hour included, first hour excluded)
    buckets = (
        ('day', 9, 15),
        ('afternoon', 15, 21),
    )
    for label, start, stop in buckets:
        if start <= hour < stop:
            return label
    return 'night'

# Bucket the hour of each attack into 'day' / 'afternoon' / 'night'
df['attack_time_of_day'] = df['attack_time'].dt.hour.apply(categorize_time).astype("category")

# Cyclical (sin/cos) encoding of the day of the week.
# The period is 7, so the input has to be the weekday (0 = Monday .. 6 = Sunday).
# The day of the month (1..31) used before does not repeat every 7 days across month
# boundaries, so the same weekday could map to different points on the circle.
# NOTE: this changes the feature values; re-encode both train and test with this version.
weekday_angle = 2 * np.pi * df['attack_time'].dt.dayofweek / 7
df['attack_day_sin'] = np.sin(weekday_angle).astype('float32')
df['attack_day_cos'] = np.cos(weekday_angle).astype('float32')
del weekday_angle

# The raw timestamp is not used after this point; drop it to free memory
df.drop('attack_time', axis=1, inplace=True)

gc.collect()

1169

### Feature engineering 5/5: is same country?

In [None]:
# Collect every country label present in either column; dropna() keeps missing values
# out of the label set.
watcher_labels = df['watcher_country'].dropna().unique()
attacker_labels = df['attacker_country'].dropna().unique()
all_countries = set(watcher_labels).union(attacker_labels)

# Both columns must share the same categories to be compared element-wise.
# A set has no stable iteration order, so the categories are sorted: the category order
# (and therefore the integer codes) is then identical on every run.
shared_categories = sorted(all_countries)
df['watcher_country'] = df['watcher_country'].cat.set_categories(shared_categories)
df['attacker_country'] = df['attacker_country'].cat.set_categories(shared_categories)

# 1 when watcher and attacker carry the same country label, 0 otherwise.
# NOTE(review): both columns were already reduced to their own top countries plus 'other',
# so two different small countries compare equal ('other' == 'other'), and a country kept
# in one column but bucketed in the other compares unequal. Computing this flag before
# the country cut would avoid both effects - confirm the intended semantics.
df['same_country'] = (df['watcher_country'] == df['attacker_country']).astype('int8')

gc.collect()

0

### Ctrl + S

Sanity check

In [None]:
# Spot-check three random rows of the fully engineered frame before saving.
df.sample(n=3)

Unnamed: 0,watcher_country,watcher_as_name,attacker_country,attacker_as_name,attack_type,watcher_uuid_enum,attacker_ip_enum,attack_category,attack_service,attack_time_of_day,attack_day_sin,attack_day_cos,same_country
6020702,US,TIER-NET,CN,CHINA UNICOM China169 Backbone,http:scan,5451,67632,scan,http,night,0.4338837266,-0.9009688497,0
10048961,US,AMAZON-AES,US,PERFECT-INTERNATIONAL,http:bruteforce,7387,118073,bruteforce,http,day,-0.9749279022,-0.2225209326,1
6655873,US,AS-CHOOPA,US,AMAZON-02,http:spam,39638,79145,spam,http,night,-0.9749279022,-0.2225209326,1


save


In [None]:
# Build the output file name '<stem>_encoded.parq' and write the encoded frame.
# os.path.splitext strips only the final extension, so dots elsewhere in the path survive
# (splitting on the first '.' would truncate them). The endswith guard makes the cell
# idempotent: re-running it does not produce 'test_encoded_encoded.parq'.
stem = os.path.splitext(name)[0]
if not stem.endswith('_encoded'):
    stem += '_encoded'
name = stem + '.parq'
df.to_parquet(name)