# Importações e funções

In [19]:
import pandas as pd
import socket
import requests
import dns.resolver
import time

# Função para obter o IP de um domínio
def get_ip(domain):
    try:
        print (socket.gethostbyname(domain))
        return socket.gethostbyname(domain)
        
    except Exception as e:
        return None

# Função para obter a localização do IP usando a API ipinfo.io
def get_location(ip):
    if ip:
        try:
            response = requests.get(f'https://ipinfo.io/{ip}/json?token=b90bb6976ece63')
            data = response.json()
            print("Aqui",data)
            loc = data.get('loc', '').split(',')
            latitude = loc[0] if len(loc) > 0 else ''
            longitude = loc[1] if len(loc) > 1 else ''
            return {
                'city': data.get('city', ''),
                'region': data.get('region', ''),
                'country': data.get('country', ''),
                'org': data.get('org', ''),
                'postal': data.get('postal', ''),
                'timezone': data.get('timezone', ''),
                'hostname': data.get('hostname', ''),
                'latitude': latitude,
                'longitude': longitude,
            }
        except Exception as e:
            return {
                'city': '',
                'region': '',
                'country': '',
                'org': '',
                'postal': '',
                'timezone': '',
                'hostname': '',
                'latitude': '',
                'longitude': '',
            }
    return {
        'city': '',
        'region': '',
        'country': '',
        'org': '',
        'postal': '',
        'timezone': '',
        'hostname': '',
        'latitude': '',
        'longitude': '',
    }

# Função para obter o DNS reverso de um IP
def get_reverse_dns(ip):
    if ip:
        try:
            return socket.gethostbyaddr(ip)[0]
        except Exception as e:
            return None
    return None

# Função para obter o número de registros DNS de um IP
def get_dns_count(ip):
    if ip:
        try:
            answers = dns.resolver.resolve(ip, 'A')
            return len(answers)
        except Exception as e:
            return 0
    return 0






# Ler dataset

In [20]:
'''
Já corridos: 
    -dataset1_block6

A correr:
    -teste
'''
# Carregar o dataset
df = pd.read_csv('Datasets/teste.csv', delimiter=';')


# Linhas em que a label não se encontra preenchida

In [21]:
# Substituir valores vazios na coluna "label" por -1
df['label'].fillna(-1, inplace=True)

# Contar os valores únicos na coluna "label"
label_counts = df['label'].value_counts()
print(label_counts)

# Encontrar os índices das linhas com valor -1 na coluna "label"
indices_to_drop = df[df['label'] == -1].index

# Remover as linhas com valor -1 na coluna "label"
df.drop(indices_to_drop, inplace=True)

label
1.0    5
0.0    1
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['label'].fillna(-1, inplace=True)


# Reverse module DNS

In [22]:
# Adicionar contador para pausar a cada 100 registros
# Adicionar colunas vazias ao DataFrame
df['IP'] = ''
df['DNS'] = ''
df['Reverse DNS'] = ''
df['n_dns'] = ''
df['City'] = ''
df['Region'] = ''
df['Country'] = ''
df['Org'] = ''
df['Postal'] = ''
df['Timezone'] = ''
df['Hostname'] = ''
df['Latitude'] = ''
df['Longitude'] = ''

# Adicionar contador para pausar a cada 100 registros
for i in range(0, len(df), 100):
    # Processar blocos de 100 registros
    chunk = df.iloc[i:i+100].copy()

    # Adicionar colunas para armazenar IP, DNS, Reverse DNS e número de DNS
    chunk['IP'] = chunk['URL'].apply(lambda url: get_ip(url.split('//')[1]))
    chunk['DNS'] = chunk['URL'].apply(lambda url: url.split('//')[1])
    chunk['Reverse DNS'] = chunk['IP'].apply(get_reverse_dns)
    chunk['n_dns'] = chunk['IP'].apply(get_dns_count)

    # Obter informações de localização e adicionar colunas separadas
    location_data = chunk['IP'].apply(get_location)
    chunk['City'] = location_data.apply(lambda loc: loc['city'])
    chunk['Region'] = location_data.apply(lambda loc: loc['region'])
    chunk['Country'] = location_data.apply(lambda loc: loc['country'])
    chunk['Org'] = location_data.apply(lambda loc: loc['org'])
    chunk['Postal'] = location_data.apply(lambda loc: loc['postal'])
    chunk['Timezone'] = location_data.apply(lambda loc: loc['timezone'])
    chunk['Hostname'] = location_data.apply(lambda loc: loc['hostname'])
    chunk['Latitude'] = location_data.apply(lambda loc: loc['latitude'])
    chunk['Longitude'] = location_data.apply(lambda loc: loc['longitude'])

    # Juntar o chunk ao DataFrame original
    df.iloc[i:i+100] = chunk.copy()

    print(f"Processed chunk {i} to {i + 100}")

    if i > 0 and i % 100 == 0:
        print(f"Processed {i} records. Sleeping for 60 seconds...")
        time.sleep(60)

# Remover colunas não nomeadas
df.drop(df.columns[df.columns.str.contains('Unnamed', case=False)], axis=1, inplace=True)

# Salvar o resultado em um novo arquivo CSV
df.to_csv('teste_reverse_dns.csv', index=False, sep=';')

# Exibir o DataFrame
print(df.head())

104.21.19.20
8.29.157.202
52.86.243.119
128.14.151.194
83.98.140.23
5.253.62.116


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['IP'] = chunk['URL'].apply(lambda url: get_ip(url.split('//')[1]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['DNS'] = chunk['URL'].apply(lambda url: url.split('//')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Reverse DNS'] = chunk['IP'].apply(get_reverse_dns)
A value 

Aqui {'ip': '104.21.19.20', 'anycast': True, 'city': 'San Francisco', 'region': 'California', 'country': 'US', 'loc': '37.7621,-122.3971', 'org': 'AS13335 Cloudflare, Inc.', 'postal': '94107', 'timezone': 'America/Los_Angeles'}
Aqui {'ip': '8.29.157.202', 'hostname': 'cloudhost-1457425.us-midwest-1.nxcli.net', 'city': 'Detroit', 'region': 'Michigan', 'country': 'US', 'loc': '42.3314,-83.0457', 'org': 'AS36444 Liquid Web, L.L.C', 'postal': '48226', 'timezone': 'America/Detroit'}
Aqui {'ip': '52.86.243.119', 'hostname': 'ec2-52-86-243-119.compute-1.amazonaws.com', 'city': 'Ashburn', 'region': 'Virginia', 'country': 'US', 'loc': '39.0437,-77.4875', 'org': 'AS14618 Amazon.com, Inc.', 'postal': '20147', 'timezone': 'America/New_York'}
Aqui {'ip': '128.14.151.194', 'city': 'Los Angeles', 'region': 'California', 'country': 'US', 'loc': '34.0522,-118.2437', 'org': 'AS21859 Zenlayer Inc', 'postal': '90009', 'timezone': 'America/Los_Angeles'}
Aqui {'ip': '83.98.140.23', 'hostname': 'ip140-23.gyr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['City'] = location_data.apply(lambda loc: loc['city'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Region'] = location_data.apply(lambda loc: loc['region'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Country'] = location_data.apply(lambda loc: loc['country'])
A value is t

### Código que não é usado

In [23]:
""" import pandas as pd
from sklearn.model_selection import train_test_split

# Carregar o dataset
df = pd.read_csv('PhishingDataset.csv', delimiter='')

# Converter todas as colunas para strings
df = df.astype(str)

# Dividir o dataset em dois novos datasets (50% e 50%)
df1, df2 = train_test_split(df, test_size=0.5, random_state=42)

# Salvar os dois novos datasets em arquivos CSV separados
df1.to_csv('dataset1.csv', index=False,sep=';')
df2.to_csv('dataset2.csv', index=False,sep=';') """


" import pandas as pd\nfrom sklearn.model_selection import train_test_split\n\n# Carregar o dataset\ndf = pd.read_csv('PhishingDataset.csv', delimiter='')\n\n# Converter todas as colunas para strings\ndf = df.astype(str)\n\n# Dividir o dataset em dois novos datasets (50% e 50%)\ndf1, df2 = train_test_split(df, test_size=0.5, random_state=42)\n\n# Salvar os dois novos datasets em arquivos CSV separados\ndf1.to_csv('dataset1.csv', index=False,sep=';')\ndf2.to_csv('dataset2.csv', index=False,sep=';') "

In [24]:
""" import pandas as pd

# Carregar o dataset com delimitador ','
df = pd.read_csv('Datasets/total.csv', delimiter=',')

# Salvar o dataset com delimitador ';'
df.to_csv('Datasets/total.csv', index=False, sep=';')  """

""" import pandas as pd

# Carregar o dataset com delimitador ';'
df = pd.read_csv('Datasets/dataset1.csv', delimiter=';')

# Calcular o número de linhas em 20% do dataset
n = len(df)
block_size = int(n * 0.2)

# Shuffle the dataset
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Create a list to hold each 20% block
blocks = []

# Generate the 20% blocks
for i in range(5):
    start_index = i * block_size
    end_index = (i + 1) * block_size
    block = df_shuffled.iloc[start_index:end_index]
    blocks.append(block)
    # Save each block to a new CSV file
    block.to_csv(f'Datasets/dataset_block_{i+1}.csv', index=False, sep=';')

# In case the dataset size is not perfectly divisible by 5, add the remaining rows to the last block
if end_index < n:
    remaining_block = df_shuffled.iloc[end_index:]
    remaining_block.to_csv(f'Datasets/dataset_block_6.csv', index=False, sep=';')  """


" import pandas as pd\n\n# Carregar o dataset com delimitador ';'\ndf = pd.read_csv('Datasets/dataset1.csv', delimiter=';')\n\n# Calcular o número de linhas em 20% do dataset\nn = len(df)\nblock_size = int(n * 0.2)\n\n# Shuffle the dataset\ndf_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)\n\n# Create a list to hold each 20% block\nblocks = []\n\n# Generate the 20% blocks\nfor i in range(5):\n    start_index = i * block_size\n    end_index = (i + 1) * block_size\n    block = df_shuffled.iloc[start_index:end_index]\n    blocks.append(block)\n    # Save each block to a new CSV file\n    block.to_csv(f'Datasets/dataset_block_{i+1}.csv', index=False, sep=';')\n\n# In case the dataset size is not perfectly divisible by 5, add the remaining rows to the last block\nif end_index < n:\n    remaining_block = df_shuffled.iloc[end_index:]\n    remaining_block.to_csv(f'Datasets/dataset_block_6.csv', index=False, sep=';')  "