<a href="https://colab.research.google.com/github/Tazkir-Hossain/Federated_Learning_Char_CNN_Model/blob/main/Fedarated_Learning_Char_CNN_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string
import torch
from torch.utils.data import TensorDataset, DataLoader

## Importing the dataset

In [None]:
# Read the raw CSV once; column 1 carries the URL text and the last column
# carries the label that is encoded further down.
dataset = pd.read_csv('Data.csv')
X_raw, y_raw = (
    dataset.iloc[:, 1].astype(str).values,  # raw URLs, coerced to str
    dataset.iloc[:, -1].values,             # labels, exactly as loaded
)

## Encoding categorical data

### Encoding the Independent Variable

In [None]:
# Vocabulary: ASCII letters, digits and punctuation. Each character's column
# index is its position in ALL_CHARACTERS.
ALL_CHARACTERS = string.ascii_letters + string.digits + string.punctuation
NUM_CHARACTERS = len(ALL_CHARACTERS)
CHAR2IDX = dict(zip(ALL_CHARACTERS, range(NUM_CHARACTERS)))
MAX_LENGTH = 200  # You can adjust this


def one_hot_encode_url(url, max_length):
    """One-hot encode the first ``max_length`` characters of ``url``.

    Args:
        url: Text to encode.
        max_length: Number of rows in the result; longer input is truncated
            and shorter input leaves the trailing rows all-zero.

    Returns:
        A float32 array of shape ``(max_length, NUM_CHARACTERS)``. Row ``i``
        has a single 1.0 in the column of the i-th character; characters
        outside ``ALL_CHARACTERS`` yield an all-zero row.
    """
    encoded = np.zeros((max_length, NUM_CHARACTERS), dtype=np.float32)
    for position, symbol in enumerate(url[:max_length]):
        column = CHAR2IDX.get(symbol)
        if column is not None:
            encoded[position, column] = 1.0
    return encoded

# Encode URLs: stack one (MAX_LENGTH, NUM_CHARACTERS) one-hot matrix per URL
X_encoded = np.array([one_hot_encode_url(url, MAX_LENGTH) for url in X_raw])

# Convert to tensors
X_tensor = torch.tensor(X_encoded)
# NOTE: `y_tensor` is deliberately not built in this cell. `y` is only defined
# in the "Encoding the Dependent Variable" cell below, so referencing it here
# raised NameError on a fresh kernel (Restart & Run All). That later cell
# creates `y_tensor` itself.

### Encoding the Dependent Variable

In [None]:
# Binary target: 1.0 when the label equals "adult" (case-insensitive),
# 0.0 for every other label.
y = np.array(
    [float(label.lower() == "adult") for label in y_raw],
    dtype=np.float32,
)
# Add a trailing axis so the labels form a column: (N,) -> (N, 1).
y_tensor = torch.tensor(y).unsqueeze(dim=1)


# Create Dataset and DataLoader

In [None]:
# Pair every encoded URL with its label and serve the full set in shuffled
# mini-batches of 64.
# NOTE: this rebinds `dataset`, which held the DataFrame read from Data.csv.
dataset = TensorDataset(X_tensor, y_tensor)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)


## Splitting the dataset into the Training, Validation, and Test sets

In [None]:
from torch.utils.data import random_split

# Fixed seed so train/val/test membership is identical on every
# Restart & Run All; without it each run evaluates on a different test set.
SPLIT_SEED = 42

# Define dataset again (if not already)
full_dataset = TensorDataset(X_tensor, y_tensor)

# 70% train / 15% validation. The test split takes the remainder, so the three
# sizes always sum to len(full_dataset) despite the int() truncation.
train_size = int(0.7 * len(full_dataset))
val_size = int(0.15 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size

# Split dataset using a dedicated generator, leaving the global RNG untouched
train_dataset, val_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == len(full_dataset)

# Create DataLoaders: only the training batches are shuffled, so validation
# and test iteration order stays stable between evaluations.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


# Model Building


## JSON Configuration


In [None]:
import json

# Experiment settings, kept in one dict and mirrored to config.json below.
config = {
    # Character sets keyed by language, then by casing mode.
    "alphabet": {
        "en": {
            "lower": {
                "alphabet": "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
                "number_of_characters": 69,
            },
            "both": {
                "alphabet": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
                "number_of_characters": 95,
            },
        },
    },
    # NOTE(review): each conv triple presumably reads
    # [filters, kernel_size, pool_size] with -1 meaning "no pooling" - confirm
    # against whatever code consumes this section.
    "model_parameters": {
        "small": {
            "conv": [
                [256, 7, 3],
                [256, 7, 3],
                [256, 3, -1],
                [256, 3, -1],
                [256, 3, -1],
                [256, 3, 3],
            ],
            "fc": [1024, 1024],
        },
    },
    "data": {
        "text_column": "url",
        "label_column": "category",
        "max_length": 200,
        "num_of_classes": 2,
        "encoding": None,  # serialized as JSON null
        "chunksize": 50000,
        "max_rows": 100000,
        "preprocessing_steps": ["lower"],
    },
    "training": {
        "batch_size": 128,
        "learning_rate": 0.01,
        "epochs": 10,
        "optimizer": "sgd",
    },
}

# Save the file
with open("config.json", "w") as config_file:
    config_file.write(json.dumps(config, indent=2))


## Char-CNN Model Building Code

In [None]:
# import json
# import torch
import torch.nn as nn


class CharacterLevelCNN(nn.Module):
    """Character-level CNN classifier: six 1-D conv stages plus three linear layers.

    ``forward`` takes a float tensor laid out as
    ``(batch, max_length, n_chars)``, where
    ``n_chars = args.number_of_characters + len(args.extra_characters)``, and
    returns the raw output of the last linear layer with shape
    ``(batch, number_of_classes)``. No softmax or sigmoid is applied.

    Args:
        args: Object exposing the attributes read here: ``dropout_input``
            (dropout probability applied to the input), ``number_of_characters``,
            ``extra_characters`` (only its ``len`` is used) and ``max_length``.
        number_of_classes: Width of the final linear layer.
    """

    def __init__(self, args, number_of_classes):
        super().__init__()

        in_channels = args.number_of_characters + len(args.extra_characters)

        # The input is 3-D (batch, length, chars). Dropout2d on 3-D input is
        # deprecated: it currently behaves as channel-wise 1-D dropout but
        # warns and is scheduled to change meaning. Dropout1d is the supported
        # spelling of the same behaviour; older PyTorch builds that lack it
        # fall back to Dropout2d.
        channel_dropout = getattr(nn, "Dropout1d", nn.Dropout2d)
        self.dropout_input = channel_dropout(args.dropout_input)

        # define conv layers

        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channels, 256, kernel_size=7, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(3),
        )

        self.conv2 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=7, padding=0), nn.ReLU(), nn.MaxPool1d(3)
        )

        self.conv3 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU()
        )

        self.conv4 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU()
        )

        self.conv5 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU()
        )

        self.conv6 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU(), nn.MaxPool1d(3)
        )

        # Compute the flattened conv output size by probing the conv stack.
        # One sample is enough: the per-sample feature count does not depend
        # on the batch size.
        input_shape = (1, args.max_length, in_channels)
        self.output_dimension = self._get_conv_output(input_shape)

        # define linear layers

        self.fc1 = nn.Sequential(
            nn.Linear(self.output_dimension, 1024), nn.ReLU(), nn.Dropout(0.5)
        )

        self.fc2 = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Dropout(0.5))

        self.fc3 = nn.Linear(1024, number_of_classes)

        # initialize weights

        self._create_weights()

    # utility private functions

    def _create_weights(self, mean=0.0, std=0.05):
        """Re-draw every Conv1d/Linear weight from N(mean, std**2).

        Biases are not touched and keep their constructor initialisation.
        """
        for module in self.modules():
            if isinstance(module, (nn.Conv1d, nn.Linear)):
                nn.init.normal_(module.weight, mean=mean, std=std)

    def _forward_conv(self, x):
        """Run the six conv stages and flatten each sample to one vector.

        Args:
            x: Tensor laid out as ``(batch, length, channels)``.

        Returns:
            Tensor of shape ``(batch, features)``.
        """
        # Conv1d expects (batch, channels, length).
        x = x.transpose(1, 2)
        for stage in (
            self.conv1,
            self.conv2,
            self.conv3,
            self.conv4,
            self.conv5,
            self.conv6,
        ):
            x = stage(x)
        return x.flatten(start_dim=1)

    def _get_conv_output(self, shape):
        """Return the per-sample feature count the conv stack yields for ``shape``.

        ``shape`` is ``(batch, length, channels)``. The probe is all zeros and
        runs without autograd, so it builds no graph and does not consume the
        global random number generator.
        """
        with torch.no_grad():
            probe = torch.zeros(shape)
            features = self._forward_conv(probe)
        return features.size(1)

    # forward

    def forward(self, x):
        """Map ``(batch, max_length, n_chars)`` input to ``(batch, number_of_classes)`` scores."""
        x = self.dropout_input(x)
        x = self._forward_conv(x)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x
