
# Text Classification with a Dual-Head Model

This notebook performs text classification with a dual-head neural network. The model classifies each text along two dimensions simultaneously: the gender of the author (male or female) and the date of the first edition (before or after 1900). The code is divided into the following sections:

**Setting Up Environment:**<br>

 - Importing the necessary libraries (numpy, warnings, random, torch, and scikit-learn).
 - Setting up random seeds for reproducibility.

**Loading and Preprocessing Data:**<br>

 - Loading TF-IDF features from a NumPy file (tf-idf.npy).
 - Reading labels for sex and year from text files (sex.txt and year.txt).
 - Splitting the data into training and testing sets using train_test_split.

**Defining the Neural Network Model:**<br>

 - Creating a neural network class (MyNet) with specific layers (fc1, fc_sex, fc_year).
 - Initializing an instance of the neural network.

**Training the Neural Network:**<br>

 - Defining loss function (CrossEntropyLoss) and optimizer (Adam).
 - Iterating through epochs and batches, performing forward and backward passes, updating weights.

**Testing the Neural Network:**<br>

 - Evaluating the model on the test set at the end of every training epoch.
 - Calculating accuracy and F1 scores for both sex and year predictions.



## Part 0. Setting Up Environment

In [None]:
# Import necessary libraries
import numpy as np
import warnings
import random
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torch.optim as optim
import torch.utils.data as Data

# Suppress warnings
# NOTE(review): 'ignore' with no category or module filter silences every
# warning raised for the rest of the session, including deprecation warnings
# from the libraries imported above. Narrow the filter if a warning matters.
warnings.filterwarnings('ignore')

In [None]:
# Set random seed for reproducibility
def setup_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), then configures cuDNN to favour
    reproducibility over speed.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Restrict cuDNN to deterministic kernels.
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different algorithm from run to run, so
    # switch it off explicitly in case it was enabled earlier in the session.
    torch.backends.cudnn.benchmark = False


# Set random seed
setup_seed(20)

## Part 1. Loading and Preprocessing Data

In [None]:
# Load TF-IDF matrix
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code; only load 'tf-idf.npy' from a trusted source.
# Presumably rows are documents and columns are TF-IDF features (MyNet expects
# 10000 input features) - TODO confirm against the printed shape.
tf_fit = np.load('tf-idf.npy', allow_pickle=True)
print(tf_fit.shape)

In [None]:
# Load sex labels
# The file holds one integer per line; subtracting 1 shifts the stored values
# down by one (presumably 1/2 -> 0/1 so they can serve as class indices -
# verify against the data file).
with open('sex.txt', 'r') as f:
    sex_list = [int(line) - 1 for line in f]
print(len(sex_list))

# Load year labels
# Binarise the first-edition year: 0 = before the cut-off, 1 = cut-off or later.
# The cut-off follows the objective stated at the top of this notebook
# ("before or after 1900"); the previous hard-coded value was 2000.
YEAR_THRESHOLD = 1900
with open('year.txt', 'r') as f:
    year_list = [0 if int(line) < YEAR_THRESHOLD else 1 for line in f]
print(len(year_list))

In [None]:
# Combine labels
# zip() stops at the shorter input, so a length mismatch between the two label
# files would silently drop labels and misalign them with the feature rows.
# Fail loudly instead.
if len(sex_list) != len(year_list):
    raise ValueError(
        f'Label count mismatch: {len(sex_list)} sex labels vs '
        f'{len(year_list)} year labels'
    )
# Each entry is a (sex, year) pair so both targets travel together through the split.
labels = list(zip(sex_list, year_list))
print(labels[0])

In [None]:
# Split the dataset
# Hold out 30% of the samples for testing. Features and (sex, year) label pairs
# are passed together so they are split consistently; random_state fixes the
# shuffle so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(tf_fit, labels, test_size=0.3, random_state=2024)

In [None]:
# Extract individual labels
# Every label is a (sex, year) pair; separate the two components into their
# own NumPy arrays for the training and the test split.
y_train_sex = np.array([sex for sex, _ in y_train])
y_train_year = np.array([year for _, year in y_train])
y_test_sex = np.array([sex for sex, _ in y_test])
y_test_year = np.array([year for _, year in y_test])

In [None]:
# Choose device (CPU or GPU)
# Use the GPU when CUDA is available and fall back to the CPU otherwise; the
# model and every batch are moved to this device before use.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Convert data to PyTorch tensors
# torch.from_numpy keeps the NumPy dtype; the training and evaluation loops
# cast each batch to float32 (features) and int64 (labels).
X_trn_torch = torch.from_numpy(x_train)
Y_trn_torch_sex = torch.from_numpy(y_train_sex)
Y_trn_torch_year = torch.from_numpy(y_train_year)
X_tst_torch = torch.from_numpy(x_test)
Y_tst_torch_sex = torch.from_numpy(y_test_sex)
Y_tst_torch_year = torch.from_numpy(y_test_year)

# Create PyTorch datasets
# Each sample is a (features, sex label, year label) triple.
torch_trn_dataset = Data.TensorDataset(X_trn_torch, Y_trn_torch_sex, Y_trn_torch_year)
torch_tst_dataset = Data.TensorDataset(X_tst_torch, Y_tst_torch_sex, Y_tst_torch_year)

# Batch size
bsize = 16

# Create PyTorch data loaders
# Training batches are reshuffled at the start of every epoch.
trainloader = Data.DataLoader(
    dataset=torch_trn_dataset,
    batch_size=bsize,
    shuffle=True,
    num_workers=2,
)

# Accuracy and F1 do not depend on sample order, so the test set is read
# sequentially; shuffling it would only consume random-number state.
testloader = Data.DataLoader(
    dataset=torch_tst_dataset,
    batch_size=bsize,
    shuffle=False,
    num_workers=2,
)

## Part 2. Defining the Neural Network Model

In [None]:
# Define the neural network model
class MyNet(nn.Module):
    """Dual-head classifier: one shared hidden layer feeding two 2-way heads.

    Args:
        in_features: Width of the input feature vector. Defaults to 10000.
        hidden_features: Width of the shared hidden layer. Defaults to 100.
    """

    def __init__(self, in_features=10000, hidden_features=100):
        super().__init__()
        # Shared representation used by both heads.
        self.fc1 = nn.Linear(in_features, hidden_features)
        # Task-specific heads, each producing two class logits.
        self.fc_sex = nn.Linear(hidden_features, 2)
        self.fc_year = nn.Linear(hidden_features, 2)

    def forward(self, x):
        """Return ``(sex_logits, year_logits)`` for a batch of feature vectors.

        Both outputs are raw logits (no softmax applied) with two columns each.
        """
        hidden = F.relu(self.fc1(x))
        x_sex = self.fc_sex(hidden)
        x_year = self.fc_year(hidden)
        return x_sex, x_year

## Part 3. Training and Testing the Neural Network

In [None]:
# Create an instance of the neural network
net = MyNet().to(device)

# Define loss function and optimizer
# The same cross-entropy criterion is applied to both heads; Adam updates all
# parameters (shared layer and both heads) with a learning rate of 0.01.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.01)

# Train the network
for epoch in range(20):  # Iterate over 20 epochs
    # Training mode. This has no effect on the current layers but keeps the loop
    # correct if dropout or batch-norm layers are added to the model later.
    net.train()
    for inputs, slabels, ylabels in trainloader:
        # Move each batch to the target device and cast it in a single call:
        # float32 features for the linear layers, int64 class indices for the loss.
        inputs = inputs.to(device, dtype=torch.float32)
        slabels = slabels.to(device, dtype=torch.int64)
        ylabels = ylabels.to(device, dtype=torch.int64)

        optimizer.zero_grad()  # Clear gradients
        outputs = net(inputs)  # Forward pass
        sex_logits, year_logits = outputs
        # Joint objective: unweighted sum of the two per-task losses.
        loss = criterion(sex_logits, slabels) + criterion(year_logits, ylabels)
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

    # Test the network
    net.eval()  # Evaluation mode for the per-epoch test pass
    scorrect = 0
    stotal = 0
    sall_predicted = []
    sall_labels = []
    ycorrect = 0
    ytotal = 0
    yall_predicted = []
    yall_labels = []
    with torch.no_grad():  # During testing, we don't need to compute gradients
        for inputs, slabels, ylabels in testloader:
            # Ensure data is on the correct device and has the expected dtype
            inputs = inputs.to(device, dtype=torch.float32)
            slabels = slabels.to(device, dtype=torch.int64)
            ylabels = ylabels.to(device, dtype=torch.int64)

            outputs = net(inputs)  # Forward pass
            sex_logits, year_logits = outputs
            # The predicted class is the index of the largest logit in each row.
            _, spredicted = torch.max(sex_logits, 1)
            _, ypredicted = torch.max(year_logits, 1)

            stotal += slabels.size(0)
            scorrect += (spredicted == slabels).sum().item()

            # Save predicted results and true labels for calculating F1 score
            sall_predicted.extend(spredicted.cpu().numpy())
            sall_labels.extend(slabels.cpu().numpy())

            ytotal += ylabels.size(0)
            ycorrect += (ypredicted == ylabels).sum().item()

            # Save predicted results and true labels for calculating F1 score
            yall_predicted.extend(ypredicted.cpu().numpy())
            yall_labels.extend(ylabels.cpu().numpy())

    # Report accuracy (in percent) and macro-averaged F1 for the sex head.
    saccuracy = 100 * scorrect / stotal
    sf1 = f1_score(sall_labels, sall_predicted, average='macro')
    print(f'Epoch {epoch+1}, sAccuracy: {saccuracy}%, sF1 Score: {sf1}')

    # Report accuracy (in percent) and macro-averaged F1 for the year head.
    yaccuracy = 100 * ycorrect / ytotal
    yf1 = f1_score(yall_labels, yall_predicted, average='macro')
    print(f'Epoch {epoch+1}, yAccuracy: {yaccuracy}%, yF1 Score: {yf1}')