In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
print(torch.__version__)

2.0.1+cu118


In [4]:
# Check if a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [5]:
# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size,batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return out

    def init_hidden(self, batch_size):
        return (torch.zeros(1, batch_size, self.hidden_size),
                torch.zeros(1, batch_size, self.hidden_size))

In [6]:
# Load the fold datasets
fold1 = pd.read_csv('drive/MyDrive/Colab Notebooks/twitterbotdetection/fold_1.csv')
fold2 = pd.read_csv('drive/MyDrive/Colab Notebooks/twitterbotdetection/fold_2.csv')
fold3 = pd.read_csv('drive/MyDrive/Colab Notebooks/twitterbotdetection/fold_3.csv')
fold4 = pd.read_csv('drive/MyDrive/Colab Notebooks/twitterbotdetection/fold_4.csv')
fold5 = pd.read_csv('drive/MyDrive/Colab Notebooks/twitterbotdetection/fold_5.csv')

In [7]:
# Drop the last row
fold1 = fold1.drop(fold1.index[-1])
fold2 = fold2.drop(fold2.index[-1])

In [8]:
fold1.shape

(559, 20)

In [9]:
fold2.shape

(559, 20)

In [10]:
fold3.shape

(559, 20)

In [11]:
fold4.shape

(559, 20)

In [12]:
fold5.shape

(559, 20)

In [13]:
# Define the setups
setups = [
    {'train_folds': [fold1, fold2, fold3, fold4], 'test_fold': fold5},
    {'train_folds': [fold2, fold3, fold4, fold5], 'test_fold': fold1},
    {'train_folds': [fold3, fold4, fold5, fold1], 'test_fold': fold2},
    {'train_folds': [fold4, fold5, fold1, fold2], 'test_fold': fold3},
    {'train_folds': [fold5, fold1, fold2, fold3], 'test_fold': fold4}
]

In [14]:
setup_no=1

# Train and evaluate for each setup
for i, setup in enumerate(setups):
    len_instances=0
    train_folds = setup['train_folds']
    test_fold = setup['test_fold']

    # Train on each fold one by one
    #model = None  # Replace with your model initialization

    for fold_num, train_fold in enumerate(train_folds):
        # Perform data vectorization
        vectorizer = CountVectorizer()
        X_train_vectorized = vectorizer.fit_transform(train_fold.drop('bot',axis=1).apply(lambda x: ' '.join(map(str,x)), axis=1))
        X_test_vectorized = vectorizer.transform(test_fold.drop('bot',axis=1).apply(lambda x: ' '.join(map(str,x)), axis=1))

        #X_test_vectorized = vectorizer.transform(test_data['text_column'])
        y_train = train_fold['bot']
        y_test = test_fold['bot']

        # Reshape the output labels to a 2D array
        y_reshaped = np.array(y_train).reshape(-1, 1)
        y_test_reshaped = np.array(y_test).reshape(-1, 1)
        # Encode the output labels
        mlb = MultiLabelBinarizer()
        y_encoded = mlb.fit_transform(y_reshaped)
        y_test_encoded = mlb.transform(y_test_reshaped)

        # Convert the data to tensors
        X_train_tensor = torch.tensor(X_train_vectorized.toarray(), dtype=torch.float32)
        y_train_tensor = torch.tensor(y_encoded, dtype=torch.float32)
        X_test_tensor = torch.tensor(X_test_vectorized.toarray(), dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.float32)

        # Set hyperparameters
        input_size = X_train_tensor.shape[1]
        hidden_size = 64
        output_size = y_train_tensor.shape[1]
        num_epochs = 10
        batch_size = 32
        learning_rate = 0.001

        # Create the LSTM model
        model = LSTMModel(input_size, hidden_size, output_size)
        model.to(device)

        # Define the loss function and optimizer
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Create a DataLoader for training
        train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

        for epoch in range(num_epochs):
            total_samples = 0
            correct_predictions = 0
            test_f1_score = 0.0
            test_accuracy = 0.0

            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                batch_X = batch_X.to(device)
                batch_y = batch_y.to(device)
                outputs = model(batch_X.unsqueeze(1))
                loss = criterion(outputs.squeeze(), batch_y)
                loss.backward()
                optimizer.step()

            print(f"Setup {setup_no}: Epoch: [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

        len_instances = sum(len(train_fold) for train_fold in train_folds)



        # Evaluate the model on the test fold

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor.to(device).unsqueeze(1))
        test_predictions = torch.round(torch.sigmoid(test_outputs)).cpu().numpy()
        test_accuracy = accuracy_score(y_test_encoded, test_predictions)
        test_f1_score = f1_score(y_test_encoded, test_predictions, average='weighted')

    print(f"Setup {i+1}: Trained on: {len_instances} instances, Tested on: {len(test_fold)} instances, Test Accuracy: {test_accuracy:.4f}, F1 Score: {test_f1_score:.4f}")

    # Save the model after each setup
    torch.save(model.state_dict(), f"model_setup{setup_no}.pt")
    print('Model saved')
    setup_no=setup_no+1
        # Replace the following lines with your model evaluation code
        # accuracy = model.evaluate(test_fold)

        #print(f"Setup {i+1}: Trained on {len(train_fold)} instances, Tested on {len(test_fold)} instances")
        # Print or save the evaluation results as per your requirements

    print()  # Add an empty line between setups

Setup 1: Epoch: [1/10], Loss: 0.6810
Setup 1: Epoch: [2/10], Loss: 0.6308
Setup 1: Epoch: [3/10], Loss: 0.5417
Setup 1: Epoch: [4/10], Loss: 0.4356
Setup 1: Epoch: [5/10], Loss: 0.3292
Setup 1: Epoch: [6/10], Loss: 0.2096
Setup 1: Epoch: [7/10], Loss: 0.1361
Setup 1: Epoch: [8/10], Loss: 0.1035
Setup 1: Epoch: [9/10], Loss: 0.0836
Setup 1: Epoch: [10/10], Loss: 0.0503
Setup 1: Epoch: [1/10], Loss: 0.6871
Setup 1: Epoch: [2/10], Loss: 0.5919
Setup 1: Epoch: [3/10], Loss: 0.5052
Setup 1: Epoch: [4/10], Loss: 0.3963
Setup 1: Epoch: [5/10], Loss: 0.2614
Setup 1: Epoch: [6/10], Loss: 0.2145
Setup 1: Epoch: [7/10], Loss: 0.1269
Setup 1: Epoch: [8/10], Loss: 0.0950
Setup 1: Epoch: [9/10], Loss: 0.0713
Setup 1: Epoch: [10/10], Loss: 0.0570
Setup 1: Epoch: [1/10], Loss: 0.6597
Setup 1: Epoch: [2/10], Loss: 0.5868
Setup 1: Epoch: [3/10], Loss: 0.4935
Setup 1: Epoch: [4/10], Loss: 0.4275
Setup 1: Epoch: [5/10], Loss: 0.2707
Setup 1: Epoch: [6/10], Loss: 0.1822
Setup 1: Epoch: [7/10], Loss: 0.1079

In [15]:
X_test_tensor.shape

torch.Size([559, 11318])

In [16]:
X_train_tensor.shape

torch.Size([559, 11318])