In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size,batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return out

    def init_hidden(self, batch_size):
        return (torch.zeros(1, batch_size, self.hidden_size),
                torch.zeros(1, batch_size, self.hidden_size))

In [13]:
# Load your data into a pandas DataFrame
base_csv = 'drive/MyDrive/Colab Notebooks/twitterbotdetection/training_data_2_csv_UTF.csv'
df = pd.read_csv(base_csv)
df.head()

Unnamed: 0,id,id_str,screen_name,location,description,url,followers_count,friends_count,listed_count,created_at,favourites_count,verified,statuses_count,lang,status,default_profile,default_profile_image,has_extended_profile,name,bot
0,8.16e+17,"""815745789754417152""","""HoustonPokeMap""","""Houston, TX""","""Rare and strong PokŽmon in Houston, TX. See m...","""https://t.co/dnWuDbFRkt""",1291,0,10,"""Mon Jan 02 02:25:26 +0000 2017""",0,False,78554,"""en""","{\r ""created_at"": ""Sun Mar 12 15:44:04 +0...",True,False,False,"""Houston PokŽ Alert""",1
1,4843621000.0,4843621225,kernyeahx,"Templeville town, MD, USA",From late 2014 Socium Marketplace will make sh...,,1,349,0,2/1/2016 7:37,38,False,31,en,,True,False,False,Keri Nelson,1
2,4303727000.0,4303727112,mattlieberisbot,,"Inspired by the smart, funny folks at @replyal...",https://t.co/P1e1o0m4KC,1086,0,14,Fri Nov 20 18:53:22 +0000 2015,0,False,713,en,"{'retweeted': False, 'is_quote_status': False,...",True,False,False,Matt Lieber Is Bot,1
3,3063139000.0,3063139353,sc_papers,,,,33,0,8,2/25/2015 20:11,0,False,676,en,Construction of human anti-tetanus single-chai...,True,True,False,single cell papers,1
4,2955142000.0,2955142070,lucarivera16,"Dublin, United States",Inspiring cooks everywhere since 1956.,,11,745,0,1/1/2015 17:44,146,False,185,en,,False,False,False,lucarivera16,1


In [14]:
df_imputed=df.fillna(df.mode().iloc[0])

In [15]:
# Define the number of desired folds
num_folds = 5

# Initialize the KFold object
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

In [16]:
# Create empty lists to store the indices of each fold
fold_indices = []

In [17]:
# Split the data into folds
for train_index, _ in kfold.split(df_imputed):
    fold_indices.append(train_index)

In [18]:
# Train the model on the first 4 folds
for fold in range(num_folds - 1):  # Iterate over the first 4 folds
    train_indices = fold_indices[fold]
    test_indices = fold_indices[num_folds - 1]  # Index of the corresponding test fold

    # Get the data for the current fold
    train_data = df_imputed.iloc[train_indices]
    test_data = df_imputed.iloc[test_indices]
    test_data.to_csv('fifthfold.csv', index=False)

    # Perform data vectorization
    vectorizer = CountVectorizer()
    X_train_vectorized = vectorizer.fit_transform(train_data.drop('bot',axis=1).apply(lambda x: ' '.join(map(str,x)), axis=1))
    #X_test_vectorized = vectorizer.transform(test_data['text_column'])
    y_train = train_data['bot']


    # Reshape the output labels to a 2D array
    y_reshaped = np.array(y_train).reshape(-1, 1)
    # Encode the output labels
    mlb = MultiLabelBinarizer()
    y_encoded = mlb.fit_transform(y_reshaped)

    # Convert the data to tensors
    X_train_tensor = torch.tensor(X_train_vectorized.toarray(), dtype=torch.float32)
    y_train_tensor = torch.tensor(y_encoded, dtype=torch.float32)

    # Set hyperparameters
    input_size = X_train_tensor.shape[1]
    hidden_size = 64
    output_size = y_train_tensor.shape[1]
    num_epochs = 10
    batch_size = 32
    learning_rate = 0.001

    # Create the LSTM model
    model = LSTMModel(input_size, hidden_size, output_size)

    # Define the loss function and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create a DataLoader for training
    train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

    for epoch in range(num_epochs):
        total_samples = 0
        correct_predictions = 0
        test_f1_score = 0.0
        test_accuracy = 0.0

        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X.unsqueeze(1))
            loss = criterion(outputs.squeeze(), batch_y)
            loss.backward()
            optimizer.step()

        print(f"Fold {fold} : Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Train Indices: {train_indices}")

    # Save the trained model for each fold
    LSTM5folds = f"model_fold_{fold}.pth"
    torch.save(model.state_dict(), LSTM5folds)



Fold 0 : Epoch [1/10], Loss: 0.6157, Train Indices: [   0    1    2 ... 2793 2794 2795]
Fold 0 : Epoch [2/10], Loss: 0.3246, Train Indices: [   0    1    2 ... 2793 2794 2795]
Fold 0 : Epoch [3/10], Loss: 0.2372, Train Indices: [   0    1    2 ... 2793 2794 2795]
Fold 0 : Epoch [4/10], Loss: 0.0504, Train Indices: [   0    1    2 ... 2793 2794 2795]
Fold 0 : Epoch [5/10], Loss: 0.0538, Train Indices: [   0    1    2 ... 2793 2794 2795]
Fold 0 : Epoch [6/10], Loss: 0.0159, Train Indices: [   0    1    2 ... 2793 2794 2795]
Fold 0 : Epoch [7/10], Loss: 0.0143, Train Indices: [   0    1    2 ... 2793 2794 2795]
Fold 0 : Epoch [8/10], Loss: 0.0066, Train Indices: [   0    1    2 ... 2793 2794 2795]
Fold 0 : Epoch [9/10], Loss: 0.0050, Train Indices: [   0    1    2 ... 2793 2794 2795]
Fold 0 : Epoch [10/10], Loss: 0.0226, Train Indices: [   0    1    2 ... 2793 2794 2795]
Fold 1 : Epoch [1/10], Loss: 0.5472, Train Indices: [   0    1    2 ... 2794 2795 2796]
Fold 1 : Epoch [2/10], Loss: 0.

In [19]:
    # Get the data for the test fold
    y_test = test_data['bot']
    # Vectorize the test data
    X_test_vectorized = vectorizer.transform(test_data.apply(lambda x: ' '.join(map(str, x)), axis=1))
    # Reshape the output labels to a 2D array
    y_test_reshaped = np.array(y_test).reshape(-1, 1)
    y_test_encoded = mlb.fit_transform(y_test_reshaped)

    # Convert the test data to tensors
    X_test_tensor = torch.tensor(X_test_vectorized.toarray(), dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.float32)

    model = LSTMModel(input_size, hidden_size, output_size)
    model.load_state_dict(torch.load(LSTM5folds))

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor.unsqueeze(1))
        test_predictions = torch.round(torch.sigmoid(test_outputs)).numpy()
        test_accuracy = accuracy_score(y_test_encoded, test_predictions)
        test_f1_score = f1_score(y_test_encoded, test_predictions, average='weighted')

    print(f"Fold 5: Test Accuracy: {test_accuracy:.4f}, F1 Score: {test_f1_score:.4f}")

Fold 5: Test Accuracy: 0.9678, F1 Score: 0.9696


In [20]:
X_test_tensor.shape

torch.Size([2238, 34776])

In [21]:
X_train_tensor.shape

torch.Size([2238, 34776])

In [22]:
# Set up the five training and testing setups
setups = [
    {'train': [0, 1, 2, 3], 'test': [4]},
    {'train': [1, 2, 3, 4], 'test': [0]},
    {'train': [2, 3, 4, 0], 'test': [1]},
    {'train': [3, 4, 0, 1], 'test': [2]},
    {'train': [4, 0, 1, 2], 'test': [3]}
]

In [23]:
# Iterate over the setups
for setup in setups:
    # Get the training and testing folds for the current setup
    train_indices = np.concatenate([fold_indices[fold] for fold in setup['train']])
    test_indices = fold_indices[setup['test'][0]]


    # Get the data for the current fold
    train_data = df_imputed.iloc[train_indices]
    test_data = df_imputed.iloc[test_indices]
    test_data.to_csv('fifthfold.csv', index=False)
    # Get the data for the training and testing folds
    X_train = df_imputed.iloc[train_indices]
    y_train = train_data['bot']
    X_test = df_imputed.iloc[test_indices]
    y_test = test_data['bot']

    # Tokenize the input columns
    vectorizer = CountVectorizer()
    X_train_vectorized = vectorizer.fit_transform(train_data.drop('bot',axis=1).apply(lambda x: ' '.join(map(str, x)), axis=1))
    X_test_vectorized = vectorizer.transform(X_test.apply(lambda x: ' '.join(map(str, x)), axis=1))

    # Reshape the output labels to a 2D array
    y_train_reshaped = np.array(y_train).reshape(-1, 1)
    y_test_reshaped = np.array(y_test).reshape(-1, 1)

    # Encode the output labels
    mlb = MultiLabelBinarizer()
    y_train_encoded = mlb.fit_transform(y_train_reshaped)
    y_test_encoded = mlb.transform(y_test_reshaped)

    # Convert the data to tensors
    X_train_tensor = torch.tensor(X_train_vectorized.toarray(), dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_vectorized.toarray(), dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.float32)

    # Set hyperparameters
    input_size = X_train_tensor.shape[1]
    hidden_size = 64
    output_size = y_train_tensor.shape[1]
    num_epochs = 10
    batch_size = 32
    learning_rate = 0.001

    # Create the LSTM model
    model = LSTMModel(input_size, hidden_size, output_size)

    # Define the loss function and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create a DataLoader for training
    train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    print(f"Number of training instances: {len(train_indices)}")
    print(f"Number of test instances: {len(test_indices)}")

    # Training loop
    for epoch in range(num_epochs):
        total_samples = 0
        correct_predictions = 0
        test_f1_score = 0.0
        test_accuracy = 0.0

        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X.unsqueeze(1))
            loss = criterion(outputs.squeeze(), batch_y)
            loss.backward()
            optimizer.step()

            # Print the training loss for each epoch
        print(f"Setup: {setup}, Fold: {fold},  Epoch: {epoch + 1}, Loss: {loss.item()}")

    # Save the trained model for each fold
    LSTM5folds = f"final_model_fold_{fold}.pth"
    torch.save(model.state_dict(), LSTM5folds)
    print("Model saved")


    # Evaluation
    model.eval()
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor.unsqueeze(1))
        test_predictions = torch.round(torch.sigmoid(test_outputs)).numpy()
        test_accuracy = accuracy_score(y_test_encoded, test_predictions)
        test_f1_score = f1_score(y_test_encoded, test_predictions, average='weighted')
        print(f"Setup: {setup}, Fold: {fold}, Test Accuracy: {test_accuracy:.4f}, F1 Score: {test_f1_score:.4f}")

Number of training instances: 8950
Number of test instances: 2238
Setup: {'train': [0, 1, 2, 3], 'test': [4]}, Fold: 3,  Epoch: 1, Loss: 0.04586196318268776
Setup: {'train': [0, 1, 2, 3], 'test': [4]}, Fold: 3,  Epoch: 2, Loss: 0.002304759807884693
Setup: {'train': [0, 1, 2, 3], 'test': [4]}, Fold: 3,  Epoch: 3, Loss: 0.001643808209337294
Setup: {'train': [0, 1, 2, 3], 'test': [4]}, Fold: 3,  Epoch: 4, Loss: 0.0008973200456239283
Setup: {'train': [0, 1, 2, 3], 'test': [4]}, Fold: 3,  Epoch: 5, Loss: 0.0004927057307213545
Setup: {'train': [0, 1, 2, 3], 'test': [4]}, Fold: 3,  Epoch: 6, Loss: 0.10285720974206924
Setup: {'train': [0, 1, 2, 3], 'test': [4]}, Fold: 3,  Epoch: 7, Loss: 0.0004226563323754817
Setup: {'train': [0, 1, 2, 3], 'test': [4]}, Fold: 3,  Epoch: 8, Loss: 0.0003712282341439277
Setup: {'train': [0, 1, 2, 3], 'test': [4]}, Fold: 3,  Epoch: 9, Loss: 0.00045409679296426475
Setup: {'train': [0, 1, 2, 3], 'test': [4]}, Fold: 3,  Epoch: 10, Loss: 0.000972344889305532
Model sav