In [None]:
# Mount Google Drive at /content/drive (Colab only); the fold CSVs used later
# in this notebook are read from paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
import json
import random

In [None]:
# Fix every random number generator used in the notebook to one seed
seed = 123

# Python and NumPy generators
random.seed(seed)
np.random.seed(seed)

# PyTorch generators (CPU and every CUDA device)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [None]:
# Define the LSTM model
class LSTMModel(nn.Module):
    """LSTM encoder followed by a linear output layer.

    The LSTM is created with ``batch_first=True``, so ``forward`` expects the
    batch dimension first. The linear layer is applied to the final hidden
    state of the last LSTM layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of values produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return ``fc(h_n[-1])``, i.e. one row of outputs per batch element."""
        # Only the final hidden state is needed; per-step outputs and the
        # cell state are discarded.
        _, (h_n, _) = self.lstm(x)
        return self.fc(h_n[-1])

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h_0, c_0)`` matching the model's device/dtype.

        The states are allocated from an existing parameter so they follow the
        model when it is moved (``.to(device)``) or cast (``.double()``);
        plain ``torch.zeros`` would always produce float32 CPU tensors and
        fail when passed to an LSTM living on another device.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Tuple of two tensors of shape
            ``(num_layers, batch_size, hidden_size)``.
        """
        reference = self.fc.weight
        state_shape = (self.lstm.num_layers, batch_size, self.hidden_size)
        return (reference.new_zeros(state_shape),
                reference.new_zeros(state_shape))

In [None]:
# Read the five fold CSVs (cleanfold_1.csv ... cleanfold_5.csv) from Drive
_FOLD_DIR = 'drive/MyDrive/Colab Notebooks/twitterbotdetection'
fold1, fold2, fold3, fold4, fold5 = [
    pd.read_csv(f'{_FOLD_DIR}/cleanfold_{fold_no}.csv')
    for fold_no in range(1, 6)
]

In [None]:
# Display (rows, columns) of fold 1
fold1.shape

(410, 15)

In [None]:
# Display (rows, columns) of fold 2
fold2.shape

(410, 15)

In [None]:
# Display (rows, columns) of fold 3
fold3.shape

(410, 15)

In [None]:
# Display (rows, columns) of fold 4
fold4.shape

(410, 15)

In [None]:
# Display (rows, columns) of fold 5
fold5.shape

(410, 15)

In [None]:
# Build the five leave-one-fold-out setups.
# Setup k holds out fold k and trains on the remaining four, listed in
# rotation order starting with the fold after the held-out one.
_all_folds = [fold1, fold2, fold3, fold4, fold5]
setups = []
for _held_out in range(len(_all_folds)):
    setups.append({
        'train_folds': _all_folds[_held_out + 1:] + _all_folds[:_held_out],
        'test_fold': _all_folds[_held_out],
    })

In [None]:
# ---------------------------------------------------------------------------
# Cross-validated training and evaluation.
#
# For every setup: fit a bag-of-words vectorizer on the training folds, train
# a fresh LSTMModel, score it on the held-out fold and on the "paraphrased"
# fold, and write the vocabulary, predictions, confidence scores and model
# weights to files in the working directory.
# ---------------------------------------------------------------------------

# Columns whose values are joined into one text string per row.
TEXT_COLUMNS = ['location', 'description', 'verified']

# NOTE(review): this template resolves to the same cleanfold_<k>.csv that is
# already the held-out fold of setup k, so the "paraphrased" evaluation below
# re-scores identical rows and reproduces the original metrics exactly.
# Point it at the real paraphrased CSVs if they live under a different name.
PARAPHRASED_FOLD_TEMPLATE = 'drive/MyDrive/Colab Notebooks/twitterbotdetection/cleanfold_{setup_no}.csv'

# Hyperparameters (the same for every setup, so defined once).
hidden_size = 64
output_size = 1
num_epochs = 10
batch_size = 64  # the size the DataLoader actually used; the former `32` was never applied
learning_rate = 0.001

# Per-setup metrics on the original held-out fold.
precision_0_list = []
recall_0_list = []
precision_1_list = []
recall_1_list = []

# Per-setup metrics on the paraphrased fold.
precision_0_list_para = []
recall_0_list_para = []
precision_1_list_para = []
recall_1_list_para = []

# Per-setup, per-class F1 scores.
f1_orig_0_list = []
f1_orig_1_list = []
f1_para_0_list = []
f1_para_1_list = []


def _join_text_columns(frame):
    """Join TEXT_COLUMNS of every row into one space-separated string."""
    return frame[TEXT_COLUMNS].apply(lambda row: ' '.join(map(str, row)), axis=1)


def _score_model(model, features, y_true):
    """Evaluate `model` on `features` against the encoded labels `y_true`.

    Puts the model in eval mode, runs one forward pass without gradients and
    returns a dict with:
      - 'predictions':   rounded sigmoid outputs, flattened to 1-D
      - 'probabilities': sigmoid outputs, flattened to 1-D
      - 'accuracy', 'f1_0', 'f1_1', 'precision_0', 'recall_0',
        'precision_1', 'recall_1': scikit-learn metrics
    """
    model.eval()
    with torch.no_grad():
        probabilities = torch.sigmoid(model(features.to(device).unsqueeze(1)))
        # Flatten the (N, 1) model output so metrics and CSV columns get
        # plain 1-D arrays instead of relying on implicit ravelling.
        predictions = torch.round(probabilities).cpu().numpy().reshape(-1)
        probabilities = probabilities.cpu().numpy().reshape(-1)
    return {
        'predictions': predictions,
        'probabilities': probabilities,
        'accuracy': accuracy_score(y_true, predictions),
        'f1_0': f1_score(y_true == 0, predictions == 0),
        'f1_1': f1_score(y_true == 1, predictions == 1),
        'precision_0': precision_score(y_true, predictions, pos_label=0),
        'recall_0': recall_score(y_true, predictions, pos_label=0),
        'precision_1': precision_score(y_true, predictions, pos_label=1),
        'recall_1': recall_score(y_true, predictions, pos_label=1),
    }


# Train and evaluate for each setup
for setup_no, setup in enumerate(setups, start=1):
    train_folds = setup['train_folds']
    test_fold = setup['test_fold']

    # Concatenate all the train folds
    train_data = pd.concat(train_folds)

    # Bag-of-words features; the vocabulary is fitted on the training folds only
    vectorizer = CountVectorizer(stop_words='english')
    X_train_vectorized = vectorizer.fit_transform(_join_text_columns(train_data))
    X_test_vectorized = vectorizer.transform(_join_text_columns(test_fold))

    # Save the word -> column-index mapping as JSON
    file_path = f"vocabulary_setup{setup_no}.txt"
    with open(file_path, 'w') as file:
        json.dump(vectorizer.vocabulary_, file)
    print(f"Vocabulary saved to: {file_path}")

    # Encode the 'bot' labels as integers
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(train_data['bot'].to_numpy())
    y_test_encoded = label_encoder.transform(test_fold['bot'].to_numpy())

    # Convert the data to tensors
    X_train_tensor = torch.tensor(X_train_vectorized.toarray(), dtype=torch.float32)
    y_train_tensor = torch.tensor(y_encoded, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_vectorized.toarray(), dtype=torch.float32)

    # Fresh model per setup; the input width is this setup's vocabulary size
    input_size = X_train_tensor.shape[1]
    model = LSTMModel(input_size, hidden_size, output_size)
    model.to(device)

    # Define the loss function and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create a DataLoader for training
    train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            # Each row is fed as a length-1 sequence; flatten the (B, 1)
            # logits so they match the (B,) target tensor.
            outputs = model(batch_X.unsqueeze(1)).view(-1)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            # Weight by batch size so a smaller final batch does not skew the mean
            running_loss += loss.item() * batch_X.size(0)

        # Report the mean loss over the epoch rather than the last batch's loss
        epoch_loss = running_loss / max(len(train_dataset), 1)
        print(f"Setup {setup_no}: Epoch: [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}")

    # ---------------- Evaluate on the original held-out fold ----------------
    original_scores = _score_model(model, X_test_tensor, y_test_encoded)
    test_accuracy = original_scores['accuracy']
    test_f1_score_orig_label0 = original_scores['f1_0']
    test_f1_score_orig_label1 = original_scores['f1_1']

    # Save the predicted labels (as strings) next to their ids
    predictions_df = pd.DataFrame({
        'id_str': test_fold['id_str'],
        'Predicted Label': original_scores['predictions'].astype(str)
    })
    predictions_df.to_csv(f"predictions_setup{setup_no}.csv", index=False)

    # Save the sigmoid outputs next to their ids
    confidence_df = pd.DataFrame({
        'id_str': test_fold['id_str'],
        'Confidence Label 1 Original': original_scores['probabilities']
    })
    csv_file = f"confidence_original{setup_no}.csv"
    confidence_df.to_csv(csv_file, index=False)
    print(f"Confidence scores saved to '{csv_file}'")

    # Ids scored above; used to restrict the paraphrased fold to the same accounts
    tested_ids = list(test_fold['id_str'])

    precision_0_list.append(original_scores['precision_0'])
    recall_0_list.append(original_scores['recall_0'])
    precision_1_list.append(original_scores['precision_1'])
    recall_1_list.append(original_scores['recall_1'])
    f1_orig_0_list.append(test_f1_score_orig_label0)
    f1_orig_1_list.append(test_f1_score_orig_label1)

    # This line reports the ORIGINAL fold (it was previously labelled "Paraphrased")
    print(f"For Original Fold {setup_no}: Tested on: {len(test_fold)} instances, Test Accuracy: {test_accuracy:.4f}, F1 Score for Label 0: {test_f1_score_orig_label0:.4f}, F1 Score for Label 1: {test_f1_score_orig_label1:.4f}")

    # ------------------- Evaluate on the paraphrased fold -------------------
    paraphrased_fold = pd.read_csv(PARAPHRASED_FOLD_TEMPLATE.format(setup_no=setup_no))
    paraphrased_fold_filtered = paraphrased_fold[paraphrased_fold['id_str'].isin(tested_ids)]

    X_para_vectorized = vectorizer.transform(_join_text_columns(paraphrased_fold_filtered))
    X_para_tensor = torch.tensor(X_para_vectorized.toarray(), dtype=torch.float32)
    y_para_encoded = label_encoder.transform(paraphrased_fold_filtered['bot'].to_numpy())

    paraphrased_scores = _score_model(model, X_para_tensor, y_para_encoded)
    test_accuracy = paraphrased_scores['accuracy']
    test_f1_score_para_label0 = paraphrased_scores['f1_0']
    test_f1_score_para_label1 = paraphrased_scores['f1_1']

    # NOTE(review): the column is named '... Original' although it holds the
    # paraphrased-fold predictions; kept as-is in case downstream code reads it.
    predictions_df = pd.DataFrame({
        'id_str': paraphrased_fold_filtered['id_str'],
        'Predicted Label Original': paraphrased_scores['predictions'].astype(str)
    })
    predictions_df.to_csv(f"f{setup_no}_predictions.csv", index=False)

    confidence_df = pd.DataFrame({
        'id_str': paraphrased_fold_filtered['id_str'],
        'Confidence Label 1 Paraphrased': paraphrased_scores['probabilities']
    })
    csv_file = f"confidence_paraphrased{setup_no}.csv"
    confidence_df.to_csv(csv_file, index=False)
    print(f"Confidence_paraphrased scores saved to '{csv_file}'")

    precision_0_list_para.append(paraphrased_scores['precision_0'])
    recall_0_list_para.append(paraphrased_scores['recall_0'])
    precision_1_list_para.append(paraphrased_scores['precision_1'])
    recall_1_list_para.append(paraphrased_scores['recall_1'])
    f1_para_0_list.append(test_f1_score_para_label0)
    f1_para_1_list.append(test_f1_score_para_label1)

    print(f"For Paraphrased Fold {setup_no}: Tested on: {len(paraphrased_fold_filtered)} instances, Test Accuracy: {test_accuracy:.4f}, F1 Score for Label 0: {test_f1_score_para_label0:.4f}, F1 Score for Label 1: {test_f1_score_para_label1:.4f}")

    # Save the model after each setup
    torch.save(model.state_dict(), f"model_setup{setup_no}.pt")
    print('Model saved')

    print()  # Add an empty line between setups

Vocabulary saved to: vocabulary_setup1.txt
Setup 1: Epoch: [1/10], Loss: 0.6594
Setup 1: Epoch: [2/10], Loss: 0.5820
Setup 1: Epoch: [3/10], Loss: 0.4055
Setup 1: Epoch: [4/10], Loss: 0.3113
Setup 1: Epoch: [5/10], Loss: 0.2132
Setup 1: Epoch: [6/10], Loss: 0.2198
Setup 1: Epoch: [7/10], Loss: 0.1071
Setup 1: Epoch: [8/10], Loss: 0.0474
Setup 1: Epoch: [9/10], Loss: 0.0624
Setup 1: Epoch: [10/10], Loss: 0.0442
Confidence scores saved to 'confidence_original1.csv'
For Paraphrased Fold 1: Tested on: 410 instances, Test Accuracy: 0.8634, F1 Score for Label 0: 0.8733, F1 Score for Label 1: 0.8519
Confidence_paraphrased scores saved to 'confidence_paraphrased1.csv'
For Paraphrased Fold 1: Tested on: 410 instances, Test Accuracy: 0.8634, F1 Score for Label 0: 0.8733, F1 Score for Label 1: 0.8519
Model saved

Vocabulary saved to: vocabulary_setup2.txt
Setup 2: Epoch: [1/10], Loss: 0.6556
Setup 2: Epoch: [2/10], Loss: 0.5855
Setup 2: Epoch: [3/10], Loss: 0.4055
Setup 2: Epoch: [4/10], Loss: 0.

In [None]:
# Count how many 'id_str' values of the test fold also occur in the
# paraphrased fold. Both frames are whatever the training loop last assigned
# to `test_fold` and `paraphrased_fold`.
id_str1, id_str2 = test_fold['id_str'], paraphrased_fold['id_str']

# Boolean mask over the test-fold ids, then the number of True entries
matching_instances = id_str1.isin(id_str2)
num_matching_instances = matching_instances.sum()

print(f"Number of matching instances: {num_matching_instances}")

Number of matching instances: 410


In [None]:
# Print the collected per-setup metrics, one list per line, with a blank line
# between groups: original precision/recall, paraphrased precision/recall,
# original F1, paraphrased F1.
_metric_groups = (
    (precision_0_list, recall_0_list, precision_1_list, recall_1_list),
    (precision_0_list_para, recall_0_list_para, precision_1_list_para, recall_1_list_para),
    (f1_orig_0_list, f1_orig_1_list),
    (f1_para_0_list, f1_para_1_list),
)
for _group_index, _group in enumerate(_metric_groups):
    if _group_index > 0:
        print()
    for _values in _group:
        print(_values)

[0.8654708520179372, 0.8564814814814815, 0.8829268292682927, 0.8436018957345972, 0.8975609756097561]
[0.8812785388127854, 0.8564814814814815, 0.8537735849056604, 0.89, 0.8720379146919431]
[0.8609625668449198, 0.8402061855670103, 0.848780487804878, 0.8894472361809045, 0.8682926829268293]
[0.8429319371727748, 0.8402061855670103, 0.8787878787878788, 0.8428571428571429, 0.8944723618090452]

[0.8654708520179372, 0.8564814814814815, 0.8829268292682927, 0.8436018957345972, 0.8975609756097561]
[0.8812785388127854, 0.8564814814814815, 0.8537735849056604, 0.89, 0.8720379146919431]
[0.8609625668449198, 0.8402061855670103, 0.848780487804878, 0.8894472361809045, 0.8682926829268293]
[0.8429319371727748, 0.8402061855670103, 0.8787878787878788, 0.8428571428571429, 0.8944723618090452]

[0.8733031674208145, 0.8564814814814815, 0.8681055155875299, 0.8661800486618004, 0.8846153846153846]
[0.8518518518518517, 0.8402061855670103, 0.8635235732009925, 0.8655256723716381, 0.8811881188118812]

[0.87330316742081