In [None]:
#Import Libraries
!pip install wandb
import wandb
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import shuffle
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn import metrics
import pandas as pd
from wordcloud import WordCloud
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
import string
import itertools
import nltk
import re
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as text  # Registers the ops.
import tensorflow_hub as hub
import os

# Set random seeds for reproducibility.
# NumPy's global seed covers the NumPy/scikit-learn steps; TensorFlow keeps its
# own generator, so it is seeded too for the Keras model built later on.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

from kaggle_secrets import UserSecretsClient

# Read the Weights & Biases API key from Kaggle's secret store (so it is never
# written into the notebook) and authenticate this session with it.
user_secrets = UserSecretsClient()

my_secret = user_secrets.get_secret("wandb_api_key")

wandb.login(key=my_secret)

In [None]:
#Importing Datasets
data = pd.read_csv("/kaggle/input/abstracts/research-abstracts-labeled.csv")
Test_data = pd.read_csv("/kaggle/input/gpt-vs-human-a-corpus-of-research-abstracts/data_set.csv", low_memory=False)

#Checking if Dataset has any null Values
# (printed explicitly: a bare expression in the middle of a cell is not displayed)
print(data.isnull().sum())

#Remove duplicate texts and rows without a text or a label.
# drop_duplicates()/dropna() return new frames, so the result has to be assigned
# back. Only the two columns the models read are checked for missing values, so
# rows are not lost because of gaps in columns that are dropped later anyway.
data = data.drop_duplicates(subset=['text'])
data = data.dropna(subset=['text', 'label'])

#Shuffle the cleaned rows and rebuild a contiguous 0..n-1 index
data = shuffle(data)
data = data.reset_index( drop=True )

print(data.describe())

data.head()

In [None]:
#Keep only the columns that are used later on
columns_to_keep = ['title', 'abstract', 'is_ai_generated']
Test_data = Test_data.loc[:, columns_to_keep]

#Test Data Cleaning
# Null count per column (evaluated but not displayed, as it is not the last
# expression of the cell), followed by removal of every incomplete row.
Test_data.isna().sum()
Test_data = Test_data.dropna()

Test_data

In [None]:
#Parts of Speech Code
import spacy
import seaborn as sns

# Draw a random sample from each class: label 0 feeds HumanDataset and
# label 1 feeds AIDataset.
# NOTE(review): sample() was called without arguments, i.e. one row per class,
# so the POS comparison below rests on a single text each — confirm intended.
HumanDataset = data.loc[data['label'] == 0].sample(n=1)
AIDataset = data.loc[data['label'] == 1].sample(n=1)

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to perform POS tagging
def pos_tagging(text):
    """Run `text` through the loaded spaCy pipeline and tag every token.

    Args:
        text: the string to analyse.

    Returns:
        A list of (token.text, token.pos_) tuples, one per token, in order.
    """
    tagged_tokens = []
    for token in nlp(text):
        tagged_tokens.append((token.text, token.pos_))
    return tagged_tokens

# Apply POS tagging to each dataframe
HumanDataset['pos_tags'] = HumanDataset['text'].apply(pos_tagging)
AIDataset['pos_tags'] = AIDataset['text'].apply(pos_tagging)

# Flatten the list of POS tags in each data frame
pos_list_human = [pos for sublist in HumanDataset['pos_tags'] for _, pos in sublist]
pos_list_AI = [pos for sublist in AIDataset['pos_tags'] for _, pos in sublist]

# Create a frequency distribution of POS tags
pos_freq_human = pd.Series(pos_list_human).value_counts()
pos_freq_AI = pd.Series(pos_list_AI).value_counts()

# Combine the frequency distributions into one wide frame (one row per tag);
# a tag that is absent from one dataset gets a count of 0.
pos_freq_wide = pd.DataFrame({
    'Human': pos_freq_human,
    'AI': pos_freq_AI
}).fillna(0)

# Bar order: most frequent tag overall first. The order is handed to the plot
# instead of re-sorting the 'Frequency' column on its own, which would detach
# the counts from the tags they belong to.
pos_tag_order = pos_freq_wide.sum(axis=1).sort_values(ascending=False).index.tolist()

# Long format for plotting: one row per (POS Tag, Dataset) pair
pos_freq_combined = (
    pos_freq_wide
    .rename_axis('POS Tag')
    .reset_index()
    .melt(id_vars='POS Tag', var_name='Dataset', value_name='Frequency')
)

# Plotting the POS tag frequencies
plt.figure(figsize=(14, 7))
sns.barplot(x='POS Tag', y='Frequency', hue='Dataset', data=pos_freq_combined,
            order=pos_tag_order, palette='viridis')
plt.title('Part of Speech Tag Distribution: Human vs AI')
plt.xlabel('POS Tag')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.legend(title='Dataset')
plt.show()

In [None]:
#data Cleaning
# Make the NLTK stop-word corpus available, then load the English word list
# that the token filter further down checks against.
nltk.download("stopwords")
stop = stopwords.words("english")

def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` deleted.

    Args:
        text: the string to strip.

    Returns:
        A new string; all other characters keep their original order.
    """
    # In a translation table, a code point mapped to None is removed.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)
# Stop words as a set for constant-time lookups. The list contains
# contractions written with apostrophes; the text has its punctuation stripped
# before filtering, so the punctuation-free spellings are added as well.
stop_words = set(stop) | {remove_punctuation(word) for word in stop}

data['text'] = (
    data['text']
    # a line break becomes a space so the words on either side stay separate
    .str.replace('\n', ' ', regex=False)
    .apply(remove_punctuation)
    # raw string: '\d' is not a valid escape sequence in a normal literal
    .str.replace(r'\d+', '', regex=True)
    # lower-case BEFORE filtering: the stop-word list is lower-case, so
    # capitalised stop words would otherwise slip through
    .str.lower()
    .apply(lambda doc: ' '.join(word for word in doc.split() if word not in stop_words))
)
data.head()


In [None]:
#Visualise the Data
# Pie chart of the class balance: each wedge is annotated with its row count
# and the legend lists the label values in the same order.
label_count = data['label'].value_counts()
plt.pie(label_count, labels=label_count)
plt.title('Labels')
plt.legend(label_count.index.tolist())

# Box plot of the label column, drawn on its own wider figure
var_data = data['label']
fig = plt.figure(figsize=(10, 4))
plt.boxplot(var_data)
plt.title('data distribution')

In [None]:
#word clouds
# One cloud per class, drawn in the same order as before: label 1, then label 0.
for label_value in (1, 0):
    # Concatenate every text of the current class into a single string
    all_words = ' '.join(data.loc[data['label'] == label_value, 'text'])
    wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=110).generate(all_words)
    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()

In [None]:
#show word length distribution
import seaborn as sns

# One histogram of 'word_count' per label value, at most two panels per row
g = sns.FacetGrid(data, col='label', col_wrap=2)
g.map(plt.hist, 'word_count')
g.set_axis_labels("Word Length")
plt.show()

#remove unnecessary columns from the data
# (in place; running this cell a second time fails because the columns are gone)
data.drop(columns=["title", "word_count"], inplace=True)

In [None]:
X = data['text'].values
y = data['label'].values

# Split the documents BEFORE vectorising: the vocabulary and IDF weights must be
# learned from the training documents only, otherwise information about the
# held-out texts leaks into the features the SVM is trained on.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Using TF-IDF vectorization to convert text data into numerical features:
# fit on the training split, then apply the fitted vocabulary to the test split
tfidf_vectorizer = TfidfVectorizer(max_features=1500)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test_text).toarray()

# Full-corpus matrix, kept under its original name for any code that still reads it
X_tfidf = tfidf_vectorizer.transform(X).toarray()

#Mention the amount of in Training and in Testing
print(f'Training {X_train_tfidf.shape[0]}\nTesting {X_test_tfidf.shape[0]}')

In [None]:
#SVM hyper parameter tuning
# Candidate values sampled by the randomized search (n_iter is left at its default)
param_grid = {'C': [0.1, 1, 10],
              'gamma': [1, 0.1, 0.01],
              'kernel': ['rbf','linear']}
svm = SVC()

# random_state pins which candidates are drawn, so re-running this cell gives
# the same search instead of depending on the current global RNG state.
grid = RandomizedSearchCV(estimator = svm,
                           param_distributions = param_grid,
                           cv=5,
                           random_state=42)

#Fitting the model
grid.fit(X_train_tfidf,y_train)
print(f'Best Parameters:  {grid.best_estimator_}')

In [None]:
#Predictions
# Score the held-out TF-IDF features with the best estimator found by the search
best_svm = grid.best_estimator_
predictions = best_svm.predict(X_test_tfidf)
accuracy_score(y_test, predictions)

In [None]:
#Classification report for the tuned model
svm_report = classification_report(y_test, predictions)
print(svm_report)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
#creation of the confusion matrix with the y_test and the prediction value generated
# scikit-learn puts the TRUE labels on the rows and the PREDICTED labels on the
# columns, in the order given by `labels`. Label 0 is the human class and
# label 1 the machine class, so index 0 must be named 'Human'.
class_names = ['Human', 'Machine']
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
sns.heatmap(cm,
            annot=True,
            fmt='g',
            xticklabels=class_names,
            yticklabels=class_names)
# heatmap rows run along the y-axis -> actual; columns along the x-axis -> prediction
plt.ylabel('Actual',fontsize=13)
plt.xlabel('Prediction',fontsize=13)
plt.title('Confusion Matrix',fontsize=17)
plt.show()

In [None]:
#Configuration logged to Weights & Biases and read when compiling the model
CFG = {
    'optimiser': 'Adam',
    'learning_rate': 1e-6,
}

In [None]:
#BERT Model
# Function to create the BERT model
def bert_model():
    """Build and compile the BERT-based binary text classifier.

    Raw strings pass through the TF-Hub preprocessing layer and the trainable
    BERT encoder; the encoder's pooled output then feeds
    Dropout(0.51) -> Dense(64, relu) -> Dropout(0.3) -> Dense(1, sigmoid).
    The model summary is printed as a side effect.

    Returns:
        A tf.keras.Model compiled with Adam at CFG['learning_rate'],
        binary cross-entropy loss and the "acc" metric.
    """
    model_path ="/kaggle/input/bert/tensorflow2/bert-en-uncased-l-12-h-768-a-12/2"
    preprocess_path = "/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/"

    # Input layer for text data: one scalar string per example
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

    # Preprocessor layer to convert text input to BERT-compatible format
    preprocessor = hub.KerasLayer(preprocess_path)
    encoder_inputs  = preprocessor(text_input) # preprocessed text

    # Encoder layer using the BERT model (weights are fine-tuned)
    encoder = hub.KerasLayer(model_path,trainable=True)
    outputs = encoder(encoder_inputs)

    # Only the pooled output is used by the classification head:
    # [batch_size, hidden_size] (768 going by the "h-768" in the model path)
    pooled_output = outputs['pooled_output']

    # Add dropout and dense layers for classification
    dropout = tf.keras.layers.Dropout(0.51 , name="dropout1")(pooled_output)
    dense_2 = tf.keras.layers.Dense(64 , activation='relu')(dropout)
    dropout = tf.keras.layers.Dropout(0.3 , name="dropout2")(dense_2)

    # Single sigmoid unit; predictions are later rounded to a 0/1 label
    dense_out = tf.keras.layers.Dense(1 , activation='sigmoid', name='output')(dropout)

    model = tf.keras.Model(inputs=text_input, outputs=dense_out)
    model.summary()

    # Compile the model with optimizer, loss function, and evaluation metrics
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=CFG['learning_rate']),
              loss='binary_crossentropy',
              metrics=["acc"])
    return model

In [None]:
from sklearn.metrics import recall_score
from wandb.keras import WandbCallback

# Build and compile the BERT classifier
model = bert_model()

#Lists for Hyperparameter Tuning
# Epoch counts to try: 1 through 5
epochs = list(range(1, 6))

In [None]:
#Splitting the Data
# 70/30 split of the text column and the flattened label array, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'].to_numpy().ravel(),
    test_size=0.3,
    random_state=42,
)

#Mention the amount of in Training and in Testing
print(f'Training {len(X_train)}\nTesting {len(X_test)}')

In [None]:
for x in epochs:
    # Initialise a new run in Weights and Biases (W&B) for tracking this experiment;
    # the epoch count is stored in the run config so the runs can be told apart.
    run = wandb.init(name = "BERT_Hyperparameter",
               project = "Dissertation_FinalResults",
               config = {**CFG, "epochs": x},
    )
    try:
        # Start every run from a freshly built model. Re-using one model across
        # iterations keeps its weights, so the run logged as "x epochs" would
        # really have been trained for the sum of all epoch counts so far.
        tf.keras.backend.clear_session()
        model = bert_model()

        # Train the model with the current number of epochs
        history = model.fit(X_train, y_train,
                            batch_size=8,
                            callbacks=[WandbCallback()],
                            verbose = True,
                            epochs=x,
                            validation_data=(X_test, y_test)
                           )

        loss_train , acc_train = model.evaluate(X_train, y_train)
        print("Acc. Train data using epoch",x,":",acc_train)
        loss_test , acc_test = model.evaluate(X_test, y_test)
        print("Acc. Test data using epoch",x,":",acc_test)

        # Make predictions on the test data; y_pred keeps the raw sigmoid
        # outputs, the rounded 1-D copy is what the metrics are computed on.
        y_pred = model.predict(X_test)
        y_pred_labels = y_pred.round().ravel()

        # Calculate performance metrics
        accuracy = accuracy_score(y_test, y_pred_labels)
        precision = precision_score(y_test, y_pred_labels, average='weighted')
        f1 = f1_score(y_test, y_pred_labels, average='weighted')
        recall = recall_score(y_test, y_pred_labels, average='weighted')

        # Log the metrics to W&B
        wandb.log({
            "Epoch":x,
            "Train Loss":loss_train,
            "Train Accuracy": acc_train,
            "Accuracy": accuracy,
            "Precision": precision,
            "F1 Score": f1,
            "Recall": recall,
            "Val Loss": loss_test,
            "Val Accuracy": acc_test
        })
    finally:
        # Complete W&B run, also when training or evaluation raised
        run.finish()

In [None]:
#Convert the predictions to either 0 or 1
# Round the model outputs and collapse them into a 1-D array
y_pred = np.asarray(y_pred.round()).reshape(-1)
y_pred

In [None]:
#predictions on the testing data
# Side-by-side table of the true labels and the rounded model predictions
label_columns = {
    'Original Labels': y_test,
    'Predicted Labels': y_pred.round(),
}
Test_Labels = pd.DataFrame(label_columns)
Test_Labels

In [None]:
#Export file to CSV
# Write and read back through the same path, so the check reads the file that
# was just written whatever the current working directory is.
test_labels_path = "test_predict_labels.csv"
Test_Labels.to_csv(test_labels_path, index=False)
pd.read_csv(test_labels_path)

In [None]:
#Confusion matrix
import seaborn as sns
# scikit-learn puts the TRUE labels on the rows and the PREDICTED labels on the
# columns, in the order given by `labels`. Label 0 is the human class and
# label 1 the machine class, so index 0 must be named 'Human'.
class_names = ['Human', 'Machine']
cm = metrics.confusion_matrix(y_test, y_pred.round(), labels=[0, 1])

sns.heatmap(cm,
            annot=True,
            fmt='g',
            xticklabels=class_names,
            yticklabels=class_names)
# heatmap rows run along the y-axis -> actual; columns along the x-axis -> prediction
plt.ylabel('Actual',fontsize=13)
plt.xlabel('Prediction',fontsize=13)
plt.title('Confusion Matrix',fontsize=17)
plt.show()

In [None]:
#Prediction on Test Data the model has never seen
# The model was trained on text with line breaks, punctuation, digits and stop
# words removed and everything lower-cased. The unseen abstracts get the same
# treatment before prediction so the input resembles the training text.
# Test_data['abstract'] itself is left untouched.
unseen_stop_words = set(stop) | {remove_punctuation(word) for word in stop}
test_abstracts_clean = (
    Test_data['abstract']
    .str.replace('\n', ' ', regex=False)
    .apply(remove_punctuation)
    .str.replace(r'\d+', '', regex=True)
    .str.lower()
    .apply(lambda doc: ' '.join(word for word in doc.split() if word not in unseen_stop_words))
)

test_data_pred = model.predict(test_abstracts_clean)
test_data_pred = np.array(test_data_pred).flatten()

In [None]:
# Round each prediction to the nearest whole number (0 or 1)
test_data_pred = np.round(test_data_pred)

In [None]:
#Saving the output in a CSV file
# NOTE(review): 'generated Percentage' receives test_data_pred, which has
# already been rounded at this point, so it holds labels rather than
# percentages. The column name is kept so the CSV layout does not change.
submission = pd.DataFrame({'abstract': Test_data['abstract'] ,'original Label':Test_data['is_ai_generated'] ,'generated Percentage': test_data_pred})

# Write and read back through the same path, so the check reads the file that
# was just written whatever the current working directory is.
submission_path = 'submissionBERT.csv'
submission.to_csv(submission_path, index=False)  # Save the CSV file
pd.read_csv(submission_path)