In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read each corpus and, in the same step, expose its code under a common
# 'text' column together with an integer 'source' label.
# Label scheme used throughout this cell: 0 = GPT, 1 = BARD, 2 = Human.

# BARD responses (author's note on the CSV columns: index, prompts, BARD)
bird_code = pd.read_csv('/content/drive/MyDrive/data/BARD/BARD_pycode.csv')
bird_code = bird_code.assign(text=bird_code['BARD'], source=1)

# ChatGPT responses (author's note on the CSV columns: prompts, responses)
gpt_code = pd.read_csv('/content/drive/MyDrive/data/GPT/ChatGPT_pycode.csv')
gpt_code = gpt_code.assign(text=gpt_code['responses'], source=0)

# Human-written code (author's note on the CSV columns: Task, Code)
human_code = pd.read_csv('/content/drive/MyDrive/data/Human/human_code.csv')
human_code = human_code.assign(text=human_code['Code'], source=2)

# Stack only the two shared columns into one frame with a fresh 0..n-1 index,
# then discard rows whose 'text' is missing (the index is not renumbered
# again after the drop).
labelled_columns = ['text', 'source']
code_data = pd.concat(
    [frame[labelled_columns] for frame in (bird_code, gpt_code, human_code)],
    ignore_index=True,
).dropna(subset=['text'])

# Clean the text: strip URLs, strip HTML-like tags, collapse whitespace runs
# to a single space, and lowercase.
#
# Every pattern-based step passes regex=True explicitly. Since pandas 2.0 the
# default for Series.str.replace is regex=False, under which the URL and tag
# patterns would be matched as literal strings and therefore never match.
#
# NOTE(review): on source code, r'<.*?>' also removes any same-line span that
# starts with '<' and ends with '>' (e.g. `a < b and c > d`) — confirm this
# is acceptable for the classifier's input.
code_data['text'] = (
    code_data['text']
    .str.replace(r'http\S+|www\S+|https\S+', '', case=False, regex=True)  # Remove URLs
    .str.replace(r'<.*?>', '', case=False, regex=True)  # Remove HTML tags
    .str.replace(r'\s+', ' ', regex=True)  # Collapse runs of whitespace
    .str.lower()  # Convert to lowercase
)

# Inputs are the cleaned snippets; targets are the integer source labels.
X, y = code_data['text'], code_data['source']

# Learn the vocabulary from the snippets and encode each one as a list of
# integer token ids.
# NOTE(review): the tokenizer is fitted on all rows before the train/test
# split below, so the vocabulary also reflects test-set text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Embedding table size: one slot per known word, plus one extra for padding.
vocab_size = 1 + len(tokenizer.word_index)

# Every sequence is padded out to the length of the longest one.
max_length = max(map(len, X_sequences))
X_padded = pad_sequences(X_sequences, maxlen=max_length)

# Hold out 20% for testing, keeping the class proportions (stratify) and a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# 1-D CNN text classifier, declared as a single layer list:
# token ids -> embeddings -> convolution -> pooling -> dense head -> softmax.
model = Sequential([
    # Map each token id to a 128-dimensional vector.
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
    # 64 filters, each spanning 5 consecutive token positions.
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    # Max-pool along the sequence axis with a window of 2.
    MaxPooling1D(pool_size=2),
    # Collapse the pooled feature maps into one vector per sample.
    Flatten(),
    Dense(64, activation='relu'),
    # Drop half of the dense activations during training for regularization.
    Dropout(0.5),
    # One output per source label (three labels: 0, 1, 2).
    Dense(3, activation='softmax'),
])

# The sparse loss variant is used because the targets are integer class ids,
# not one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit for 10 epochs with mini-batches of 32.
# NOTE(review): validation_data is the same held-out split that the final
# metrics below are computed on, so the per-epoch val_* figures are not an
# independent check of those metrics.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)

# Predicted class = index of the largest softmax output for each sample.
class_probabilities = model.predict(X_test)
y_pred = class_probabilities.argmax(axis=-1)

# Overall accuracy, plus precision / recall / F1 with weighted averaging.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Summary figures, then the per-class breakdown. target_names is listed in
# label order: 0 = GPT, 1 = BARD, 2 = Human.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
report = classification_report(y_test, y_pred, target_names=['GPT', 'BARD', 'Human'])
print(report)




Epoch 1/10
121/121 ━━━━━━━━━━━━━━━━━━━━ 73s 586ms/step - accuracy: 0.8724 - loss: 0.3934 - val_accuracy: 0.9605 - val_loss: 0.1093
Epoch 2/10
121/121 ━━━━━━━━━━━━━━━━━━━━ 77s 632ms/step - accuracy: 0.9599 - loss: 0.1021 - val_accuracy: 0.9740 - val_loss: 0.0843
Epoch 3/10
121/121 ━━━━━━━━━━━━━━━━━━━━ 76s 582ms/step - accuracy: 0.9704 - loss: 0.0907 - val_accuracy: 0.9834 - val_loss: 0.0476
Epoch 4/10
121/121 ━━━━━━━━━━━━━━━━━━━━ 83s 595ms/step - accuracy: 0.9823 - loss: 0.0497 - val_accuracy: 0.9906 - val_loss: 0.0217
Epoch 5/10
121/121 ━━━━━━━━━━━━━━━━━━━━ 83s 598ms/step - accuracy: 0.9933 - loss: 0.0200 - val_accuracy: 0.9917 - val_loss: 0.0231
Epoch 6/10
121/121 ━━━━━━━━━━━━━━━━━━━━ 79s 573ms/step - accuracy: 0.9915 - loss: 0.0229 - val_accuracy: 0.9896 - val_loss: 0.0238
Epoch 7/10