In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the three poetry corpora (one CSV per source).
bird_poetry = pd.read_csv('/content/drive/MyDrive/data/BARD/BARD_poetry.csv')  # columns: index, prompts, BARD
gpt_poetry = pd.read_csv('/content/drive/MyDrive/data/GPT/ChatGPT_poetry.csv')    # columns: prompts, responses
human_poetry = pd.read_csv('/content/drive/MyDrive/data/Human/human_poetry.csv') # columns: index, Title, Poem, Poet, Tags

# Give every frame a common 'text' column holding the string to classify.
bird_poetry['text'] = bird_poetry['BARD']
gpt_poetry['text'] = gpt_poetry['responses']
# Fill missing fields *before* concatenating: `str + NaN` evaluates to NaN, so
# a single missing Poem/Poet/Tags value would otherwise wipe out the whole row
# (and the later fillna('') would turn it into an empty training sample).
# NOTE(review): Poet and Tags are only available for the human class, so
# appending them may let the classifier key on metadata rather than on the
# poem itself — confirm this is intended.
human_poetry['text'] = (
    human_poetry['Poem'].fillna('')
    + ' ' + human_poetry['Poet'].fillna('')
    + ' ' + human_poetry['Tags'].fillna('')
)

# Integer class ids used as training targets.
bird_poetry['source'] = 1  # BARD
gpt_poetry['source'] = 0    # GPT
human_poetry['source'] = 2   # Human

# Use only the first 2500 rows of the human dataset
human_poetry = human_poetry.head(2500)

# Stack the three sources into one frame with a fresh 0..n-1 index.
poetry_data = pd.concat(
    [
        bird_poetry[['text', 'source']],
        gpt_poetry[['text', 'source']],
        human_poetry[['text', 'source']],
    ],
    ignore_index=True,
)

# Handle NaN values: replace NaN with an empty string so the .str accessor
# below operates on strings only.
poetry_data['text'] = poetry_data['text'].fillna('')

# Clean the text.
# regex=True is passed explicitly on every call: since pandas 2.0
# `Series.str.replace` defaults to regex=False and would treat these patterns
# as literal strings, leaving URLs and HTML tags untouched.
poetry_data['text'] = (
    poetry_data['text']
    # Remove URLs ('http\S+' already covers 'https...'); case-insensitive.
    .str.replace(r'http\S+|www\S+', '', case=False, regex=True)
    # Remove HTML tags (non-greedy so each tag is matched separately).
    .str.replace(r'<.*?>', '', regex=True)
    # Collapse every run of whitespace (including newlines) into one space.
    .str.replace(r'\s+', ' ', regex=True)
    # Convert to lowercase.
    .str.lower()
)

# Model inputs are the cleaned texts; targets are the integer source ids.
X, y = poetry_data['text'], poetry_data['source']

# Build the word index from every text, then encode each text as a list of
# integer word ids.
# NOTE(review): the vocabulary is fitted before the train/test split, so it
# also covers words that occur only in the test texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Embedding table size: one row per known word plus one extra for padding.
vocab_size = 1 + len(tokenizer.word_index)
# Every sequence is padded up to the length of the longest one.
max_length = max(map(len, X_sequences))

X_padded = pad_sequences(X_sequences, maxlen=max_length)

# Hold out 20% for testing; stratify keeps the class proportions in both
# parts and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Hybrid CNN-LSTM classifier, declared as an ordered list of layers.
# NOTE(review): newer Keras releases report `input_length` as deprecated —
# confirm against the installed version.
model = Sequential([
    # Word ids -> 128-dimensional dense vectors.
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
    # Convolution over windows of 5 consecutive tokens.
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    # Halve the sequence length before the recurrent layer.
    MaxPooling1D(pool_size=2),
    # Only the final LSTM state is passed on.
    LSTM(64, return_sequences=False),
    # Regularization.
    Dropout(0.5),
    # Fully connected layer.
    Dense(64, activation='relu'),
    # One softmax output per class (3 classes).
    Dense(3, activation='softmax'),
])

# Targets are integer class ids, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs; the test split is also used for per-epoch validation.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)

# Predicted class = index of the largest softmax output for each test sample.
class_probs = model.predict(X_test)
y_pred = np.argmax(class_probs, axis=-1)

# Overall accuracy, then precision / recall / F1 using the 'weighted' average
# across classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score_fn(y_test, y_pred, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Summary figures, four decimals each.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Per-class breakdown; names are listed in label-id order (0, 1, 2).
print(classification_report(y_test, y_pred, target_names=['GPT', 'BARD', 'Human']))




Epoch 1/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 567s 7s/step - accuracy: 0.7777 - loss: 0.6259 - val_accuracy: 0.9100 - val_loss: 0.1754
Epoch 2/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 543s 7s/step - accuracy: 0.9316 - loss: 0.1150 - val_accuracy: 0.9217 - val_loss: 0.1478
Epoch 3/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 539s 7s/step - accuracy: 0.9189 - loss: 0.1213 - val_accuracy: 0.9067 - val_loss: 0.1758
Epoch 4/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 566s 7s/step - accuracy: 0.9381 - loss: 0.1188 - val_accuracy: 0.9683 - val_loss: 0.1211
Epoch 5/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 566s 7s/step - accuracy: 0.9848 - loss: 0.0498 - val_accuracy: 0.9683 - val_loss: 0.0889
Epoch 6/10
75/75 ━━━━━━━━━━━━━━━━━━━━ 569s 7s/step - accuracy: 0.9957 - loss: 0.0164 - val_accuracy: 0.9783 - val_loss: 0.0830
Epoch 7/10
[1m75/75[0m [32m━━━━