In [None]:
import os

# Turn off oneDNN custom ops (and the startup notice TensorFlow prints about
# them). The variable has to be in the environment BEFORE TensorFlow is
# imported -- setting it after the import, as this script used to do, is too
# late to have any effect. `transformers` pulls in TensorFlow as well, so it
# must also be imported after this line.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import matplotlib.pyplot as plt  # noqa: E402
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
import seaborn as sns  # noqa: E402
import tensorflow as tf  # noqa: E402
from sklearn.metrics import classification_report, confusion_matrix  # noqa: E402
from sklearn.model_selection import train_test_split  # noqa: E402
from sklearn.preprocessing import LabelEncoder  # noqa: E402
from sklearn.utils.class_weight import compute_class_weight  # noqa: E402
from transformers import BertTokenizer, TFBertForSequenceClassification  # noqa: E402

# Load the dataset (expects at least a 'Lyrics' text column and a 'Genre'
# label column).
data = pd.read_csv('final_data 3 genres.csv')

# Drop rows that are missing either the lyrics or the genre. Without this,
# the `astype(str)` below turns a missing lyric into the literal text "nan",
# which would then be tokenized and trained on as if it were a real song,
# and a missing genre cannot be encoded into a meaningful class.
data = data.dropna(subset=['Lyrics', 'Genre']).reset_index(drop=True)

# Encode genres to numeric labels; label_encoder.classes_ keeps the mapping
# from integer id back to genre name for the reports further down.
label_encoder = LabelEncoder()
data['Genre_Encoded'] = label_encoder.fit_transform(data['Genre'])

# Split data into train and validation sets: 5% is held out for validation,
# with a fixed seed so the split is reproducible between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['Lyrics'].astype(str),
    data['Genre_Encoded'],
    test_size=0.05,
    random_state=42,
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize texts.
# padding='max_length' pads every example to exactly 128 tokens (and
# truncation=True cuts longer ones down to 128). padding=True would only pad
# to the longest sequence seen in each call, so the train and validation
# encodings could come out narrower than 128 -- and different from each
# other -- which does not match the fixed (128,) input shape the Keras model
# is built with.
train_encodings = tokenizer(
    train_texts.tolist(),
    truncation=True,
    padding='max_length',
    max_length=128,
)
val_encodings = tokenizer(
    val_texts.tolist(),
    truncation=True,
    padding='max_length',
    max_length=128,
)

# Create TensorFlow datasets
def _to_dataset(encodings, labels):
    """Slice tokenizer output and labels into (features dict, label) pairs."""
    features = dict(encodings)
    return tf.data.Dataset.from_tensor_slices((features, labels))


# Training data: shuffle with a buffer as large as the training set, then
# batch 16 examples at a time.
train_dataset = _to_dataset(train_encodings, train_labels)
train_dataset = train_dataset.shuffle(len(train_texts))
train_dataset = train_dataset.batch(16)

# Validation data keeps its original order; only batching is applied.
val_dataset = _to_dataset(val_encodings, val_labels).batch(16)

# Load pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.trainable = True  # Fine-tune BERT

# Define additional layers
# One fixed-length (128 token) int32 input per tensor the tokenizer produces.
inputs = {key: tf.keras.Input(shape=(128,), dtype=tf.int32, name=key) for key in ['input_ids', 'token_type_ids', 'attention_mask']}

# Feed the custom head from the encoder's pooled [CLS] representation rather
# than from the classifier's logits. The logits are only `num_labels` wide,
# so the previous Dropout(0.5) on them randomly zeroed entire class scores
# and the Dense layers had just a handful of numbers to work with. The pooled
# output carries the full hidden-size sentence embedding.
bert_outputs = model.bert(inputs).pooler_output
dropout_output = tf.keras.layers.Dropout(0.5)(bert_outputs)
dense_output = tf.keras.layers.Dense(128, activation='relu')(dropout_output)
# Softmax output: one probability per genre (paired with from_logits=False).
final_output = tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')(dense_output)
dropout_model = tf.keras.Model(inputs=inputs, outputs=final_output)

# Compile the model
# The network ends in a softmax layer, so the loss receives probabilities
# (from_logits=False) together with integer class ids (sparse labels).
metrics = ['accuracy']
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
dropout_model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=metrics,
)

# Compute class weights
# 'balanced' weights are inversely proportional to class frequency in the
# training split, so under-represented genres count for more in the loss.
_train_classes = np.unique(train_labels)
_balanced_weights = compute_class_weight('balanced', classes=_train_classes, y=train_labels)

# Key each weight by the class id it was computed for. Indexing by position
# (0, 1, 2, ...) only gives the right mapping when every class id from 0 to
# n-1 is present in the training split; zipping with the actual ids is
# correct either way.
class_weights = dict(zip(_train_classes.tolist(), _balanced_weights.tolist()))

# Define early stopping callback
# Training halts once val_loss has gone 3 epochs without improving, and the
# weights from the best epoch are restored.
_early_stopping_options = {
    'monitor': 'val_loss',
    'patience': 3,
    'restore_best_weights': True,
}
early_stopping = tf.keras.callbacks.EarlyStopping(**_early_stopping_options)

# Train the model with early stopping
# At most 10 epochs, with the per-class weights applied to the loss.
_fit_options = {
    'validation_data': val_dataset,
    'epochs': 10,
    'callbacks': [early_stopping],
    'class_weight': class_weights,
}
history = dropout_model.fit(train_dataset, **_fit_options)


# Evaluate the model
val_loss, val_accuracy = dropout_model.evaluate(val_dataset)

# Print evaluation metrics
print(f'Validation Loss: {val_loss:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}')

# Predict on validation set
# The model outputs one probability per class; argmax picks the class id.
# val_dataset is not shuffled, so predictions line up with val_labels.
val_predictions = dropout_model.predict(val_dataset)
val_pred_labels = np.argmax(val_predictions, axis=1)

# Print classification report
# Pass the full list of class ids explicitly. The validation split is small
# and not stratified, so a genre may be absent from both val_labels and the
# predictions; without `labels`, scikit-learn would then find fewer classes
# than there are target_names and raise instead of printing the report.
print(classification_report(
    val_labels,
    val_pred_labels,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))


# Build the confusion matrix from the validation labels and predictions.
# `labels` fixes the matrix to one row/column per known genre, in the same
# order as label_encoder.classes_, so the tick labels below always line up
# even if some genre never appears in this validation split.
conf_matrix = confusion_matrix(
    val_labels,
    val_pred_labels,
    labels=np.arange(len(label_encoder.classes_)),
)

# Plot confusion matrix (rows: true genre, columns: predicted genre)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Save the trained model
# Raw string so the backslashes in the Windows path are taken literally.
model_save_path = r'C:\python\pythonPtixiaki\best model'
dropout_model.save(filepath=model_save_path)
