# Import Libraries and Check GPU Availability

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Check if TensorFlow detects a GPU 


In [2]:
# Ask TensorFlow which physical GPU devices it can see; an empty list means
# no GPU is visible to this process.
gpus = tf.config.list_physical_devices(device_type='GPU')
print("GPUs detected:", gpus)

# Load and Inspect the Dataset


In [3]:
# Path to your CSV file. The DATASET_PATH environment variable, when set,
# overrides the author's local default so the notebook also runs on machines
# where that absolute Windows path does not exist.
data_path = os.environ.get(
    "DATASET_PATH",
    r"C:\Users\rayon\Jam3a AI\Graduation project\(FINAL)merged_dataset.csv",
)

# Load the dataset
df = pd.read_csv(data_path)

# Fail fast with a clear message if the two columns the rest of the notebook
# relies on ('text' for the input, 'label' for the target) are absent.
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{data_path} is missing required column(s): {sorted(missing_columns)}"
    )

# Display the first few rows to check the format
print(df.head())


# Preprocess the Data and Split into Training and Validation

In [4]:
# Map textual labels to integers
label_mapping = {"depressed": 1, "non_depressed": 0}

# Encode only when needed: calling .map() a second time on labels that are
# already 0/1 would turn every value into NaN, so re-running this cell must
# leave an already-encoded column untouched.
if not df['label'].isin(list(label_mapping.values())).all():
    encoded_labels = df['label'].map(label_mapping)

    # .map() silently yields NaN for any value missing from the mapping; stop
    # here instead of letting NaN targets reach the stratified split and the loss.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        raise ValueError(
            "Unexpected label value(s): "
            f"{df.loc[unmapped, 'label'].unique().tolist()}; "
            f"expected one of {list(label_mapping)}"
        )

    df['label'] = encoded_labels.astype('int64')

# Print the distribution of labels
print(df['label'].value_counts())

# Split the dataset (80% training, 20% validation), keeping the class ratio
# identical in both parts via stratification and fixing the seed so the split
# is reproducible.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

print("Number of training samples:", len(train_df))
print("Number of validation samples:", len(val_df))


# Create a Text Vectorization Layer and Prepare tf.data Datasets


## Hyperparameters

In [5]:
# Tunable settings shared by the text vectorizer, the model and the input pipelines
max_tokens = 10_000      # vocabulary cap for TextVectorization; also the Embedding input_dim
sequence_length = 200    # passed to TextVectorization as output_sequence_length
embedding_dim = 128      # output_dim of the Embedding layer
batch_size = 512         # number of examples per batch in the tf.data pipelines

In [6]:
# Create a TextVectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=sequence_length
)

# Adapt the vectorization layer to the training text data only, so the
# vocabulary is not influenced by the validation split.
vectorize_layer.adapt(train_df['text'].values)


def non_empty_text(text, label):
    """Return a boolean tensor that is True when `text` has non-whitespace content.

    Used as a `Dataset.filter` predicate; `label` is accepted only because the
    dataset elements are (text, label) pairs.
    """
    return tf.strings.length(tf.strings.strip(text)) > 0


def _vectorize_batch(texts, labels):
    """Replace a batch of raw strings with token ids; labels pass through unchanged."""
    return vectorize_layer(texts), labels


def _make_dataset(frame, training=False):
    """Build a batched, vectorized tf.data pipeline from a DataFrame.

    Args:
        frame: DataFrame with a 'text' column (raw strings) and a 'label' column.
        training: When True the examples are reshuffled at the start of every
            epoch. Keras does not shuffle tf.data inputs itself, so without
            this the model would see identical batches in identical order on
            every epoch.

    Returns:
        A dataset yielding (token_ids, labels) batches of up to `batch_size`
        examples.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        (frame['text'].values, frame['label'].values)
    )

    # Filter out any empty strings (if any slipped through)
    dataset = dataset.filter(non_empty_text)

    if training:
        dataset = dataset.shuffle(
            buffer_size=max(len(frame), 1),  # whole split -> uniform shuffle
            seed=42,
            reshuffle_each_iteration=True,
        )

    # Batch first so the vectorizer processes whole batches rather than one
    # string at a time, and let tf.data parallelise the mapping.
    return (
        dataset
        .batch(batch_size)
        .map(_vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)
        .prefetch(tf.data.AUTOTUNE)
    )


# The validation pipeline is deliberately left unshuffled so its iteration
# order is stable across passes.
train_ds = _make_dataset(train_df, training=True)
val_ds = _make_dataset(val_df)

In [7]:
# Size of the vocabulary the layer actually learned from the training text
# (the layer was created with max_tokens as its upper bound).
len(vectorize_layer.get_vocabulary())

In [8]:
# Peek at the first ten vocabulary entries. Presumably these are ordered by
# descending frequency (the original note calls them the most frequent
# words) — TODO confirm against the TextVectorization documentation.
vectorize_layer.get_vocabulary()[:10]

# Build the CNN Model for Text Classification

In [9]:
model = models.Sequential([
    # Explicit input spec: one sequence of `sequence_length` token ids per example.
    # This replaces Embedding's `input_length` argument, which current Keras
    # releases deprecate, and lets the model build immediately so that
    # model.summary() can report output shapes and parameter counts.
    layers.Input(shape=(sequence_length,), dtype='int64'),

    # Embedding layer: converts integer sequences to dense vectors
    layers.Embedding(input_dim=max_tokens, output_dim=embedding_dim),

    # Convolutional layer for feature extraction: 128 filters, kernel size 5
    layers.Conv1D(128, 5, activation='relu'),

    # Global max pooling to reduce each feature map to a single value
    layers.GlobalMaxPooling1D(),

    # Fully connected layer with dropout for regularization
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),

    # Output layer: Sigmoid activation for binary classification
    layers.Dense(1, activation='sigmoid')
])

# Compile the model: binary cross-entropy matches the single sigmoid output
# and the 0/1 labels.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()


# Train the Model

## Early stopping

In [None]:

# Upper bound on the number of epochs; the callback below can end training earlier
EPOCHS = 200

# Halt once validation loss has gone 10 epochs without improving, then restore
# the weights from the best epoch seen
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

history = model.fit(train_ds, epochs=EPOCHS, validation_data=val_ds, callbacks=[early_stop])


# Evaluate and Visualize Training History

In [None]:
# Per-epoch metric curves recorded by model.fit()
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Use the actual number of epochs that ran for the x-axis; early stopping can
# end training before the configured maximum.
epochs_range = range(len(acc))

# One figure, two side-by-side panels, addressed through explicit Axes objects
# rather than the implicit "current subplot" state.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Plot Accuracy
acc_ax.plot(epochs_range, acc, label='Training Accuracy')
acc_ax.plot(epochs_range, val_acc, label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')
acc_ax.set_title('Training and Validation Accuracy')

# Plot Loss
loss_ax.plot(epochs_range, loss, label='Training Loss')
loss_ax.plot(epochs_range, val_loss, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend(loc='upper right')
loss_ax.set_title('Training and Validation Loss')

plt.show()


# Evaluate model performance on the validation set using additional metrics


In [None]:

# Collect true labels and predicted probabilities in a single pass over the
# validation dataset. Reading both from the same batch keeps the two arrays
# aligned row-for-row even if the pipeline's iteration order ever stops being
# deterministic (e.g. if shuffling is added to it later).
y_true_batches = []
y_prob_batches = []
for batch_tokens, batch_labels in val_ds:
    y_prob_batches.append(model.predict_on_batch(batch_tokens))
    y_true_batches.append(batch_labels.numpy())

# Stack the per-batch results into one array each
y_pred_prob = np.concatenate(y_prob_batches, axis=0)
y_true = np.concatenate(y_true_batches, axis=0)

# Convert predicted probabilities to binary predictions (threshold: 0.5)
y_pred = (y_pred_prob > 0.5).astype("int32").flatten()

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
report = classification_report(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)

# Print the evaluation metrics
print("Validation Accuracy: {:.4f}".format(accuracy))
print("Validation Precision: {:.4f}".format(precision))
print("Validation Recall: {:.4f}".format(recall))
print("Validation F1 Score: {:.4f}".format(f1))
print("\nClassification Report:\n", report)
print("Confusion Matrix:\n", cm)


## Bar chart of validation metrics

In [None]:
# Pair each metric's display name with the value computed during evaluation
metrics_dict = dict(zip(
    ("Accuracy", "Precision", "Recall", "F1 Score"),
    (accuracy, precision, recall, f1),
))

# Draw one bar per metric on a single set of axes
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(
    list(metrics_dict.keys()),
    list(metrics_dict.values()),
    color=["blue", "green", "orange", "purple"],
)
ax.set_ylim(0, 1)
ax.set_ylabel("Score")
ax.set_title("Validation Metrics")

# Write each score just above the top of its bar, centred horizontally
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2.0
    ax.text(x_center, height + 0.01, f"{height:.2f}", ha="center", va="bottom")

plt.show()
