In [1]:
!pip install numpy
!pip install pandas
!pip install tensorflow
!pip install scikit-learn


Collecting ml-dtypes~=0.3.1 (from tensorflow)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: ml-dtypes
  Attempting uninstall: ml-dtypes
    Found existing installation: ml_dtypes 0.5.0
    Uninstalling ml_dtypes-0.5.0:
      Successfully uninstalled ml_dtypes-0.5.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jax 0.4.35 requires ml-dtypes>=0.4.0, but you have ml-dtypes 0.3.2 which is incompatible.[0m[31m
[0mSuccessfully installed ml-dtypes-0.3.2


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split

# Define the file path for the dataset
file_path = "/kaggle/input/combined-data/Combined Data.csv"

try:
    # Read the dataset using the ISO-8859-1 encoding, suitable for Windows-formatted files
    df = pd.read_csv(file_path, index_col=0, encoding="ISO-8859-1")
    # Display a random sample of 5 rows from the dataset to verify data integrity
    print(df.sample(5))
except FileNotFoundError:
    # Handle the case where the specified file is not found
    print(f"Error: File not found at {file_path}. Please check the file path.")
except Exception as e:
    # Catch and display any other exceptions encountered during file reading
    print(f"An error occurred while reading the file: {e}")

# Extract features (statements) and labels (statuses) from the dataset
X = df['statement'].tolist()
y = df['status'].tolist()

# Replace missing values in the features with empty strings
X = [str(text) if not pd.isnull(text) else '' for text in X]

# Initialize the tokenizer for text preprocessing
max_words = 5000  # Maximum number of words to keep in the vocabulary
max_len = 100     # Maximum sequence length for padding

# Create the tokenizer and fit it on the text data
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X)

# Convert text data to sequences of integers
X_sequences = tokenizer.texts_to_sequences(X)

# Pad the sequences to ensure uniform length
X_padded = pad_sequences(X_sequences, maxlen=max_len, truncating='post')

# Encode the labels into numerical format using LabelEncoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Convert the encoded labels to categorical format
y_categorical = tf.keras.utils.to_categorical(y_encoded, num_classes=len(label_encoder.classes_))

# Split the data into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define the architecture of the sequential model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),  # Embedding layer for text input
    LSTM(128, return_sequences=True),  # First LSTM layer with output returned at each timestep
    Dropout(0.5),  # Dropout for regularization
    LSTM(64),  # Second LSTM layer
    Dense(64, activation='relu'),  # Fully connected layer with ReLU activation
    Dropout(0.3),  # Dropout for regularization
    Dense(len(label_encoder.classes_), activation='softmax')  # Output layer with softmax activation
])

# Compile the model with Adam optimizer and categorical crossentropy loss
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up callbacks for early stopping and model checkpointing
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_accuracy')

# Train the model on the training data, with validation during training
history = model.fit(
    X_train, y_train,
    epochs=50,  # Maximum number of epochs
    batch_size=32,  # Batch size for training
    validation_data=(X_val, y_val),  # Validation data
    callbacks=[early_stopping, model_checkpoint]  # Callbacks for monitoring training
)

# Evaluate the model performance on the test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')


                                               statement    status
71457  It must feel like theres a clock ticking loud...    Stress
22206  I am a pitiful being and I just want to kill m...  Suicidal
49119  How do I stop losing track of all the things I...    Stress
45098  to transcode a 00mb wmv to a 00mb flv file wit...    Normal
43624  is looking at the gray sky the sun ha been sto...    Normal




Epoch 1/50
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 12ms/step - accuracy: 0.7017 - loss: 0.8355 - val_accuracy: 0.8038 - val_loss: 0.4559
Epoch 2/50
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 11ms/step - accuracy: 0.8156 - loss: 0.4358 - val_accuracy: 0.8328 - val_loss: 0.3922
Epoch 3/50
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 11ms/step - accuracy: 0.8594 - loss: 0.3555 - val_accuracy: 0.8696 - val_loss: 0.3378
Epoch 4/50
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 12ms/step - accuracy: 0.8832 - loss: 0.2890 - val_accuracy: 0.8744 - val_loss: 0.3224
Epoch 5/50
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 11ms/step - accuracy: 0.8997 - loss: 0.2539 - val_accuracy: 0.8776 - val_loss: 0.3255
Epoch 6/50
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 11ms/step - accuracy: 0.9146 - loss: 0.2208 - val_accuracy: 0.8776 - val_loss: 0.3357
Epoc