In [None]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load your data
train_df = pd.read_csv('train_selected.csv')
test_df = pd.read_csv('test_selected.csv')


def parse_emotions(raw):
    """Turn one comma-separated 'emotions' cell into a list of clean label names.

    Each piece is trimmed of surrounding whitespace and quote characters so that
    variants such as "sadness", " sadness" and "'sadness'" all map to the same
    class. Empty pieces are dropped. A non-string cell (e.g. NaN for a missing
    value) yields an empty list, i.e. a sample with no labels.
    """
    if not isinstance(raw, str):
        return []
    pieces = (piece.strip().strip("'\"").strip() for piece in raw.split(','))
    return [piece for piece in pieces if piece]


# 'emotions' holds multiple labels separated by commas; convert each cell to a list.
train_df['emotions'] = train_df['emotions'].apply(parse_emotions)
test_df['emotions'] = test_df['emotions'].apply(parse_emotions)

# Step 4: Preprocess the Data
# Encode the labels for multilabel classification.
# The binarizer is fitted on the training labels only and reused for the test set.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_df['emotions'])
y_test = mlb.transform(test_df['emotions'])

# Tokenize the text data
# The vocabulary is learned from the training text only, capped via num_words=5000.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df['text'])

# Convert text to sequences
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['text']) for frame in (train_df, test_df)
)

# Pad the sequences
# Every sequence is right-padded to the longest sequence found in either split.
longest_train = max(map(len, train_sequences))
longest_test = max(map(len, test_sequences))
max_length = max(longest_train, longest_test)
train_padded, test_padded = (
    pad_sequences(seqs, maxlen=max_length, padding='post')
    for seqs in (train_sequences, test_sequences)
)

# Step 5: Build the LSTM Model
# Architecture: embedding -> two stacked LSTMs (each followed by dropout)
# -> one sigmoid unit per emotion label, so every label is predicted independently.
model = Sequential()
# input_dim=5000 matches the num_words=5000 cap used when creating the Tokenizer.
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model.add(LSTM(128, return_sequences=True))  # full sequence feeds the next LSTM
model.add(Dropout(0.5))
model.add(LSTM(64))  # only the final state is passed to the classifier head
model.add(Dropout(0.5))
model.add(Dense(len(mlb.classes_), activation='sigmoid'))

# For a multi-unit output the 'accuracy' shortcut resolves to categorical accuracy,
# which only compares argmax positions and is meaningless for multi-label targets.
# 'binary_accuracy' scores each label independently (0.5 threshold) instead.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['binary_accuracy'],
)


# Step 6: Train the Model
# 10 epochs in mini-batches of 64; validation_split=0.1 reserves a fraction of
# the training data for validation.
history = model.fit(
    train_padded,
    y_train,
    validation_split=0.1,
    batch_size=64,
    epochs=10,
)

# Save the tokenizer and label binarizer
# Both are needed at inference time to reproduce the preprocessing and decode outputs.
import joblib

for artifact, path in ((tokenizer, 'tokenizer.pkl'), (mlb, 'mlb.pkl')):
    with open(path, 'wb') as handle:
        joblib.dump(artifact, handle)

# Step 7: Evaluate the Model
# evaluate() returns the loss followed by the single metric configured in compile().
test_results = model.evaluate(test_padded, y_test)
loss, accuracy = test_results
print(f'Test Accuracy: {accuracy:.2f}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.02


In [None]:
# Persist the trained model to an HDF5 file; later cells download and reload this file.
model.save(filepath='model.h5')


  saving_api.save_model(


In [None]:
# Colab-only helper: triggers a browser download of the saved model file.
# This import is only available when running inside Google Colab.
from google.colab import files
files.download('model.h5')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:


# Example text input for prediction
example_text = "I can't do this anymore, I feel sad."

# Load the model
model = load_model('model.h5')

# Load the tokenizer and label binarizer
# NOTE: joblib.load unpickles arbitrary objects - only load files written by this
# notebook, never artifacts from an untrusted source.
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = joblib.load(handle)

with open('mlb.pkl', 'rb') as handle:
    mlb = joblib.load(handle)

# Tokenize and pad the input text
# NOTE: max_length is not reloaded from disk; it must still be defined by the
# preprocessing cell, so this cell only works after that cell has run.
sequences = tokenizer.texts_to_sequences([example_text])
padded_sequence = pad_sequences(sequences, maxlen=max_length, padding='post')

# Make a prediction
predictions = model.predict(padded_sequence)

# Adjust threshold
# Labels whose sigmoid score exceeds the threshold are reported; the boolean mask
# is converted to a 0/1 indicator matrix before decoding.
threshold = 0.3
predicted_labels = mlb.inverse_transform((predictions > threshold).astype(int))

# Clean the predicted labels
# Strip surrounding whitespace and any quote characters carried over from the
# raw label strings, so the output reads sadness rather than 'sadness'.
cleaned_labels = [label.strip().strip("'\"").strip() for label in predicted_labels[0]]

# Output the predictions
print(f"Predicted labels for the input '{example_text}': {cleaned_labels}")



Predicted labels for the input 'I can't do this anymore, I feel sad.': ["'hopelessness'", "'sadness'"]
