In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

In [2]:
# Load dataset
train_data = pd.read_csv('/content/drive/MyDrive/DATASET/malayalam_train (1).tsv', sep='\t')
test_data = pd.read_csv('/content/drive/MyDrive/DATASET/malayalam_test_results - malayalam_test_results.tsv', sep='\t')

In [3]:
# Remove the 'id' column from the test data
test_data.drop(columns=['id'], inplace=True)

In [4]:

# Display one instance from the train and test sets
print("Train Example:")
print(train_data.sample(1))

print("\nTest Example:")
print(test_data.sample(1))

Train Example:
                                text   category
1877   Ithu vere level aane makkale.  Positive 

Test Example:
                                                   text  category
1110  Ammbo onnum parayan illa enna oru bgm annu cla...  Positive


In [5]:
# Check unique categories in both datasets
print("Unique categories in training data:")
print(train_data['category'].unique())

print("\nUnique categories in test data:")
print(test_data['category'].unique())

Unique categories in training data:
['Positive ' 'not-malayalam ' 'unknown_state ' 'Mixed_feelings '
 'Negative ']

Unique categories in test data:
['unknown_state' 'Negative' 'not-malayalam' 'Positive' 'Mixed_feelings']


In [6]:
# Check for leading/trailing spaces and strip them
train_data['category'] = train_data['category'].str.strip()
test_data['category'] = test_data['category'].str.strip()

In [15]:
# Encode the labels
le = LabelEncoder()
train_data['category'] = le.fit_transform(train_data['category'])
test_data['category'] = le.transform(test_data['category'])  # Use the same encoder for test data

In [8]:
# Split data into features and labels
X_train = train_data['text'].values
y_train = train_data['category'].values
X_test = test_data['text'].values
y_test = test_data['category'].values

In [9]:

# Tokenize the text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [10]:
# Pad sequences
maxlen = 100  # Maximum length of sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

In [11]:
# Build LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=maxlen))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))  # Dropout to prevent overfitting
model.add(LSTM(32))
model.add(Dropout(0.5))
model.add(Dense(len(le.classes_), activation='softmax'))  # Number of classes




In [12]:
# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train_pad, y_train, validation_split=0.2, epochs=10, batch_size=32, callbacks=[early_stopping])

Epoch 1/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 133ms/step - accuracy: 0.3851 - loss: 1.4684 - val_accuracy: 0.4367 - val_loss: 1.3111
Epoch 2/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 120ms/step - accuracy: 0.5566 - loss: 1.2051 - val_accuracy: 0.6519 - val_loss: 0.9115
Epoch 3/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 121ms/step - accuracy: 0.7853 - loss: 0.6837 - val_accuracy: 0.6550 - val_loss: 0.8859
Epoch 4/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 122ms/step - accuracy: 0.8616 - loss: 0.4480 - val_accuracy: 0.6591 - val_loss: 0.9517
Epoch 5/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 124ms/step - accuracy: 0.9011 - loss: 0.3115 - val_accuracy: 0.6529 - val_loss: 1.0484
Epoch 6/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 123ms/step - accuracy: 0.9466 - loss: 0.2081 - val_accuracy: 0.6457 - val_loss: 1.2088


In [None]:
# Evaluate the model
y_pred = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred, axis=1)

# Get original class labels from LabelEncoder
class_labels = le.inverse_transform(np.unique(y_test))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes, target_names=class_labels))

In [19]:
# Example: Select one instance from the test set
index_to_check = 0  # Change this to any valid index
original_text = test_data['text'].iloc[index_to_check]
original_category = test_data['category'].iloc[index_to_check]

In [20]:
# Preprocess the text for prediction
text_sequence = tokenizer.texts_to_sequences([original_text])
text_padded = pad_sequences(text_sequence, maxlen=maxlen)

# Get prediction
prediction = model.predict(text_padded)
predicted_category_index = np.argmax(prediction, axis=1)[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step


In [21]:
# Map the predicted index back to the category
categories = {i: category for i, category in enumerate(le.classes_)}  # Dynamic mapping
predicted_category = categories.get(predicted_category_index, "Unknown category")

# Print the results
print(f"\nOriginal Text: {original_text}")
print(f"Original Category: {le.inverse_transform([original_category])[0]}")
print(f"Predicted Category: {predicted_category}")


Original Text: Bollywood film Newton inte remake aano?
Original Category: 4
Predicted Category: 4
