In [3]:
import os
# Lazily enumerate the full path of every file found under the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/chipsal-2025-t1/Task-A(indextext)test.csv
/kaggle/input/chipsal-2025-t1/Task-A(indextext)val.csv
/kaggle/input/chipsal-2025-t1/SubTask-A-train.csv
/kaggle/input/chipsal-2025-t1/submission_CNN_BiLSTM.json
/kaggle/input/chipsal-2025-t1/Task-A(indexlabel)val.csv
/kaggle/input/chipsal-2025-t1/Task-A(indexlabel)test.csv
/kaggle/input/chipsal-2025-t1/stopwords-hi.json


# Import Libraries and Load Data

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Input
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Locations of the training CSV and the test CSV inside the Kaggle input directory
train_data_path = '/kaggle/input/chipsal-2025-t1/SubTask-A-train.csv'
test_data_path = '/kaggle/input/chipsal-2025-t1/Task-A(indextext)test.csv'

# Read both files into DataFrames, training set first
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)


# Simple Data Preprocessing

In [5]:
# Column names reused by the tokenization and label-encoding cells
text_column = 'text'
label_column = 'label'

# Replace missing text with an empty string in both frames so that every row
# holds a value before tokenization.
for frame in (train_df, test_df):
    frame[text_column] = frame[text_column].fillna('')

# Tokenization

In [6]:
# Tokenization settings
max_vocab_size = 10000       # passed to the tokenizer as num_words
max_sequence_length = 100    # every sequence is padded / truncated to this length

# The vocabulary is learned from the training text only; the test text is
# encoded with that same vocabulary, unknown words mapping to the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df[text_column])


def _to_padded_ids(texts):
    """Encode texts as token-id sequences, post-padded/truncated to a fixed length."""
    id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(
        id_sequences,
        maxlen=max_sequence_length,
        padding='post',
        truncating='post',
    )


X_train = _to_padded_ids(train_df[text_column])
X_test = _to_padded_ids(test_df[text_column])

In [7]:
# Label Encoding
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df[label_column])

# Data Splitting

In [8]:
# Train-validation split: hold out 20% of the encoded training data.
# stratify=y_train keeps the class proportions the same in both parts, so the
# validation metrics and the class weights computed from the training part are
# not skewed by an unlucky random draw.
# CAUTION: this cell overwrites X_train / y_train with the reduced training
# split. Re-running it without first re-running the tokenization and label
# encoding cells would split the already-reduced data again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

In [9]:
# Compute 'balanced' class weights from the training split.
# Each weight is paired with the class label it was computed for, instead of
# with its position in the weight array: positions and labels only coincide
# when every label 0..n-1 is present in y_train.
train_classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(
    'balanced', classes=train_classes, y=y_train
)
# Plain Python ints/floats: {class label -> weight}, the mapping passed to model.fit
class_weights = {
    int(label): float(weight)
    for label, weight in zip(train_classes, balanced_weights)
}

# Model (CNN+BiLSTM) Definition

In [10]:
# Define the hybrid CNN + BiLSTM classifier
embedding_dim = 128

# Size the output layer from every class the label encoder was fitted on rather
# than from the post-split y_train: a class that ended up only in the
# validation split would otherwise have no output unit.
num_classes = len(label_encoder.classes_)

# Input layer: one padded sequence of max_sequence_length token ids per sample
input_layer = Input(shape=(max_sequence_length,))

# CNN branch: its own embedding, a width-5 convolution, then max-pooling down
# to a single feature vector per sample.
# NOTE(review): a global max directly after a local max-pool looks redundant —
# confirm whether the MaxPooling1D layer is intended.
cnn_layer = Embedding(input_dim=max_vocab_size, output_dim=embedding_dim)(input_layer)
cnn_layer = Conv1D(filters=64, kernel_size=5, activation='relu')(cnn_layer)
cnn_layer = MaxPooling1D(pool_size=2)(cnn_layer)
cnn_layer = GlobalMaxPooling1D()(cnn_layer)

# BiLSTM branch: a second, separate embedding followed by a bidirectional LSTM
# that returns only its final output (return_sequences=False)
lstm_layer = Embedding(input_dim=max_vocab_size, output_dim=embedding_dim)(input_layer)
lstm_layer = Bidirectional(LSTM(64, return_sequences=False))(lstm_layer)

# Merge both branches into one feature vector
merged = tf.keras.layers.concatenate([cnn_layer, lstm_layer])

# Classification head: dense layer, dropout, softmax over the classes
dense_layer = Dense(64, activation='relu')(merged)
dense_layer = Dropout(0.5)(dense_layer)
output_layer = Dense(num_classes, activation='softmax')(dense_layer)

# Create model
model = Model(inputs=input_layer, outputs=output_layer)

In [11]:
# Adam optimizer with an initial learning rate of 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# Sparse categorical cross-entropy takes the integer-encoded labels directly;
# accuracy is reported during training.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [12]:
# Callback: multiply the learning rate by 0.5 when val_loss stops improving
# (patience of 2 epochs), never going below 1e-6; verbose=1 logs each change.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6,
    verbose=1,
)

# Model Training

In [13]:
# Training hyperparameters
batch_size = 32
epochs = 45

# Fit on the training split, evaluate on the validation split each epoch,
# weight the loss per class, and let reduce_lr adjust the learning rate.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[reduce_lr],
)


Epoch 1/45
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 84ms/step - accuracy: 0.6281 - loss: 0.9890 - val_accuracy: 0.9754 - val_loss: 0.0861 - learning_rate: 1.0000e-04
Epoch 2/45
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 83ms/step - accuracy: 0.9812 - loss: 0.0928 - val_accuracy: 0.9906 - val_loss: 0.0364 - learning_rate: 1.0000e-04
Epoch 3/45
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 83ms/step - accuracy: 0.9893 - loss: 0.0470 - val_accuracy: 0.9905 - val_loss: 0.0363 - learning_rate: 1.0000e-04
Epoch 4/45
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 83ms/step - accuracy: 0.9943 - loss: 0.0273 - val_accuracy: 0.9795 - val_loss: 0.0639 - learning_rate: 1.0000e-04
Epoch 5/45
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 83ms/step - accuracy: 0.9951 - loss: 0.0228 - val_accuracy: 0.9941 - val_loss: 0.0231 - learning_rate: 1.0000e-04
Epoch 6/45
[1m1311/1311

In [14]:
# Predict class probabilities for the test set and keep the most likely class
# of each row.
# NOTE(review): y_pred holds LabelEncoder indices, not the original label
# values — confirm that this is what the submission format expects.
class_probabilities = model.predict(X_test)
y_pred = class_probabilities.argmax(axis=1)

# Build the submission table, ordered by the test-set 'index' column
submission_df = pd.DataFrame(
    {'index': test_df['index'], 'prediction': y_pred}
).sort_values(by='index')

[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step


In [16]:
import json
import zipfile

# Write the submission as JSON Lines: one {"index": ..., "prediction": ...}
# object per row, in the row order of submission_df.
submission_json_path = 'submission_CNN_BiLSTM.json'
submission_rows = zip(submission_df['index'], submission_df['prediction'])
with open(submission_json_path, 'w') as out_file:
    out_file.writelines(
        json.dumps({'index': int(row_index), 'prediction': int(row_prediction)}) + '\n'
        for row_index, row_prediction in submission_rows
    )

# Pack the JSON file into the ZIP archive
with zipfile.ZipFile('submission_CNN_BiLSTM_45.zip', 'w') as archive:
    archive.write(submission_json_path)

print("Submission JSON and ZIP created successfully.")

Submission JSON and ZIP created successfully.
