### Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional
from keras.models import Model
from keras.models import Sequential

# task_1

### Handling Pre-processed data

In [2]:
data = pd.read_csv('../preprocess_data.csv')
data.drop(['task_2','text'], axis=1, inplace=True)
data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Gujarati_image_1618.jpg,Sarcastic,Vulgar,Abusive,છોકર). ટીચર તમાર તાજમહેલ\r\n\r\nદેખ/ય છે.\r\n\...
1,Gujarati_image_31.jpg,Sarcastic,Vulgar,Abusive,છોકરો : ના.\r\n છોકરી : કેમ?\r\n \r\n છોકરી : ...
2,Gujarati_image_1144.jpg,Sarcastic,Vulgar,Abusive,"છોકરીઓ ગમે તેટલી\r\n ચાલક હોય,\r\n \r\n પણ છોક..."
3,Gujarati_image_1184.jpg,Sarcastic,Vulgar,Abusive,"દોસ્તી કરો,પ્રેમ કરો, વફા કરો...\r\n અને બહુ મ..."
4,Gujarati_image_1643.jpg,Sarcastic,Vulgar,Abusive,"છોકરીઓ ગમે તેટલી\r\nચાલક હોય,\r\n\r\nપણ છોકરા ..."


In [3]:
test_data = pd.read_csv('../preprocess_test_data.csv') 
test_data.drop(['task_2','text'], axis=1, inplace=True)
test_data = test_data.drop(['Unnamed: 0'],axis=1)
test_data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Gujarati_image_1225.jpg,Non-Sarcastic,Non Vulgar,Non-abusive,॥વિંદેશીગામડિયો\r\n અ |
1,Gujarati_image_1583.jpg,Sarcastic,Vulgar,Abusive,ટીચર : સૌથી વધારે દુખાવો ક્યારે\r\nથાય?\r\nછોક...
2,Gujarati_image_1502.jpg,Sarcastic,Vulgar,Abusive,પતિ: તુંમને જરાય પ્રેમ\r\nનથી કરતી...\r\n\r\nપ...
3,Gujarati_image_1487.jpg,Sarcastic,Vulgar,Abusive,આખા ગોમ ના લોડા\r\nભોસ મા ભરી ને બેઠી\r\nહોય અ...
4,Gujarati_image_1497.jpg,Non-Sarcastic,Vulgar,Abusive,મિનરલ વોટર સિવાય ક્યારેય\r\nબીજું\r\nપાણી નો પ...


In [4]:
sentences = data['text_clean'].astype(str)
tokenizer = Tokenizer(num_words = 1500,split=' ')
tokenizer.fit_on_texts(sentences)
sequence = tokenizer.texts_to_sequences(sentences)

In [5]:
test_sentences = test_data['text_clean'].astype(str)
test_sequence = tokenizer.texts_to_sequences(test_sentences)

In [6]:
max_seq_len = 2500

index_of_words = tokenizer.word_index
print("No of unique words : ",len(index_of_words))

X = pad_sequences(sequence , maxlen = max_seq_len )
Y = data['task_1']

print(X)

No of unique words :  7000
[[   0    0    0 ...  536  134    1]
 [   0    0    0 ...  246 1062  318]
 [   0    0    0 ... 1067 1068  174]
 ...
 [   0    0    0 ...    0    0    5]
 [   0    0    0 ...    0    0    5]
 [   0    0    0 ...    1   17  433]]


In [7]:
test_X = pad_sequences(test_sequence , maxlen = max_seq_len )
test_Y = test_data['task_1']

print(test_X)

[[  0   0   0 ...   0   0 648]
 [  0   0   0 ... 206 394   1]
 [  0   0   0 ...   2   1   1]
 ...
 [  0   0   0 ...   1   1 324]
 [  0   0   0 ...  40   2   1]
 [  0   0   0 ...   0   0   5]]


In [8]:
embed_dim = 256
vocabSize = len(index_of_words)
lstm_out = 64

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)
Y_true = Y_test
Y_train = pd.get_dummies(Y_train).values
Y_test = pd.get_dummies(Y_test).values

In [10]:
test_Y_true = test_Y
test_Y = pd.get_dummies(test_Y).values
print("test_Y:", test_Y)

test_Y: [[ True False]
 [False  True]
 [False  True]
 ...
 [ True False]
 [ True False]
 [ True False]]


# MODEL 1

In [11]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim,input_length = 2500))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         1792000   
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 1874306 (7.15 MB)
Trainable params: 1874306 (7.15 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [12]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_a1.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [13]:
model.fit(X_train, Y_train, batch_size = 32, epochs = 10, validation_data=(X_test,Y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.52641, saving model to hasoc_a1.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.52641 to 0.50465, saving model to hasoc_a1.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.50465 to 0.49123, saving model to hasoc_a1.h5
Epoch 4/10
Epoch 4: val_loss did not improve from 0.49123
Epoch 5/10
Epoch 5: val_loss did not improve from 0.49123
Epoch 6/10
Epoch 6: val_loss did not improve from 0.49123
Epoch 7/10
Epoch 7: val_loss did not improve from 0.49123
Epoch 8/10
Epoch 8: val_loss did not improve from 0.49123
Epoch 9/10
Epoch 9: val_loss did not improve from 0.49123
Epoch 10/10
Epoch 10: val_loss did not improve from 0.49123


<keras.src.callbacks.History at 0x23751342c10>

In [14]:
model.load_weights('hasoc_a1.h5')
model.evaluate(X_test,Y_test)



[0.49123457074165344, 0.7910447716712952]

In [15]:
Y_pred = model.predict(test_X)



In [16]:
y_actual = []
for i in test_Y_true:
    if i == 'Sarcastic':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [17]:
print(classification_report(y_actual, pred_class))

              precision    recall  f1-score   support

           0       0.33      0.00      0.01       410
           1       0.32      0.98      0.48       194

    accuracy                           0.32       604
   macro avg       0.33      0.49      0.24       604
weighted avg       0.33      0.32      0.16       604



In [18]:
print("Y_pred:", Y_pred)

Y_pred: [[0.10724969 0.8927503 ]
 [0.35344046 0.64655954]
 [0.03545646 0.9645435 ]
 ...
 [0.08703656 0.9129634 ]
 [0.09556682 0.9044332 ]
 [0.13938175 0.86061823]]


In [19]:
print("pred_class:", pred_class)

pred_class: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [20]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Sarcastic')
    else :
        pred_actual.append('Non-Sarcastic')

In [21]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_a.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Gujarati_image_1225.jpg,Sarcastic
1,Gujarati_image_1583.jpg,Sarcastic
2,Gujarati_image_1502.jpg,Sarcastic
3,Gujarati_image_1487.jpg,Sarcastic
4,Gujarati_image_1497.jpg,Sarcastic


# MODEL 2

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

model = Sequential()

# Embedding层，增加防止过拟合的Dropout
model.add(Embedding(input_dim=vocabSize, output_dim=embed_dim, input_length=2500))
model.add(Dropout(0.3))

# LSTM层，增加recurrent_dropout 和 output dropout
model.add(LSTM(units=lstm_out, dropout=0.3, recurrent_dropout=0.3, return_sequences=False))

# Batch Normalization增强泛化
model.add(BatchNormalization())

# 全连接层，Softmax输出2分类，建议用categorical_crossentropy
model.add(Dense(2, activation='softmax'))

# 编译
optimizer = Adam(learning_rate=0.001)  # 学习率也可调整
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         1792000   
                                                                 
 dropout (Dropout)           (None, 2500, 256)         0         
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 batch_normalization (Batch  (None, 64)                256       
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 1874562 (7.15 MB)
Trainable params: 1874434 (7.15 MB)
Non-trainable params: 128 (512.00 Byte)
______________

In [12]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint("hasoc_b1.h5", monitor='val_loss', verbose=1, save_best_only=True,
save_weights_only=False, mode='auto')

In [13]:
model.fit(X_train, Y_train, batch_size = 32, epochs = 10, validation_data = (X_test,Y_test), callbacks = [checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.60993, saving model to hasoc_b1.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.60993 to 0.56328, saving model to hasoc_b1.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.56328 to 0.53399, saving model to hasoc_b1.h5
Epoch 4/10
Epoch 4: val_loss improved from 0.53399 to 0.51653, saving model to hasoc_b1.h5
Epoch 5/10
Epoch 5: val_loss improved from 0.51653 to 0.51339, saving model to hasoc_b1.h5
Epoch 6/10
Epoch 6: val_loss did not improve from 0.51339
Epoch 7/10
Epoch 7: val_loss did not improve from 0.51339
Epoch 8/10
Epoch 8: val_loss did not improve from 0.51339
Epoch 9/10
Epoch 9: val_loss did not improve from 0.51339
Epoch 10/10
Epoch 10: val_loss did not improve from 0.51339


<keras.src.callbacks.History at 0x1b1ad28ceb0>

In [14]:
model.load_weights('hasoc_b1.h5')
model.evaluate(X_test,Y_test)



[0.513394296169281, 0.7835820913314819]

In [15]:
Y_pred = model.predict(test_X)



In [16]:
y_actual = []
for i in test_Y_true:
    if i == 'Sarcastic':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [17]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       410
           1       0.32      1.00      0.49       194

    accuracy                           0.32       604
   macro avg       0.16      0.50      0.24       604
weighted avg       0.10      0.32      0.16       604



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Sarcastic')
    else :
        pred_actual.append('Non-Sarcastic')

In [19]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_b.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Gujarati_image_1225.jpg,Sarcastic
1,Gujarati_image_1583.jpg,Sarcastic
2,Gujarati_image_1502.jpg,Sarcastic
3,Gujarati_image_1487.jpg,Sarcastic
4,Gujarati_image_1497.jpg,Sarcastic


# MODEL 3

In [11]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_c1.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [12]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim, input_length=2500))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

from sklearn.utils.class_weight import compute_class_weight
# 假设Y_train已独热编码
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.argmax(Y_train, axis=1))
class_weight_dict = dict(enumerate(class_weights))

model.fit(X_train, Y_train, epochs=10, batch_size=64, class_weight=class_weight_dict, validation_data=(X_test,Y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.67589, saving model to hasoc_c1.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.67589 to 0.65912, saving model to hasoc_c1.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.65912 to 0.56992, saving model to hasoc_c1.h5
Epoch 4/10
Epoch 4: val_loss did not improve from 0.56992
Epoch 5/10
Epoch 5: val_loss did not improve from 0.56992
Epoch 6/10
Epoch 6: val_loss did not improve from 0.56992
Epoch 7/10
Epoch 7: val_loss did not improve from 0.56992
Epoch 8/10
Epoch 8: val_loss did not improve from 0.56992
Epoch 9/10
Epoch 9: val_loss did not improve from 0.56992
Epoch 10/10
Epoch 10: val_loss did not improve from 0.56992


<keras.src.callbacks.History at 0x193e2d35880>

In [13]:
model.load_weights('hasoc_c1.h5')
model.evaluate(X_test,Y_test)



[0.5699222087860107, 0.8059701323509216]

In [14]:
Y_pred = model.predict(test_X)



In [15]:
y_actual = []
for i in test_Y_true:
    if i == 'Sarcastic':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [16]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.66      0.05      0.10       410
           1       0.32      0.94      0.48       194

    accuracy                           0.34       604
   macro avg       0.49      0.50      0.29       604
weighted avg       0.55      0.34      0.22       604



In [17]:
print("Y_pred:",Y_pred)

Y_pred: [[0.4008175  0.5991824 ]
 [0.5229386  0.47706142]
 [0.32581317 0.6741869 ]
 ...
 [0.36569163 0.6343084 ]
 [0.4074059  0.5925941 ]
 [0.44256213 0.55743796]]


In [18]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Sarcastic')
    else :
        pred_actual.append('Non-Sarcastic')

In [19]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_c.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Gujarati_image_1225.jpg,Sarcastic
1,Gujarati_image_1583.jpg,Non-Sarcastic
2,Gujarati_image_1502.jpg,Sarcastic
3,Gujarati_image_1487.jpg,Sarcastic
4,Gujarati_image_1497.jpg,Non-Sarcastic


# task_3

# Handling Pre-processed data

In [2]:
data = pd.read_csv('../preprocess_data.csv')
data.drop(['task_2','text'], axis=1, inplace=True)
data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Gujarati_image_1618.jpg,Sarcastic,Vulgar,Abusive,છોકર). ટીચર તમાર તાજમહેલ\r\n\r\nદેખ/ય છે.\r\n\...
1,Gujarati_image_31.jpg,Sarcastic,Vulgar,Abusive,છોકરો : ના.\r\n છોકરી : કેમ?\r\n \r\n છોકરી : ...
2,Gujarati_image_1144.jpg,Sarcastic,Vulgar,Abusive,"છોકરીઓ ગમે તેટલી\r\n ચાલક હોય,\r\n \r\n પણ છોક..."
3,Gujarati_image_1184.jpg,Sarcastic,Vulgar,Abusive,"દોસ્તી કરો,પ્રેમ કરો, વફા કરો...\r\n અને બહુ મ..."
4,Gujarati_image_1643.jpg,Sarcastic,Vulgar,Abusive,"છોકરીઓ ગમે તેટલી\r\nચાલક હોય,\r\n\r\nપણ છોકરા ..."


In [3]:
test_data = pd.read_csv('../preprocess_test_data.csv') 
test_data.drop(['task_2','text'], axis=1, inplace=True)
test_data = test_data.drop(['Unnamed: 0'],axis=1)
test_data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Gujarati_image_1225.jpg,Non-Sarcastic,Non Vulgar,Non-abusive,॥વિંદેશીગામડિયો\r\n અ |
1,Gujarati_image_1583.jpg,Sarcastic,Vulgar,Abusive,ટીચર : સૌથી વધારે દુખાવો ક્યારે\r\nથાય?\r\nછોક...
2,Gujarati_image_1502.jpg,Sarcastic,Vulgar,Abusive,પતિ: તુંમને જરાય પ્રેમ\r\nનથી કરતી...\r\n\r\nપ...
3,Gujarati_image_1487.jpg,Sarcastic,Vulgar,Abusive,આખા ગોમ ના લોડા\r\nભોસ મા ભરી ને બેઠી\r\nહોય અ...
4,Gujarati_image_1497.jpg,Non-Sarcastic,Vulgar,Abusive,મિનરલ વોટર સિવાય ક્યારેય\r\nબીજું\r\nપાણી નો પ...


In [4]:
sentences = data['text_clean'].astype(str)
tokenizer = Tokenizer(num_words = 1500,split=' ')
tokenizer.fit_on_texts(sentences)
sequence = tokenizer.texts_to_sequences(sentences)

In [5]:
test_sentences = test_data['text_clean'].astype(str)
test_sequence = tokenizer.texts_to_sequences(test_sentences)

In [6]:
max_seq_len = 2500

index_of_words = tokenizer.word_index
print("No of unique words : ",len(index_of_words))

X = pad_sequences(sequence , maxlen = max_seq_len )
Y = data['task_3']

print(X)

No of unique words :  7000
[[   0    0    0 ...  536  134    1]
 [   0    0    0 ...  246 1062  318]
 [   0    0    0 ... 1067 1068  174]
 ...
 [   0    0    0 ...    0    0    5]
 [   0    0    0 ...    0    0    5]
 [   0    0    0 ...    1   17  433]]


In [7]:
test_X = pad_sequences(test_sequence , maxlen = max_seq_len )
test_Y = test_data['task_3']

print(test_X)

[[  0   0   0 ...   0   0 648]
 [  0   0   0 ... 206 394   1]
 [  0   0   0 ...   2   1   1]
 ...
 [  0   0   0 ...   1   1 324]
 [  0   0   0 ...  40   2   1]
 [  0   0   0 ...   0   0   5]]


In [8]:
embed_dim = 256
vocabSize = len(index_of_words)
lstm_out = 64

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)
Y_true = Y_test
Y_train = pd.get_dummies(Y_train).values
Y_test = pd.get_dummies(Y_test).values

In [10]:
test_Y_true = test_Y
test_Y = pd.get_dummies(test_Y).values
print("test_Y:",test_Y)

test_Y: [[ True False]
 [False  True]
 [False  True]
 ...
 [ True False]
 [ True False]
 [ True False]]


# MODEL 1

In [12]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim,input_length = 2500))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         1792000   
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 1874306 (7.15 MB)
Trainable params: 1874306 (7.15 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [13]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_a3.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [14]:
model.fit(X_train, Y_train, batch_size = 32, epochs = 10, validation_data = (X_test,Y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.56218, saving model to hasoc_a3.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.56218 to 0.52034, saving model to hasoc_a3.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.52034 to 0.46331, saving model to hasoc_a3.h5
Epoch 4/10
Epoch 4: val_loss did not improve from 0.46331
Epoch 5/10
Epoch 5: val_loss did not improve from 0.46331
Epoch 6/10
Epoch 6: val_loss did not improve from 0.46331
Epoch 7/10
Epoch 7: val_loss did not improve from 0.46331
Epoch 8/10
Epoch 8: val_loss did not improve from 0.46331
Epoch 9/10
Epoch 9: val_loss did not improve from 0.46331
Epoch 10/10
Epoch 10: val_loss did not improve from 0.46331


<keras.src.callbacks.History at 0x23f09475a30>

In [15]:
model.load_weights('hasoc_a3.h5')
model.evaluate(X_test,Y_test)



[0.4633116126060486, 0.7985074520111084]

In [16]:
Y_pred = model.predict(test_X)



In [17]:
y_actual = []
for i in test_Y_true:
    if i =='Vulgar':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [18]:
print(classification_report(y_actual, pred_class))

              precision    recall  f1-score   support

           0       0.89      0.83      0.86       496
           1       0.41      0.54      0.46       108

    accuracy                           0.78       604
   macro avg       0.65      0.68      0.66       604
weighted avg       0.80      0.78      0.79       604



In [19]:
print("Y_pred:", Y_pred)

Y_pred: [[0.9903398  0.00966018]
 [0.04106864 0.9589314 ]
 [0.6246746  0.3753254 ]
 ...
 [0.96826684 0.0317331 ]
 [0.961996   0.03800403]
 [0.9824645  0.01753556]]


In [20]:
print("pred_class:", pred_class)

pred_class: [0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 

In [21]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Vulgar')
    else :
        pred_actual.append('Non Vulgar')

In [22]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_a3.csv', index=False)
test_data.head()

Unnamed: 0,_id,label
0,Gujarati_image_1225.jpg,Non Vulgar
1,Gujarati_image_1583.jpg,Vulgar
2,Gujarati_image_1502.jpg,Non Vulgar
3,Gujarati_image_1487.jpg,Non Vulgar
4,Gujarati_image_1497.jpg,Non Vulgar


# MODEL 2

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

model = Sequential()

# Embedding层，增加防止过拟合的Dropout
model.add(Embedding(input_dim=vocabSize, output_dim=embed_dim, input_length=2500))
model.add(Dropout(0.3))

# LSTM层，增加recurrent_dropout 和 output dropout
model.add(LSTM(units=lstm_out, dropout=0.3, recurrent_dropout=0.3, return_sequences=False))

# Batch Normalization增强泛化
model.add(BatchNormalization())

# 全连接层，Softmax输出2分类，建议用categorical_crossentropy
model.add(Dense(2, activation='softmax'))

# 编译
optimizer = Adam(learning_rate=0.001)  # 学习率也可调整
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 2500, 256)         1792000   
                                                                 
 dropout (Dropout)           (None, 2500, 256)         0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                82176     
                                                                 
 batch_normalization (Batch  (None, 64)                256       
 Normalization)                                                  
                                                                 
 dense_1 (Dense)             (None, 2)                 130       
                                                                 
Total params: 1874562 (7.15 MB)
Trainable params: 1874434 (7.15 MB)
Non-trainable params: 128 (512.00 Byte)
____________

In [24]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_b3.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [25]:
model.fit(X_train, Y_train, batch_size = 32, epochs = 10, validation_data = (X_test,Y_test), callbacks = [checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.64467, saving model to hasoc_b3.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.64467 to 0.62699, saving model to hasoc_b3.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.62699 to 0.61049, saving model to hasoc_b3.h5
Epoch 4/10
Epoch 4: val_loss improved from 0.61049 to 0.60861, saving model to hasoc_b3.h5
Epoch 5/10
Epoch 5: val_loss improved from 0.60861 to 0.59023, saving model to hasoc_b3.h5
Epoch 6/10
Epoch 6: val_loss improved from 0.59023 to 0.58422, saving model to hasoc_b3.h5
Epoch 7/10
Epoch 7: val_loss improved from 0.58422 to 0.54906, saving model to hasoc_b3.h5
Epoch 8/10
Epoch 8: val_loss improved from 0.54906 to 0.53040, saving model to hasoc_b3.h5
Epoch 9/10
Epoch 9: val_loss did not improve from 0.53040
Epoch 10/10
Epoch 10: val_loss improved from 0.53040 to 0.52165, saving model to hasoc_b3.h5


<keras.src.callbacks.History at 0x23f09c1fb50>

In [26]:
model.load_weights('hasoc_b3.h5')
model.evaluate(X_test,Y_test)



[0.52164626121521, 0.7388059496879578]

In [27]:
Y_pred = model.predict(test_X)



In [28]:
y_actual = []
for i in test_Y_true:
    if i == 'Vulgar':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [29]:
print(classification_report(y_actual, pred_class))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       496
           1       0.48      0.36      0.41       108

    accuracy                           0.81       604
   macro avg       0.67      0.64      0.65       604
weighted avg       0.80      0.81      0.80       604



In [30]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Vulgar')
    else :
        pred_actual.append('Non Vulgar')

In [31]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_b3.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Gujarati_image_1225.jpg,Non Vulgar
1,Gujarati_image_1583.jpg,Vulgar
2,Gujarati_image_1502.jpg,Non Vulgar
3,Gujarati_image_1487.jpg,Non Vulgar
4,Gujarati_image_1497.jpg,Non Vulgar


# MODEL 3

In [11]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_c3.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [12]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim, input_length=2500))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

from sklearn.utils.class_weight import compute_class_weight
# 假设Y_train已独热编码
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.argmax(Y_train, axis=1))
class_weight_dict = dict(enumerate(class_weights))

model.fit(X_train, Y_train, epochs=10, batch_size=64, class_weight=class_weight_dict, validation_data=(X_test,Y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.59976, saving model to hasoc_c3.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss did not improve from 0.59976
Epoch 3/10
Epoch 3: val_loss improved from 0.59976 to 0.59414, saving model to hasoc_c3.h5
Epoch 4/10
Epoch 4: val_loss improved from 0.59414 to 0.50209, saving model to hasoc_c3.h5
Epoch 5/10
Epoch 5: val_loss did not improve from 0.50209
Epoch 6/10
Epoch 6: val_loss did not improve from 0.50209
Epoch 7/10
Epoch 7: val_loss did not improve from 0.50209
Epoch 8/10
Epoch 8: val_loss did not improve from 0.50209
Epoch 9/10
Epoch 9: val_loss did not improve from 0.50209
Epoch 10/10
Epoch 10: val_loss did not improve from 0.50209


<keras.src.callbacks.History at 0x1f80ba34490>

In [13]:
model.load_weights('hasoc_c3.h5')
model.evaluate(X_test,Y_test)



[0.5020859241485596, 0.7985074520111084]

In [14]:
Y_pred = model.predict(test_X)



In [15]:
y_actual = []
for i in test_Y_true:
    if i == 'Vulgar':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [16]:
print(classification_report(y_actual, pred_class))

              precision    recall  f1-score   support

           0       0.94      0.67      0.79       496
           1       0.35      0.81      0.49       108

    accuracy                           0.70       604
   macro avg       0.65      0.74      0.64       604
weighted avg       0.84      0.70      0.73       604



In [17]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Vulgar')
    else :
        pred_actual.append('Non Vulgar')

In [18]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_c3.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Gujarati_image_1225.jpg,Non Vulgar
1,Gujarati_image_1583.jpg,Vulgar
2,Gujarati_image_1502.jpg,Vulgar
3,Gujarati_image_1487.jpg,Vulgar
4,Gujarati_image_1497.jpg,Vulgar


# task_4

# Handling Pre-processed data

In [2]:
data = pd.read_csv('../preprocess_data.csv')
data.drop(['task_2','text'], axis=1, inplace=True)
data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Gujarati_image_1618.jpg,Sarcastic,Vulgar,Abusive,છોકર). ટીચર તમાર તાજમહેલ\r\n\r\nદેખ/ય છે.\r\n\...
1,Gujarati_image_31.jpg,Sarcastic,Vulgar,Abusive,છોકરો : ના.\r\n છોકરી : કેમ?\r\n \r\n છોકરી : ...
2,Gujarati_image_1144.jpg,Sarcastic,Vulgar,Abusive,"છોકરીઓ ગમે તેટલી\r\n ચાલક હોય,\r\n \r\n પણ છોક..."
3,Gujarati_image_1184.jpg,Sarcastic,Vulgar,Abusive,"દોસ્તી કરો,પ્રેમ કરો, વફા કરો...\r\n અને બહુ મ..."
4,Gujarati_image_1643.jpg,Sarcastic,Vulgar,Abusive,"છોકરીઓ ગમે તેટલી\r\nચાલક હોય,\r\n\r\nપણ છોકરા ..."


In [3]:
test_data = pd.read_csv('../preprocess_test_data.csv') 
test_data.drop(['task_2','text'], axis=1, inplace=True)
test_data = test_data.drop(['Unnamed: 0'],axis=1)
test_data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Gujarati_image_1225.jpg,Non-Sarcastic,Non Vulgar,Non-abusive,॥વિંદેશીગામડિયો\r\n અ |
1,Gujarati_image_1583.jpg,Sarcastic,Vulgar,Abusive,ટીચર : સૌથી વધારે દુખાવો ક્યારે\r\nથાય?\r\nછોક...
2,Gujarati_image_1502.jpg,Sarcastic,Vulgar,Abusive,પતિ: તુંમને જરાય પ્રેમ\r\nનથી કરતી...\r\n\r\nપ...
3,Gujarati_image_1487.jpg,Sarcastic,Vulgar,Abusive,આખા ગોમ ના લોડા\r\nભોસ મા ભરી ને બેઠી\r\nહોય અ...
4,Gujarati_image_1497.jpg,Non-Sarcastic,Vulgar,Abusive,મિનરલ વોટર સિવાય ક્યારેય\r\nબીજું\r\nપાણી નો પ...


In [4]:
sentences = data['text_clean'].astype(str)
tokenizer = Tokenizer(num_words = 1500,split=' ')
tokenizer.fit_on_texts(sentences)
sequence = tokenizer.texts_to_sequences(sentences)

In [5]:
test_sentences = test_data['text_clean'].astype(str)
test_sequence = tokenizer.texts_to_sequences(test_sentences)

In [6]:
max_seq_len = 2500

index_of_words = tokenizer.word_index
print("No of unique words : ",len(index_of_words))

X = pad_sequences(sequence , maxlen = max_seq_len )
Y = data['task_4']

print(X)

No of unique words :  7000
[[   0    0    0 ...  536  134    1]
 [   0    0    0 ...  246 1062  318]
 [   0    0    0 ... 1067 1068  174]
 ...
 [   0    0    0 ...    0    0    5]
 [   0    0    0 ...    0    0    5]
 [   0    0    0 ...    1   17  433]]


In [7]:
test_X = pad_sequences(test_sequence , maxlen = max_seq_len )
test_Y = test_data['task_4']

print(test_X)

[[  0   0   0 ...   0   0 648]
 [  0   0   0 ... 206 394   1]
 [  0   0   0 ...   2   1   1]
 ...
 [  0   0   0 ...   1   1 324]
 [  0   0   0 ...  40   2   1]
 [  0   0   0 ...   0   0   5]]


In [8]:
embed_dim = 256
vocabSize = len(index_of_words)
lstm_out = 64

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)
Y_true = Y_test
Y_train = pd.get_dummies(Y_train).values
Y_test = pd.get_dummies(Y_test).values

In [10]:
test_Y_true = test_Y
test_Y = pd.get_dummies(test_Y).values
print("test_Y:", test_Y)

test_Y: [[False  True]
 [ True False]
 [ True False]
 ...
 [False  True]
 [False  True]
 [False  True]]


# MODEL 1

In [11]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim,input_length = 2500))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         1792000   
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 1874306 (7.15 MB)
Trainable params: 1874306 (7.15 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [12]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_a4.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [13]:
model.fit(X_train, Y_train, batch_size = 32, epochs = 10, validation_data=(X_test,Y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.38842, saving model to hasoc_a4.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.38842 to 0.36340, saving model to hasoc_a4.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.36340 to 0.33369, saving model to hasoc_a4.h5
Epoch 4/10
Epoch 4: val_loss did not improve from 0.33369
Epoch 5/10
Epoch 5: val_loss did not improve from 0.33369
Epoch 6/10
Epoch 6: val_loss did not improve from 0.33369
Epoch 7/10
Epoch 7: val_loss did not improve from 0.33369
Epoch 8/10
Epoch 8: val_loss did not improve from 0.33369
Epoch 9/10
Epoch 9: val_loss did not improve from 0.33369
Epoch 10/10
Epoch 10: val_loss did not improve from 0.33369


<keras.src.callbacks.History at 0x224af1d28b0>

In [14]:
model.load_weights('hasoc_a4.h5')
model.evaluate(X_test,Y_test)



[0.3336866497993469, 0.8805969953536987]

In [15]:
Y_pred = model.predict(test_X)



In [16]:
y_actual = []
for i in test_Y_true:
    if i == 'Non-abusive':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [17]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.81      0.19      0.31       205
           1       0.70      0.98      0.82       399

    accuracy                           0.71       604
   macro avg       0.76      0.58      0.56       604
weighted avg       0.74      0.71      0.64       604



In [18]:
print("Y_pred:", Y_pred)

Y_pred: [[0.00141402 0.99858606]
 [0.8391441  0.16085593]
 [0.30731696 0.69268304]
 ...
 [0.05582917 0.9441708 ]
 [0.00347404 0.99652594]
 [0.00159899 0.998401  ]]


In [19]:
print("pred_class:", pred_class)

pred_class: [1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [20]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Non-abusive')
    else :
        pred_actual.append('Abusive')

In [21]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_a4.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Gujarati_image_1225.jpg,Non-abusive
1,Gujarati_image_1583.jpg,Abusive
2,Gujarati_image_1502.jpg,Non-abusive
3,Gujarati_image_1487.jpg,Non-abusive
4,Gujarati_image_1497.jpg,Non-abusive


# MODEL 2

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

model = Sequential()

# Embedding层，增加防止过拟合的Dropout
model.add(Embedding(input_dim=vocabSize, output_dim=embed_dim, input_length=2500))
model.add(Dropout(0.3))

# LSTM层，增加recurrent_dropout 和 output_dropout
model.add(LSTM(units=lstm_out, dropout=0.3, recurrent_dropout=0.3, return_sequences=False))

# Batch Normalization增强泛化
model.add(BatchNormalization())

# 全连接层，Softmax输出2分类，建议用categorical_crossentropy
model.add(Dense(2, activation='softmax'))

# 编译
optimizer = Adam(learning_rate=0.001)  # 学习率也可调整
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         1792000   
                                                                 
 dropout (Dropout)           (None, 2500, 256)         0         
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 batch_normalization (Batch  (None, 64)                256       
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 1874562 (7.15 MB)
Trainable params: 1874434 (7.15 MB)
Non-trainable params: 128 (512.00 Byte)
______________

In [12]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_b4.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [13]:
model.fit(X_train, Y_train, batch_size = 32, epochs = 10, validation_data=(X_test,Y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.57835, saving model to hasoc_b4.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.57835 to 0.50496, saving model to hasoc_b4.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.50496 to 0.44701, saving model to hasoc_b4.h5
Epoch 4/10
Epoch 4: val_loss improved from 0.44701 to 0.41452, saving model to hasoc_b4.h5
Epoch 5/10
Epoch 5: val_loss improved from 0.41452 to 0.40030, saving model to hasoc_b4.h5
Epoch 6/10
Epoch 6: val_loss improved from 0.40030 to 0.38297, saving model to hasoc_b4.h5
Epoch 7/10
Epoch 7: val_loss did not improve from 0.38297
Epoch 8/10
Epoch 8: val_loss improved from 0.38297 to 0.37311, saving model to hasoc_b4.h5
Epoch 9/10
Epoch 9: val_loss improved from 0.37311 to 0.35972, saving model to hasoc_b4.h5
Epoch 10/10
Epoch 10: val_loss did not improve from 0.35972


<keras.src.callbacks.History at 0x1f509e3ff70>

In [14]:
model.load_weights('hasoc_b4.h5')
model.evaluate(X_test,Y_test)



[0.35971522331237793, 0.858208954334259]

In [15]:
Y_pred = model.predict(test_X)



In [16]:
y_actual = []
for i in test_Y_true:
    if i == 'Non-abusive':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [17]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.95      0.10      0.19       205
           1       0.68      1.00      0.81       399

    accuracy                           0.69       604
   macro avg       0.82      0.55      0.50       604
weighted avg       0.78      0.69      0.60       604



In [18]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Non-abusive')
    else :
        pred_actual.append('Abusive')

In [19]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_b4.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Gujarati_image_1225.jpg,Non-abusive
1,Gujarati_image_1583.jpg,Abusive
2,Gujarati_image_1502.jpg,Non-abusive
3,Gujarati_image_1487.jpg,Non-abusive
4,Gujarati_image_1497.jpg,Non-abusive


# MODEL 3

In [11]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_c4.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [12]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim, input_length=2500))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

from sklearn.utils.class_weight import compute_class_weight
# 假设Y_train已独热编码
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.argmax(Y_train, axis=1))
class_weight_dict = dict(enumerate(class_weights))

model.fit(X_train, Y_train, epochs=10, batch_size=64, class_weight=class_weight_dict, validation_data=(X_test,Y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.59755, saving model to hasoc_c4.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss did not improve from 0.59755
Epoch 3/10
Epoch 3: val_loss improved from 0.59755 to 0.53907, saving model to hasoc_c4.h5
Epoch 4/10
Epoch 4: val_loss improved from 0.53907 to 0.52551, saving model to hasoc_c4.h5
Epoch 5/10
Epoch 5: val_loss did not improve from 0.52551
Epoch 6/10
Epoch 6: val_loss improved from 0.52551 to 0.43790, saving model to hasoc_c4.h5
Epoch 7/10
Epoch 7: val_loss did not improve from 0.43790
Epoch 8/10
Epoch 8: val_loss did not improve from 0.43790
Epoch 9/10
Epoch 9: val_loss did not improve from 0.43790
Epoch 10/10
Epoch 10: val_loss did not improve from 0.43790


<keras.src.callbacks.History at 0x2466b745e80>

In [13]:
model.load_weights('hasoc_c4.h5')
model.evaluate(X_test,Y_test)



[0.437895804643631, 0.858208954334259]

In [14]:
Y_pred = model.predict(test_X)



In [15]:
y_actual = []
for i in test_Y_true:
    if i == 'Non-abusive':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [16]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.68      0.35      0.46       205
           1       0.73      0.91      0.81       399

    accuracy                           0.72       604
   macro avg       0.71      0.63      0.64       604
weighted avg       0.71      0.72      0.69       604



In [17]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Non-abusive')
    else :
        pred_actual.append('Abusive')

In [18]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_c4.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Gujarati_image_1225.jpg,Non-abusive
1,Gujarati_image_1583.jpg,Abusive
2,Gujarati_image_1502.jpg,Abusive
3,Gujarati_image_1487.jpg,Abusive
4,Gujarati_image_1497.jpg,Abusive
