### Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional
from keras.models import Model
from keras.models import Sequential

# task_1

### Handling Pre-processed data

In [2]:
data = pd.read_csv('../preprocess_data.csv')
data.drop(['task_2','text'], axis=1, inplace=True)
data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Hindi_image_1817.jpg,Sarcastic,Vulgar,Abusive,Ba8@ DaNn G@rainiD IR T३ PDBB WRHE W PRD BCEN ...
1,Hindi_image_7.jpg,Non-Sarcastic,Vulgar,Abusive,"Nari nari mat kar pagle, Nari he nark ka dwar...."
2,Hindi_image_1.jpg,Sarcastic,Non Vulgar,Abusive,Kitni push ops maarsakte ho dafly? 5 aur agar ...
3,Hindi_image_32.jpg,Sarcastic,Vulgar,Abusive,अब इसमें मेरी कहां गलती है बताओ.. तरबूज़ वाली क...
4,Hindi_image_1714.jpg,Sarcastic,Non Vulgar,Abusive,"""KUDI MENU KEHNDl... 'MENU JUTI LA DE SONIYE....."


In [3]:
test_data = pd.read_csv('../preprocess_test_data.csv') 
test_data.drop(['task_2','text'], axis=1, inplace=True)
test_data = test_data.drop(['Unnamed: 0'],axis=1)
test_data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Hindi_image_410.jpg,Sarcastic,Vulgar,Non-abusive,"Sign You are Bancho a] _ ~""11|7 have best ffen..."
1,Hindi_image_114.jpg,Non-Sarcastic,Vulgar,Abusive,एक महिला घोडे़ के लिंग लिया| घोड़ा उत्साहित हो...
2,Hindi_image_101.jpg,Non-Sarcastic,Non Vulgar,Non-abusive,एक टीचर ने एक लड़के को पेपर में नक़ल करते पकड लि...
3,Hindi_image_1747.jpg,Sarcastic,Vulgar,Abusive,show me Sckht Launda Kisslay Jha CTrollerlzabu...
4,Hindi_image_19.jpg,Non-Sarcastic,Non Vulgar,Abusive,पति सुहागरात में पत्नी की निप्पल चूसते हुए बोल...


In [4]:
sentences = data['text_clean'].astype(str)
tokenizer = Tokenizer(num_words = 1500,split=' ')
tokenizer.fit_on_texts(sentences)
sequence = tokenizer.texts_to_sequences(sentences)

In [5]:
test_sentences = test_data['text_clean'].astype(str)
test_sequence = tokenizer.texts_to_sequences(test_sentences)

In [6]:
max_seq_len = 2500

index_of_words = tokenizer.word_index
print("No of unique words : ",len(index_of_words))

X = pad_sequences(sequence , maxlen = max_seq_len )
Y = data['task_1']

print(X)

No of unique words :  8859
[[   0    0    0 ... 1173 1174  571]
 [   0    0    0 ...   19   15 1176]
 [   0    0    0 ...    4  773   17]
 ...
 [   0    0    0 ...   36  377   30]
 [   0    0    0 ...   27  122  333]
 [   0    0    0 ...  118  739   89]]


In [7]:
test_X = pad_sequences(test_sequence , maxlen = max_seq_len )
test_Y = test_data['task_1']

print(test_X)

[[   0    0    0 ...   46  963    9]
 [   0    0    0 ...   45   35   26]
 [   0    0    0 ...  545  310    2]
 ...
 [   0    0    0 ...   87  143  333]
 [   0    0    0 ...    0    0  318]
 [   0    0    0 ...    0    0 1297]]


In [8]:
embed_dim = 256
vocabSize = len(index_of_words)
lstm_out = 64

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)
Y_true = Y_test
Y_train = pd.get_dummies(Y_train).values
Y_test = pd.get_dummies(Y_test).values

In [10]:
test_Y_true = test_Y
test_Y = pd.get_dummies(test_Y).values
print("test_Y:", test_Y)

test_Y: [[False  True]
 [ True False]
 [ True False]
 ...
 [False  True]
 [ True False]
 [ True False]]


# MODEL 1

In [11]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# ===== 类别权重计算 =====
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.argmax(Y_train, axis=1))
class_weight_dict = dict(enumerate(class_weights))
print("Class weights:", class_weight_dict)

Class weights: {0: 1.5781758957654723, 1: 0.7318731117824774}


In [12]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim,input_length = 2500))
model.add(Dropout(0.3))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(BatchNormalization())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         2267904   
                                                                 
 dropout (Dropout)           (None, 2500, 256)         0         
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 batch_normalization (Batch  (None, 64)                256       
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                        

In [13]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
checkpoint = ModelCheckpoint(
    "hasoc_a1.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
    restore_best_weights=True
)

In [15]:
model.fit(X_train, Y_train, batch_size = 32, epochs = 10, validation_data=(X_test,Y_test), class_weight=class_weight_dict, callbacks=[checkpoint, early_stop])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.69344, saving model to hasoc_a1.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.69344 to 0.68652, saving model to hasoc_a1.h5
Epoch 3/10
Epoch 3: val_loss did not improve from 0.68652
Epoch 4/10
Epoch 4: val_loss improved from 0.68652 to 0.67687, saving model to hasoc_a1.h5
Epoch 5/10
Epoch 5: val_loss improved from 0.67687 to 0.66471, saving model to hasoc_a1.h5
Epoch 6/10
Epoch 6: val_loss did not improve from 0.66471
Epoch 7/10
Epoch 7: val_loss did not improve from 0.66471
Epoch 8/10
Epoch 8: val_loss did not improve from 0.66471
Restoring model weights from the end of the best epoch: 5.
Epoch 8: early stopping


<keras.src.callbacks.History at 0x285159767c0>

In [16]:
model.load_weights('hasoc_a1.h5')
model.evaluate(X_test,Y_test)



[0.6647120714187622, 0.6279069781303406]

In [17]:
Y_pred = model.predict(test_X)



In [18]:
y_actual = []
for i in test_Y_true:
    if i == 'Sarcastic':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [19]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.82      0.03      0.05       676
           1       0.12      0.96      0.21        93

    accuracy                           0.14       769
   macro avg       0.47      0.49      0.13       769
weighted avg       0.73      0.14      0.07       769



In [20]:
print("Y_pred:", Y_pred)

Y_pred: [[0.4288956  0.5711044 ]
 [0.34199494 0.65800506]
 [0.38701785 0.61298215]
 ...
 [0.31160644 0.6883936 ]
 [0.43951362 0.5604863 ]
 [0.4552256  0.5447745 ]]


In [21]:
print("pred_class:", pred_class)

pred_class: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [22]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Sarcastic')
    else :
        pred_actual.append('Non-Sarcastic')

In [23]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_a.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Hindi_image_410.jpg,Sarcastic
1,Hindi_image_114.jpg,Sarcastic
2,Hindi_image_101.jpg,Sarcastic
3,Hindi_image_1747.jpg,Sarcastic
4,Hindi_image_19.jpg,Sarcastic


# MODEL 2

# Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional
from keras.models import Model
from keras.models import Sequential

# Handling Pre-processed data

In [2]:
data = pd.read_csv('../preprocess_data.csv')
data.drop(['task_2','text'], axis=1, inplace=True)
data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Hindi_image_1817.jpg,Sarcastic,Vulgar,Abusive,Ba8@ DaNn G@rainiD IR T३ PDBB WRHE W PRD BCEN ...
1,Hindi_image_7.jpg,Non-Sarcastic,Vulgar,Abusive,"Nari nari mat kar pagle, Nari he nark ka dwar...."
2,Hindi_image_1.jpg,Sarcastic,Non Vulgar,Abusive,Kitni push ops maarsakte ho dafly? 5 aur agar ...
3,Hindi_image_32.jpg,Sarcastic,Vulgar,Abusive,अब इसमें मेरी कहां गलती है बताओ.. तरबूज़ वाली क...
4,Hindi_image_1714.jpg,Sarcastic,Non Vulgar,Abusive,"""KUDI MENU KEHNDl... 'MENU JUTI LA DE SONIYE....."


In [3]:
test_data = pd.read_csv('../preprocess_test_data.csv') 
test_data.drop(['task_2','text'], axis=1, inplace=True)
test_data = test_data.drop(['Unnamed: 0'],axis=1)
test_data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Hindi_image_410.jpg,Sarcastic,Vulgar,Non-abusive,"Sign You are Bancho a] _ ~""11|7 have best ffen..."
1,Hindi_image_114.jpg,Non-Sarcastic,Vulgar,Abusive,एक महिला घोडे़ के लिंग लिया| घोड़ा उत्साहित हो...
2,Hindi_image_101.jpg,Non-Sarcastic,Non Vulgar,Non-abusive,एक टीचर ने एक लड़के को पेपर में नक़ल करते पकड लि...
3,Hindi_image_1747.jpg,Sarcastic,Vulgar,Abusive,show me Sckht Launda Kisslay Jha CTrollerlzabu...
4,Hindi_image_19.jpg,Non-Sarcastic,Non Vulgar,Abusive,पति सुहागरात में पत्नी की निप्पल चूसते हुए बोल...


In [4]:
sentences = data['text_clean'].astype(str)
tokenizer = Tokenizer(num_words = 1500,split=' ')
tokenizer.fit_on_texts(sentences)
sequence = tokenizer.texts_to_sequences(sentences)

In [5]:
test_sentences = test_data['text_clean'].astype(str)
test_sequence = tokenizer.texts_to_sequences(test_sentences)

In [6]:
max_seq_len = 2500

index_of_words = tokenizer.word_index
print("No of unique words : ",len(index_of_words))

X = pad_sequences(sequence , maxlen = max_seq_len )
Y = data['task_1']

print(X)

No of unique words :  8859
[[   0    0    0 ... 1173 1174  571]
 [   0    0    0 ...   19   15 1176]
 [   0    0    0 ...    4  773   17]
 ...
 [   0    0    0 ...   36  377   30]
 [   0    0    0 ...   27  122  333]
 [   0    0    0 ...  118  739   89]]


In [7]:
test_X = pad_sequences(test_sequence , maxlen = max_seq_len )
test_Y = test_data['task_1']

print(test_X)

[[   0    0    0 ...   46  963    9]
 [   0    0    0 ...   45   35   26]
 [   0    0    0 ...  545  310    2]
 ...
 [   0    0    0 ...   87  143  333]
 [   0    0    0 ...    0    0  318]
 [   0    0    0 ...    0    0 1297]]


In [8]:
embed_dim = 256
vocabSize = len(index_of_words)
lstm_out = 64

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)
Y_true = Y_test
Y_train = pd.get_dummies(Y_train).values
Y_test = pd.get_dummies(Y_test).values

In [10]:
test_Y_true = test_Y
test_Y = pd.get_dummies(test_Y).values
print("test_Y:", test_Y)

test_Y: [[False  True]
 [ True False]
 [ True False]
 ...
 [False  True]
 [ True False]
 [ True False]]


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.optimizers import Adam

model = Sequential()

# Embedding 层
model.add(Embedding(input_dim=vocabSize, output_dim=embed_dim, input_length=2500))
model.add(Dropout(0.4))

# 双向 LSTM + 正则化
model.add(Bidirectional(LSTM(units=lstm_out, dropout=0.4, recurrent_dropout=0.3)))

# BN 层
model.add(BatchNormalization())

# 全连接 + Dropout
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

# 输出层（sigmoid）
model.add(Dense(2, activation='sigmoid'))

# 编译
optimizer = Adam(learning_rate=1e-4)  # 学习率调低
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         2267904   
                                                                 
 dropout (Dropout)           (None, 2500, 256)         0         
                                                                 
 bidirectional (Bidirection  (None, 128)               164352    
 al)                                                             
                                                                 
 batch_normalization (Batch  (None, 128)               512       
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0

In [12]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
checkpoint = ModelCheckpoint(
    "hasoc_b1.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')
early_stop = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=2, verbose=1)

In [13]:
from sklearn.utils.class_weight import compute_class_weight
# 假设Y_train已独热编码
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.argmax(Y_train, axis=1))
class_weight_dict = dict(enumerate(class_weights))

model.fit(X_train, Y_train, batch_size = 32, epochs = 5, class_weight=class_weight_dict, validation_data=(X_test, Y_test), callbacks=[checkpoint, early_stop, reduce_lr])

Epoch 1/5
Epoch 1: val_loss improved from inf to 0.69149, saving model to hasoc_b1.h5
Epoch 2/5


  saving_api.save_model(


Epoch 2: val_loss improved from 0.69149 to 0.69063, saving model to hasoc_b1.h5
Epoch 3/5
Epoch 3: val_loss did not improve from 0.69063
Epoch 4/5
Epoch 4: val_loss improved from 0.69063 to 0.69034, saving model to hasoc_b1.h5
Epoch 5/5
Epoch 5: val_loss improved from 0.69034 to 0.69008, saving model to hasoc_b1.h5


<keras.src.callbacks.History at 0x22305959460>

In [14]:
model.load_weights('hasoc_b1.h5')
model.evaluate(X_test,Y_test)



[0.6900755167007446, 0.5581395626068115]

In [15]:
Y_pred = model.predict(test_X)



In [16]:
y_actual = []
for i in test_Y_true:
    if i =='Sarcastic':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [17]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.95      0.21      0.35       676
           1       0.14      0.92      0.24        93

    accuracy                           0.30       769
   macro avg       0.55      0.57      0.29       769
weighted avg       0.85      0.30      0.33       769



In [18]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Sarcastic')
    else :
        pred_actual.append('Non-Sarcastic')

In [19]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_b.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Hindi_image_410.jpg,Sarcastic
1,Hindi_image_114.jpg,Sarcastic
2,Hindi_image_101.jpg,Sarcastic
3,Hindi_image_1747.jpg,Sarcastic
4,Hindi_image_19.jpg,Sarcastic


# MODEL 3

In [11]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_c1.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [12]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim, input_length=2500))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

from sklearn.utils.class_weight import compute_class_weight
# 假设Y_train已独热编码
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.argmax(Y_train, axis=1))
class_weight_dict = dict(enumerate(class_weights))

model.fit(X_train, Y_train, epochs=10, batch_size=64, class_weight=class_weight_dict, validation_data=(X_test,Y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.70045, saving model to hasoc_c1.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss did not improve from 0.70045
Epoch 3/10
Epoch 3: val_loss improved from 0.70045 to 0.69831, saving model to hasoc_c1.h5
Epoch 4/10
Epoch 4: val_loss did not improve from 0.69831
Epoch 5/10
Epoch 5: val_loss did not improve from 0.69831
Epoch 6/10
Epoch 6: val_loss did not improve from 0.69831
Epoch 7/10
Epoch 7: val_loss did not improve from 0.69831
Epoch 8/10
Epoch 8: val_loss did not improve from 0.69831
Epoch 9/10
Epoch 9: val_loss did not improve from 0.69831
Epoch 10/10
Epoch 10: val_loss did not improve from 0.69831


<keras.src.callbacks.History at 0x2249fa18880>

In [13]:
model.load_weights('hasoc_c1.h5')
model.evaluate(X_test,Y_test)



[0.6983127593994141, 0.5755813717842102]

In [14]:
Y_pred = model.predict(test_X)



In [15]:
y_actual = []
for i in test_Y_true:
    if i =='Sarcastic':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [16]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.92      0.23      0.37       676
           1       0.13      0.86      0.23        93

    accuracy                           0.30       769
   macro avg       0.53      0.54      0.30       769
weighted avg       0.83      0.30      0.35       769



In [17]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Sarcastic')
    else :
        pred_actual.append('Non-Sarcastic')

In [18]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_c.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Hindi_image_410.jpg,Sarcastic
1,Hindi_image_114.jpg,Sarcastic
2,Hindi_image_101.jpg,Sarcastic
3,Hindi_image_1747.jpg,Sarcastic
4,Hindi_image_19.jpg,Sarcastic


# task_3

### Handling Pre-processed data

In [4]:
data = pd.read_csv('../preprocess_data.csv')
data.drop(['task_2','text'], axis=1, inplace=True)
data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Hindi_image_1817.jpg,Sarcastic,Vulgar,Abusive,Ba8@ DaNn G@rainiD IR T३ PDBB WRHE W PRD BCEN ...
1,Hindi_image_7.jpg,Non-Sarcastic,Vulgar,Abusive,"Nari nari mat kar pagle, Nari he nark ka dwar...."
2,Hindi_image_1.jpg,Sarcastic,Non Vulgar,Abusive,Kitni push ops maarsakte ho dafly? 5 aur agar ...
3,Hindi_image_32.jpg,Sarcastic,Vulgar,Abusive,अब इसमें मेरी कहां गलती है बताओ.. तरबूज़ वाली क...
4,Hindi_image_1714.jpg,Sarcastic,Non Vulgar,Abusive,"""KUDI MENU KEHNDl... 'MENU JUTI LA DE SONIYE....."


In [5]:
test_data = pd.read_csv('../preprocess_test_data.csv') 
test_data.drop(['task_2','text'], axis=1, inplace=True)
test_data = test_data.drop(['Unnamed: 0'],axis=1)
test_data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Hindi_image_410.jpg,Sarcastic,Vulgar,Non-abusive,"Sign You are Bancho a] _ ~""11|7 have best ffen..."
1,Hindi_image_114.jpg,Non-Sarcastic,Vulgar,Abusive,एक महिला घोडे़ के लिंग लिया| घोड़ा उत्साहित हो...
2,Hindi_image_101.jpg,Non-Sarcastic,Non Vulgar,Non-abusive,एक टीचर ने एक लड़के को पेपर में नक़ल करते पकड लि...
3,Hindi_image_1747.jpg,Sarcastic,Vulgar,Abusive,show me Sckht Launda Kisslay Jha CTrollerlzabu...
4,Hindi_image_19.jpg,Non-Sarcastic,Non Vulgar,Abusive,पति सुहागरात में पत्नी की निप्पल चूसते हुए बोल...


In [5]:
sentences = data['text_clean'].astype(str)
tokenizer = Tokenizer(num_words = 1500,split=' ')
tokenizer.fit_on_texts(sentences)
sequence = tokenizer.texts_to_sequences(sentences)

In [6]:
test_sentences = test_data['text_clean'].astype(str)
test_sequence = tokenizer.texts_to_sequences(test_sentences)

In [7]:
max_seq_len = 2500

index_of_words = tokenizer.word_index
print("No of unique words : ",len(index_of_words))

X = pad_sequences(sequence , maxlen = max_seq_len )
Y = data['task_3']

print(X)

No of unique words :  8859
[[   0    0    0 ... 1173 1174  571]
 [   0    0    0 ...   19   15 1176]
 [   0    0    0 ...    4  773   17]
 ...
 [   0    0    0 ...   36  377   30]
 [   0    0    0 ...   27  122  333]
 [   0    0    0 ...  118  739   89]]


In [8]:
test_X = pad_sequences(test_sequence , maxlen = max_seq_len )
test_Y = test_data['task_3']

print(test_X)

[[   0    0    0 ...   46  963    9]
 [   0    0    0 ...   45   35   26]
 [   0    0    0 ...  545  310    2]
 ...
 [   0    0    0 ...   87  143  333]
 [   0    0    0 ...    0    0  318]
 [   0    0    0 ...    0    0 1297]]


In [9]:
embed_dim = 256
vocabSize = len(index_of_words)
lstm_out = 64

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)
Y_true = Y_test
Y_train = pd.get_dummies(Y_train).values
Y_test = pd.get_dummies(Y_test).values

In [11]:
test_Y_true = test_Y
test_Y = pd.get_dummies(test_Y).values
print("test_Y:",test_Y)

test_Y: [[False  True]
 [False  True]
 [ True False]
 ...
 [ True False]
 [ True False]
 [ True False]]


# MODEL 1

In [11]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim,input_length = 2500))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         2267904   
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 2350210 (8.97 MB)
Trainable params: 2350210 (8.97 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [12]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_a3.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [13]:
model.fit(X_train, Y_train, batch_size = 32, epochs = 10 , validation_data = (X_test,Y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.61600, saving model to hasoc_a3.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.61600 to 0.58734, saving model to hasoc_a3.h5
Epoch 3/10
Epoch 3: val_loss did not improve from 0.58734
Epoch 4/10
Epoch 4: val_loss did not improve from 0.58734
Epoch 5/10
Epoch 5: val_loss did not improve from 0.58734
Epoch 6/10
Epoch 6: val_loss did not improve from 0.58734
Epoch 7/10
Epoch 7: val_loss did not improve from 0.58734
Epoch 8/10
Epoch 8: val_loss did not improve from 0.58734
Epoch 9/10
Epoch 9: val_loss did not improve from 0.58734
Epoch 10/10
Epoch 10: val_loss did not improve from 0.58734


<keras.src.callbacks.History at 0x1c561745820>

In [14]:
model.load_weights('hasoc_a3.h5')
model.evaluate(X_test,Y_test)



[0.5873427987098694, 0.75]

In [15]:
Y_pred = model.predict(test_X)



In [16]:
y_actual = []
for i in test_Y_true:
    if i =='Vulgar':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [17]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       708
           1       0.03      0.03      0.03        61

    accuracy                           0.83       769
   macro avg       0.47      0.47      0.47       769
weighted avg       0.85      0.83      0.84       769



In [18]:
print("Y_pred:", Y_pred)

Y_pred: [[0.8319924  0.16800761]
 [0.28091756 0.7190825 ]
 [0.41645622 0.5835437 ]
 ...
 [0.9059605  0.09403948]
 [0.9685721  0.03142795]
 [0.9639709  0.03602908]]


In [19]:
print("pred_class:", pred_class)

pred_class: [0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [20]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Vulgar')
    else :
        pred_actual.append('Non Vulgar')

In [21]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_a3.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Hindi_image_410.jpg,Non Vulgar
1,Hindi_image_114.jpg,Vulgar
2,Hindi_image_101.jpg,Vulgar
3,Hindi_image_1747.jpg,Non Vulgar
4,Hindi_image_19.jpg,Vulgar


# MODEL 2

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

model = Sequential()

# Embedding层，增加防止过拟合的Dropout
model.add(Embedding(input_dim=vocabSize, output_dim=embed_dim, input_length=2500))
model.add(Dropout(0.3))

# LSTM层，增加recurrent_dropout 和 output dropout
model.add(LSTM(units=lstm_out, dropout=0.3, recurrent_dropout=0.3, return_sequences=False))

# Batch Normalization增强泛化
model.add(BatchNormalization())

# 全连接层，Softmax输出2分类，建议用categorical_crossentropy
model.add(Dense(2, activation='softmax'))

# 编译
optimizer = Adam(learning_rate=0.001)  # 学习率也可调整
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         2267904   
                                                                 
 dropout (Dropout)           (None, 2500, 256)         0         
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 batch_normalization (Batch  (None, 64)                256       
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 2350466 (8.97 MB)
Trainable params: 2350338 (8.97 MB)
Non-trainable params: 128 (512.00 Byte)
______________

In [12]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_b3.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [13]:
model.fit(X_train, Y_train, batch_size = 32, epochs = 10, validation_data = (X_test,Y_test), callbacks = [checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.63237, saving model to hasoc_b3.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.63237 to 0.61393, saving model to hasoc_b3.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.61393 to 0.61031, saving model to hasoc_b3.h5
Epoch 4/10
Epoch 4: val_loss improved from 0.61031 to 0.60756, saving model to hasoc_b3.h5
Epoch 5/10
Epoch 5: val_loss improved from 0.60756 to 0.60431, saving model to hasoc_b3.h5
Epoch 6/10
Epoch 6: val_loss improved from 0.60431 to 0.60421, saving model to hasoc_b3.h5
Epoch 7/10
Epoch 7: val_loss improved from 0.60421 to 0.60018, saving model to hasoc_b3.h5
Epoch 8/10
Epoch 8: val_loss improved from 0.60018 to 0.59818, saving model to hasoc_b3.h5
Epoch 9/10
Epoch 9: val_loss improved from 0.59818 to 0.58181, saving model to hasoc_b3.h5
Epoch 10/10
Epoch 10: val_loss improved from 0.58181 to 0.57877, saving model to hasoc_b3.h5


<keras.src.callbacks.History at 0x1eacd5f0b50>

In [14]:
model.load_weights('hasoc_b3.h5')
model.evaluate(X_test,Y_test)



[0.5787697434425354, 0.7093023061752319]

In [16]:
Y_pred = model.predict(test_X)



In [17]:
y_actual = []
for i in test_Y_true:
    if i =='Vulgar':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [18]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.92      0.97      0.94       708
           1       0.00      0.00      0.00        61

    accuracy                           0.89       769
   macro avg       0.46      0.49      0.47       769
weighted avg       0.85      0.89      0.87       769



In [19]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Vulgar')
    else :
        pred_actual.append('Non Vulgar')

In [20]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_b3.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Hindi_image_410.jpg,Non Vulgar
1,Hindi_image_114.jpg,Non Vulgar
2,Hindi_image_101.jpg,Non Vulgar
3,Hindi_image_1747.jpg,Non Vulgar
4,Hindi_image_19.jpg,Non Vulgar


# MODEL 3

In [12]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_c3.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [13]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim, input_length=2500))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

from sklearn.utils.class_weight import compute_class_weight
# 假设Y_train已独热编码
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.argmax(Y_train, axis=1))
class_weight_dict = dict(enumerate(class_weights))

model.fit(X_train, Y_train, epochs=10, batch_size=64, class_weight=class_weight_dict, validation_data=(X_test,Y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.66124, saving model to hasoc_c3.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.66124 to 0.64862, saving model to hasoc_c3.h5
Epoch 3/10
Epoch 3: val_loss did not improve from 0.64862
Epoch 4/10
Epoch 4: val_loss did not improve from 0.64862
Epoch 5/10
Epoch 5: val_loss did not improve from 0.64862
Epoch 6/10
Epoch 6: val_loss did not improve from 0.64862
Epoch 7/10
Epoch 7: val_loss did not improve from 0.64862
Epoch 8/10
Epoch 8: val_loss did not improve from 0.64862
Epoch 9/10
Epoch 9: val_loss did not improve from 0.64862
Epoch 10/10
Epoch 10: val_loss did not improve from 0.64862


<keras.src.callbacks.History at 0x1d385595460>

In [14]:
model.load_weights('hasoc_c3.h5')
model.evaluate(X_test,Y_test)



[0.6486157178878784, 0.6395348906517029]

In [15]:
Y_pred = model.predict(test_X)



In [16]:
y_actual = []
for i in test_Y_true:
    if i == 'Vulgar':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [17]:
print(classification_report(y_actual, pred_class))

              precision    recall  f1-score   support

           0       0.91      0.71      0.80       708
           1       0.06      0.21      0.09        61

    accuracy                           0.67       769
   macro avg       0.49      0.46      0.44       769
weighted avg       0.84      0.67      0.74       769



In [18]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Vulgar')
    else :
        pred_actual.append('Non Vulgar')

In [19]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_c3.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Hindi_image_410.jpg,Non Vulgar
1,Hindi_image_114.jpg,Vulgar
2,Hindi_image_101.jpg,Vulgar
3,Hindi_image_1747.jpg,Non Vulgar
4,Hindi_image_19.jpg,Vulgar


# task_4

### Handling Pre-processed data

In [2]:
data = pd.read_csv('../preprocess_data.csv')
data.drop(['task_2','text'], axis=1, inplace=True)
data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Hindi_image_1817.jpg,Sarcastic,Vulgar,Abusive,Ba8@ DaNn G@rainiD IR T३ PDBB WRHE W PRD BCEN ...
1,Hindi_image_7.jpg,Non-Sarcastic,Vulgar,Abusive,"Nari nari mat kar pagle, Nari he nark ka dwar...."
2,Hindi_image_1.jpg,Sarcastic,Non Vulgar,Abusive,Kitni push ops maarsakte ho dafly? 5 aur agar ...
3,Hindi_image_32.jpg,Sarcastic,Vulgar,Abusive,अब इसमें मेरी कहां गलती है बताओ.. तरबूज़ वाली क...
4,Hindi_image_1714.jpg,Sarcastic,Non Vulgar,Abusive,"""KUDI MENU KEHNDl... 'MENU JUTI LA DE SONIYE....."


In [3]:
test_data = pd.read_csv('../preprocess_test_data.csv') 
test_data.drop(['task_2','text'], axis=1, inplace=True)
test_data = test_data.drop(['Unnamed: 0'],axis=1)
test_data.head()

Unnamed: 0,_id,task_1,task_3,task_4,text_clean
0,Hindi_image_410.jpg,Sarcastic,Vulgar,Non-abusive,"Sign You are Bancho a] _ ~""11|7 have best ffen..."
1,Hindi_image_114.jpg,Non-Sarcastic,Vulgar,Abusive,एक महिला घोडे़ के लिंग लिया| घोड़ा उत्साहित हो...
2,Hindi_image_101.jpg,Non-Sarcastic,Non Vulgar,Non-abusive,एक टीचर ने एक लड़के को पेपर में नक़ल करते पकड लि...
3,Hindi_image_1747.jpg,Sarcastic,Vulgar,Abusive,show me Sckht Launda Kisslay Jha CTrollerlzabu...
4,Hindi_image_19.jpg,Non-Sarcastic,Non Vulgar,Abusive,पति सुहागरात में पत्नी की निप्पल चूसते हुए बोल...


In [4]:
sentences = data['text_clean'].astype(str)
tokenizer = Tokenizer(num_words = 1500,split=' ')
tokenizer.fit_on_texts(sentences)
sequence = tokenizer.texts_to_sequences(sentences)

In [5]:
test_sentences = test_data['text_clean'].astype(str)
test_sequence = tokenizer.texts_to_sequences(test_sentences)

In [6]:
max_seq_len = 2500

index_of_words = tokenizer.word_index
print("No of unique words : ",len(index_of_words))

X = pad_sequences(sequence , maxlen = max_seq_len )
Y = data['task_4']

print(X)

No of unique words :  8859
[[   0    0    0 ... 1173 1174  571]
 [   0    0    0 ...   19   15 1176]
 [   0    0    0 ...    4  773   17]
 ...
 [   0    0    0 ...   36  377   30]
 [   0    0    0 ...   27  122  333]
 [   0    0    0 ...  118  739   89]]


In [7]:
test_X = pad_sequences(test_sequence , maxlen = max_seq_len )
test_Y = test_data['task_4']

print(test_X)

[[   0    0    0 ...   46  963    9]
 [   0    0    0 ...   45   35   26]
 [   0    0    0 ...  545  310    2]
 ...
 [   0    0    0 ...   87  143  333]
 [   0    0    0 ...    0    0  318]
 [   0    0    0 ...    0    0 1297]]


In [8]:
embed_dim = 256
vocabSize = len(index_of_words)
lstm_out = 64

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)
Y_true = Y_test
Y_train = pd.get_dummies(Y_train).values
Y_test = pd.get_dummies(Y_test).values

In [10]:
test_Y_true = test_Y
test_Y = pd.get_dummies(test_Y).values
print("test_Y:", test_Y)

test_Y: [[False  True]
 [ True False]
 [False  True]
 ...
 [False  True]
 [False  True]
 [False  True]]


# MODEL 1

In [11]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim,input_length = 2500))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         2267904   
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 2350210 (8.97 MB)
Trainable params: 2350210 (8.97 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [12]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_a4.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [13]:
model.fit(X_train,Y_train ,batch_size = 32, epochs = 10 ,validation_data=(X_test,Y_test) , callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.54065, saving model to hasoc_a4.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.54065 to 0.51541, saving model to hasoc_a4.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.51541 to 0.48112, saving model to hasoc_a4.h5
Epoch 4/10
Epoch 4: val_loss did not improve from 0.48112
Epoch 5/10
Epoch 5: val_loss did not improve from 0.48112
Epoch 6/10
Epoch 6: val_loss did not improve from 0.48112
Epoch 7/10
Epoch 7: val_loss did not improve from 0.48112
Epoch 8/10
Epoch 8: val_loss did not improve from 0.48112
Epoch 9/10
Epoch 9: val_loss did not improve from 0.48112
Epoch 10/10
Epoch 10: val_loss did not improve from 0.48112


<keras.src.callbacks.History at 0x1d2073845e0>

In [14]:
model.load_weights('hasoc_a4.h5')
model.evaluate(X_test,Y_test)



[0.48112398386001587, 0.7616279125213623]

In [15]:
Y_pred = model.predict(test_X)



In [16]:
y_actual = []
for i in test_Y_true:
    if i == 'Non-abusive':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [17]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.06      0.14      0.08        21
           1       0.97      0.94      0.96       748

    accuracy                           0.92       769
   macro avg       0.52      0.54      0.52       769
weighted avg       0.95      0.92      0.93       769



In [18]:
print("Y_pred:", Y_pred)

Y_pred: [[0.09236269 0.9076373 ]
 [0.18006909 0.8199309 ]
 [0.20177443 0.7982255 ]
 ...
 [0.0238966  0.97610337]
 [0.351206   0.648794  ]
 [0.43851286 0.56148714]]


In [19]:
print("pred_class:", pred_class)

pred_class: [1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 

In [20]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Non-abusive')
    else :
        pred_actual.append('Abusive')

In [21]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_a4.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Hindi_image_410.jpg,Non-abusive
1,Hindi_image_114.jpg,Non-abusive
2,Hindi_image_101.jpg,Non-abusive
3,Hindi_image_1747.jpg,Non-abusive
4,Hindi_image_19.jpg,Non-abusive


# MODEL 2

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

model = Sequential()

# Embedding层，增加防止过拟合的Dropout
model.add(Embedding(input_dim=vocabSize, output_dim=embed_dim, input_length=2500))
model.add(Dropout(0.3))

# LSTM层，增加recurrent_dropout 和 output_dropout
model.add(LSTM(units=lstm_out, dropout=0.3, recurrent_dropout=0.3, return_sequences=False))

# Batch Normalization增强泛化
model.add(BatchNormalization())

# 全连接层，Softmax输出2分类，建议用categorical_crossentropy
model.add(Dense(2, activation='softmax'))

# 编译
optimizer = Adam(learning_rate=0.001)  # 学习率也可调整
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2500, 256)         2267904   
                                                                 
 dropout (Dropout)           (None, 2500, 256)         0         
                                                                 
 lstm (LSTM)                 (None, 64)                82176     
                                                                 
 batch_normalization (Batch  (None, 64)                256       
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 2350466 (8.97 MB)
Trainable params: 2350338 (8.97 MB)
Non-trainable params: 128 (512.00 Byte)
______________

In [12]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
checkpoint = ModelCheckpoint(
    "hasoc_b4.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

early_stop = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=2, verbose=1)

In [13]:
from sklearn.utils.class_weight import compute_class_weight
# 假设Y_train已独热编码
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.argmax(Y_train, axis=1))
class_weight_dict = dict(enumerate(class_weights))

model.fit(X_train, Y_train, batch_size = 32, class_weight=class_weight_dict, epochs = 10, validation_data=(X_test,Y_test), callbacks=[checkpoint, early_stop, reduce_lr])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.68081, saving model to hasoc_b4.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.68081 to 0.60954, saving model to hasoc_b4.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.60954 to 0.55656, saving model to hasoc_b4.h5
Epoch 4/10
Epoch 4: val_loss improved from 0.55656 to 0.53398, saving model to hasoc_b4.h5
Epoch 5/10
Epoch 5: val_loss improved from 0.53398 to 0.51784, saving model to hasoc_b4.h5
Epoch 6/10
Epoch 6: val_loss improved from 0.51784 to 0.51004, saving model to hasoc_b4.h5
Epoch 7/10
Epoch 7: val_loss improved from 0.51004 to 0.49740, saving model to hasoc_b4.h5
Epoch 8/10
Epoch 8: val_loss improved from 0.49740 to 0.49284, saving model to hasoc_b4.h5
Epoch 9/10
Epoch 9: val_loss did not improve from 0.49284
Epoch 10/10
Epoch 10: val_loss did not improve from 0.49284

Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.


<keras.src.callbacks.History at 0x1d0eba98100>

In [14]:
model.load_weights('hasoc_b4.h5')
model.evaluate(X_test,Y_test)



[0.49283891916275024, 0.7790697813034058]

In [15]:
Y_pred = model.predict(test_X)



In [16]:
y_actual = []
for i in test_Y_true:
    if i == 'Non-abusive':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [17]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.07      0.05      0.06        21
           1       0.97      0.98      0.98       748

    accuracy                           0.96       769
   macro avg       0.52      0.51      0.52       769
weighted avg       0.95      0.96      0.95       769



In [18]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Non-abusive')
    else :
        pred_actual.append('Abusive')

In [19]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_b4.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Hindi_image_410.jpg,Non-abusive
1,Hindi_image_114.jpg,Non-abusive
2,Hindi_image_101.jpg,Non-abusive
3,Hindi_image_1747.jpg,Non-abusive
4,Hindi_image_19.jpg,Non-abusive


# MODEL 3

In [11]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    "hasoc_c4.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False, 
    mode='auto')

In [12]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim, input_length=2500))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

from sklearn.utils.class_weight import compute_class_weight
# 假设Y_train已独热编码
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.argmax(Y_train, axis=1))
class_weight_dict = dict(enumerate(class_weights))

model.fit(X_train, Y_train, epochs=10, batch_size=64, class_weight=class_weight_dict, validation_data=(X_test,Y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.67930, saving model to hasoc_c4.h5
Epoch 2/10


  saving_api.save_model(


Epoch 2: val_loss improved from 0.67930 to 0.63836, saving model to hasoc_c4.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.63836 to 0.54868, saving model to hasoc_c4.h5
Epoch 4/10
Epoch 4: val_loss did not improve from 0.54868
Epoch 5/10
Epoch 5: val_loss did not improve from 0.54868
Epoch 6/10
Epoch 6: val_loss did not improve from 0.54868
Epoch 7/10
Epoch 7: val_loss did not improve from 0.54868
Epoch 8/10
Epoch 8: val_loss did not improve from 0.54868
Epoch 9/10
Epoch 9: val_loss did not improve from 0.54868
Epoch 10/10
Epoch 10: val_loss did not improve from 0.54868


<keras.src.callbacks.History at 0x249b0a85880>

In [13]:
model.load_weights('hasoc_c4.h5')
model.evaluate(X_test,Y_test)



[0.5486761927604675, 0.7093023061752319]

In [14]:
Y_pred = model.predict(test_X)



In [15]:
y_actual = []
for i in test_Y_true:
    if i == 'Non-abusive':
        y_actual.append(1)
    else :
        y_actual.append(0)

pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))

In [16]:
print(classification_report(y_actual , pred_class))

              precision    recall  f1-score   support

           0       0.02      0.29      0.04        21
           1       0.97      0.67      0.79       748

    accuracy                           0.66       769
   macro avg       0.50      0.48      0.42       769
weighted avg       0.94      0.66      0.77       769



In [17]:
pred_actual = []
for i in pred_class:
    if i == 1:
        pred_actual.append('Non-abusive')
    else :
        pred_actual.append('Abusive')

In [18]:
test_data = test_data[["_id"]]
test_data["label"] = pred_actual
test_data.to_csv('dl_lstm_c4.csv',index=False)
test_data.head()

Unnamed: 0,_id,label
0,Hindi_image_410.jpg,Non-abusive
1,Hindi_image_114.jpg,Non-abusive
2,Hindi_image_101.jpg,Non-abusive
3,Hindi_image_1747.jpg,Non-abusive
4,Hindi_image_19.jpg,Non-abusive
