In [1]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import MaxPool1D, Flatten, Concatenate, TextVectorization, Embedding, Bidirectional, LSTM, Conv1D, Dense, Dropout
from tensorflow.keras import Model, Input
from keras.callbacks import EarlyStopping,ModelCheckpoint
from imblearn.over_sampling import SMOTE

In [3]:
import json
import nltk

In [4]:
def read_json_to_dataframe(file_path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line; the columns are the keys of the objects.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Pin the encoding so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (for example stray newlines at the end of the file) are
        # skipped instead of being handed to json.loads, which would raise.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

# Read the two training domains and the test set; each file is expected in the
# working directory and holds one JSON object per line.
train1, train2, test = map(
    read_json_to_dataframe,
    ('domain1_train_data.json', 'domain2_train_data.json', 'test_data.json'),
)

In [5]:
# Tag every domain-1 row with its source domain so the origin is still known
# after the frames are concatenated.
train1 = train1.assign(domain=1)
train1

Unnamed: 0,text,label,id,domain
0,"[16, 231, 543, 5, 15, 43, 8282, 94, 231, 1129,...",1,0,1
1,"[16, 4046, 138, 10, 2, 1809, 2007, 3763, 14, 4...",1,1,1
2,"[1108, 16550, 3, 6168, 3, 160, 284, 19, 49, 46...",1,2,1
3,"[1802, 27, 16, 25, 48, 451, 632, 3, 2, 2164, 2...",1,3,1
4,"[16, 19, 302, 93, 97, 43, 952, 118, 1, 16, 528...",1,4,1
...,...,...,...,...
4995,"[43, 529, 16, 19, 775, 201, 20, 48, 10, 550, 2...",0,4995,1
4996,"[12, 158, 97, 5, 543, 174, 1396, 2, 506, 287, ...",0,4996,1
4997,"[15319, 27775, 9, 27, 2847, 7207, 8, 3234, 1, ...",0,4997,1
4998,"[16, 373, 177, 76, 5, 35, 1342, 1318, 196, 16,...",0,4998,1


In [6]:
# Tag every domain-2 row with its source domain so the origin is still known
# after the frames are concatenated.
train2 = train2.assign(domain=2)
train2

Unnamed: 0,text,label,id,domain
0,"[12, 920, 7, 1266, 28, 9884, 1640, 116, 11, 13...",1,5000,2
1,"[783, 397, 253, 5797, 9379, 22, 793, 11838, 10...",1,5001,2
2,"[888, 14851, 323, 9, 27, 1377, 584, 195, 3, 13...",1,5002,2
3,"[228, 1161, 5815, 379, 9, 941, 10, 2, 316, 4, ...",1,5003,2
4,"[736, 19, 37, 813, 45, 6723, 27, 626, 8, 2, 34...",1,5004,2
...,...,...,...,...
12995,"[8, 15, 71, 12, 155, 6903, 3, 7, 2300, 352, 37...",0,17995,2
12996,"[12, 155, 7, 420, 4, 228, 89, 206, 5157, 10, 5...",0,17996,2
12997,"[216, 2, 379, 4, 7, 332, 179, 386, 160, 28, 11...",0,17997,2
12998,"[7, 2787, 9, 1026, 7, 5376, 620, 1, 4303, 7, 1...",0,17998,2


In [7]:
def tokens_to_text(tokens):
    """Join a sequence of token ids into one space-separated string.

    A value that is already a string is returned unchanged, so running this
    cell a second time does not split the joined text into single characters.
    """
    if isinstance(tokens, str):
        return tokens
    return ' '.join(map(str, tokens))


# Turn each list of token ids into a single whitespace-separated string, the
# input format used by the vectorizers further down.
train1['text'] = train1['text'].apply(tokens_to_text)
train2['text'] = train2['text'].apply(tokens_to_text)
test['text'] = test['text'].apply(tokens_to_text)

In [8]:
# Short handles on the text and label columns of each frame.
tokenized_texts_train1, Y_train1 = train1['text'], train1['label']
tokenized_texts_train2, Y_train2 = train2['text'], train2['label']
tokenized_texts_test = test['text']

In [9]:
# Domain 1 followed by domain 2. The original row indices are kept, so index
# values repeat across the two domains.
train = pd.concat(objs=[train1, train2], axis=0)
tokenized_texts_train, Y_train = train['text'], train['label']

In [10]:
# Domain-2 rows whose label is 1.
is_label_one = train2['label'] == 1
train3 = train2.loc[is_label_one]
train3

Unnamed: 0,text,label,id,domain
0,12 920 7 1266 28 9884 1640 116 11 1342 1533 28...,1,5000,2
1,783 397 253 5797 9379 22 793 11838 10 607 6324...,1,5001,2
2,888 14851 323 9 27 1377 584 195 3 137 10 2732 ...,1,5002,2
3,228 1161 5815 379 9 941 10 2 316 4 2693 594 87...,1,5003,2
4,736 19 37 813 45 6723 27 626 8 2 3446 4 564 34...,1,5004,2
...,...,...,...,...
1495,8 15 71 3 12 820 2 5912 102 4 70 26779 1359 47...,1,6495,2
1496,848 17 636 214 6 3011 172 3 15 71 1759 66215 3...,1,6496,2
1497,2 133 2633 2965 10876 2141 1445 83 82 1948 197...,1,6497,2
1498,38 229 4012 10 404 10200 26 24 20692 47 7824 1...,1,6498,2


In [11]:
# All of domain 1 (labels 0 and 1) plus the label-1 rows of domain 2.
train1_2 = pd.concat(objs=[train1, train3], axis=0)
train12_text, Y_train12 = train1_2['text'], train1_2['label']

In [12]:
# Domain-1 training rows followed by the test rows.
# NOTE(review): the test frame presumably carries no labels, in which case
# Y_train_test1 holds NaN for those rows - confirm before using it.
train_test1 = pd.concat(objs=[train1, test], axis=0)
train_test1_text, Y_train_test1 = train_test1['text'], train_test1['label']

In [13]:
# Domain-2 training rows followed by the test rows.
# NOTE(review): the test frame presumably carries no labels, in which case
# Y_train_test2 holds NaN for those rows - confirm before using it.
train_test2 = pd.concat(objs=[train2, test], axis=0)
train_test2_text, Y_train_test2 = train_test2['text'], train_test2['label']

In [14]:
# Hyper-parameters shared by the text vectorizers and the embedding layers.
max_features = 100000    # vocabulary cap; also used as the Embedding input_dim
embedding_dim = 64       # width of each embedding vector
sequence_length = 500    # length of every encoded sample

# Integer encoder for domain 1, with its vocabulary learned from the domain-1
# training text together with the test text.
# NOTE(review): per the Keras docs a *tuple* for `ngrams` selects exactly those
# sizes (1-grams and 7-grams here), whereas an integer means "up to n" -
# confirm that (1, 7) is what was intended.
vectorize_layer1 = TextVectorization(
    output_mode="int",
    max_tokens=max_features,
    pad_to_max_tokens=True,
    output_sequence_length=sequence_length,
    ngrams=(1, 7),
)
vectorize_layer1.adapt(train_test1_text)

# Encoded domain-1 training text as a NumPy array.
Text1 = vectorize_layer1(tokenized_texts_train1).numpy()

In [15]:
# Integer encoder for domain 2, with its vocabulary learned from the domain-2
# training text together with the test text.
# NOTE(review): per the Keras docs a *tuple* for `ngrams` selects exactly those
# sizes (1-grams and 7-grams here), whereas an integer means "up to n" -
# confirm that (1, 7) is what was intended.
vectorize_layer2 = TextVectorization(
    output_mode="int",
    max_tokens=max_features,
    pad_to_max_tokens=True,
    output_sequence_length=sequence_length,
    ngrams=(1, 7),
)
vectorize_layer2.adapt(train_test2_text)

# Encoded domain-2 training text as a NumPy array.
Text2 = vectorize_layer2(tokenized_texts_train2).numpy()

In [16]:
# Requested number of rows per class after oversampling domain 2.
desired_samples = {0: 11500, 1: 11500}

# NOTE(review): SMOTE interpolates between feature vectors, and the features
# here are vocabulary indices - confirm that synthetic rows built this way are
# acceptable for the embedding model.
# NOTE(review): Text2 is overwritten, so re-running only this cell pairs the
# resampled Text2 with the original Y_train2.
sm = SMOTE(random_state=42, sampling_strategy=desired_samples)
Text2, labels2 = sm.fit_resample(Text2, Y_train2)

In [17]:
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention followed by a bidirectional LSTM.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization. The second residual sum adds the BiLSTM output to a tensor
    with `embed_dim` features, so `2 * lstm_dim` is expected to equal
    `embed_dim` (64 and 2 * 32 in this notebook).

    Parameters
    ----------
    embed_dim : int
        Feature size of the input; also used as the attention `key_dim`.
    num_heads : int
        Number of attention heads.
    lstm_dim : int
        Units of the LSTM in each direction.
    rate : float, default 0.2
        Dropout rate applied after the attention and after the BiLSTM.
    **kwargs
        Standard `Layer` arguments (`name`, `dtype`, `trainable`, ...). They
        must be accepted here because `get_config` includes them and the
        default `from_config` passes the whole config back to `__init__`.
    """

    def __init__(self, embed_dim, num_heads, lstm_dim, rate=0.2, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.lstm_dim = lstm_dim
        self.rate = rate

        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.blstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_dim, return_sequences=True))
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training = False):
        """Apply attention and BiLSTM sub-layers to `inputs` and return the result."""
        # Self-attention: the sequence attends to itself.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Recurrent sub-layer in place of the usual feed-forward network.
        blstm_output = self.blstm(out1)
        blstm_output = self.dropout2(blstm_output, training=training)
        return self.layernorm2(out1 + blstm_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'lstm_dim': self.lstm_dim,
            'rate': self.rate
        })
        return config

# Token ids -> embeddings -> attention/BiLSTM block.
inputs = Input(shape=(sequence_length,), dtype="int64")
x = Embedding(input_dim=max_features, output_dim=embedding_dim)(inputs)
transformerblock = TransformerBlock(embed_dim=embedding_dim, num_heads=2, lstm_dim=32)
x = transformerblock(x)

# Two parallel strided convolutions over the same sequence, each followed by
# max pooling; the branches are then concatenated.
conv1 = Conv1D(filters=128, kernel_size=4, strides=3, padding="valid", activation="relu")(x)
pool1 = MaxPool1D()(conv1)
conv2 = Conv1D(filters=64, kernel_size=3, strides=3, padding="valid", activation="relu")(x)
pool2 = MaxPool1D()(conv2)
concat = Concatenate()([pool1, pool2])

# Classification head: BiLSTM, per-step dense layer, then flatten into a
# single sigmoid output.
x = concat
for head_layer in (
    Bidirectional(LSTM(units=32, return_sequences=True)),
    Dropout(rate=0.1),
    Dense(units=128, activation="relu"),
    Dropout(rate=0.5),
    Flatten(),
):
    x = head_layer(x)
predictions = Dense(units=1, activation="sigmoid")(x)

model1 = Model(inputs=inputs, outputs=predictions)
model1.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 500)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 500, 64)      6400000     ['input_1[0][0]']                
                                                                                                  
 transformer_block (Transformer  (None, 500, 64)     58304       ['embedding[0][0]']              
 Block)                                                                                           
                                                                                                  
 conv1d (Conv1D)                (None, 166, 128)     32896       ['transformer_block[0][0]']  

In [18]:
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    NOTE: this definition reuses the name of the BiLSTM-based block defined in
    an earlier cell and replaces it in the kernel from this point on.

    Parameters
    ----------
    embed_dim : int
        Feature size of the input and output; also the attention `key_dim`.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Width of the hidden feed-forward layer.
    rate : float, default 0.2
        Dropout rate applied after the attention and after the feed-forward
        network.
    **kwargs
        Standard `Layer` arguments (`name`, `dtype`, `trainable`, ...). They
        must be accepted here because `get_config` includes them and the
        default `from_config` passes the whole config back to `__init__`.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.2, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Expand to ff_dim, then project back to embed_dim for the residual sum.
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim,)]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply attention and feed-forward sub-layers to `inputs`.

        `training` defaults to None so the layer can also be called directly
        without the flag; Keras supplies the value when it runs the model.
        """
        # Self-attention: the sequence attends to itself.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'rate': self.rate
        })
        return config
    
# Token ids -> embeddings -> attention/feed-forward block.
inputs = Input(shape=(sequence_length,), dtype="int64")
x = Embedding(input_dim=max_features, output_dim=embedding_dim)(inputs)
transformerblock = TransformerBlock(embed_dim=embedding_dim, num_heads=2, ff_dim=32)
x = transformerblock(x)

# Two parallel strided convolutions over the same sequence, each followed by
# max pooling; the branches are then concatenated.
conv1 = Conv1D(filters=128, kernel_size=4, strides=3, padding="valid", activation="relu")(x)
pool1 = MaxPool1D()(conv1)
conv2 = Conv1D(filters=64, kernel_size=3, strides=3, padding="valid", activation="relu")(x)
pool2 = MaxPool1D()(conv2)
concat = Concatenate()([pool1, pool2])

# Classification head: BiLSTM, per-step dense layer, then flatten into a
# single sigmoid output.
x = concat
for head_layer in (
    Bidirectional(LSTM(units=32, return_sequences=True)),
    Dropout(rate=0.1),
    Dense(units=128, activation="relu"),
    Dropout(rate=0.5),
    Flatten(),
):
    x = head_layer(x)
predictions = Dense(units=1, activation="sigmoid")(x)

model2 = Model(inputs=inputs, outputs=predictions)
model2.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 500)]        0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 500, 64)      6400000     ['input_2[0][0]']                
                                                                                                  
 transformer_block_1 (Transform  (None, 500, 64)     37664       ['embedding_1[0][0]']            
 erBlock)                                                                                         
                                                                                                  
 conv1d_2 (Conv1D)              (None, 166, 128)     32896       ['transformer_block_1[0][0]

In [30]:
# Hold out a shuffled, stratified 10% of domain 1 for validation.
# Keras' `validation_split` takes the *last* rows of the arrays without
# shuffling, and the domain-1 frame appears to be ordered by label (label 1
# first, label 0 last in the preview), so the previous split validated on a
# single class. Early stopping and checkpointing both depend on that split.
y_all1 = np.asarray(Y_train1)
X_fit1, X_val1, y_fit1, y_val1 = train_test_split(
    Text1, y_all1, test_size=0.1, stratify=y_all1, random_state=42
)

# Keep the weights of the epoch with the best validation loss, on disk and in
# memory, and stop after 3 epochs without improvement.
checkpoint_cb = ModelCheckpoint("model1.h5", save_best_only=True)
early_stopping_cb = EarlyStopping(patience=3, restore_best_weights=True)
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model1.fit(
    X_fit1,
    y_fit1,
    epochs=10,
    validation_data=(X_val1, y_val1),
    callbacks=[checkpoint_cb, early_stopping_cb],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x21fc696a530>

In [20]:
# Hold out a shuffled, stratified 10% of the resampled domain-2 data.
# Keras' `validation_split` takes the *last* rows of the arrays without
# shuffling. After oversampling, that tail is presumably made up of the
# synthetic label-1 rows appended by SMOTE (confirm), so the previous split
# validated on a single class. Early stopping and checkpointing both depend
# on that split.
y_all2 = np.asarray(labels2)
X_fit2, X_val2, y_fit2, y_val2 = train_test_split(
    Text2, y_all2, test_size=0.1, stratify=y_all2, random_state=42
)

# Keep the weights of the epoch with the best validation loss, on disk and in
# memory, and stop after 3 epochs without improvement.
checkpoint_cb = ModelCheckpoint("model2.h5", save_best_only=True)
early_stopping_cb = EarlyStopping(patience=3, restore_best_weights=True)
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model2.fit(
    X_fit2,
    y_fit2,
    epochs=10,
    validation_data=(X_val2, y_val2),
    callbacks=[checkpoint_cb, early_stopping_cb],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x21d4f24b7c0>

In [21]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
def dummy(text):
    """Identity function: pass the document on unchanged (no preprocessing)."""
    return text


def split_tokens(text):
    """Return the individual token ids of a document as a list of strings.

    The documents were joined into space-separated strings earlier in the
    notebook. Returning such a string unchanged from the tokenizer makes
    scikit-learn iterate over its characters, so the n-grams would be built
    from single digits and spaces instead of token ids.
    """
    if isinstance(text, str):
        return text.split()
    # Still a sequence of ids (the join cell was not run): stringify each one.
    return [str(token) for token in text]


# First pass: learn the n-gram vocabulary from the test text only.
vectorizer = TfidfVectorizer(ngram_range=(1, 7), sublinear_tf=True, analyzer = 'word',
    min_df = 0.001,
    tokenizer = split_tokens,
    preprocessor = dummy,
    token_pattern = None)

vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_

# Second pass: keep that vocabulary fixed and fit the IDF weights on the
# combined training text.
vectorizer = TfidfVectorizer(ngram_range=(1, 7), sublinear_tf=True, vocabulary=vocab,
                            min_df = 0.001,
                            analyzer = 'word',
                            tokenizer = split_tokens,
                            preprocessor = dummy,
                            token_pattern = None
                            )


tf_train = vectorizer.fit_transform(tokenized_texts_train)

tf_test = vectorizer.transform(tokenized_texts_test)

In [23]:
# TF-IDF features for the "domain 1 + domain-2 label-1" subset, produced with
# the vectorizer that was already fitted on the full training text.
tf_train1_2 = vectorizer.transform(raw_documents=train12_text)

In [24]:
# Seed shared by both linear models. SGD shuffles the training data, so a
# model without `random_state` gives different weights on every run.
SGD_SEED = 90051

# Linear model trained on every training row of both domains.
sgd_model_all = SGDClassifier(
    max_iter=5000,
    tol=1e-4,
    loss="modified_huber",
    random_state=SGD_SEED,
)
# Linear model trained on domain 1 plus the label-1 rows of domain 2.
sgd_model1_2 = SGDClassifier(
    max_iter=8000,
    tol=1e-5,
    loss="modified_huber",
    class_weight={0: 1, 1: 1},
    penalty='l2',
    random_state=SGD_SEED,
    alpha=0.000000001,
)
sgd_model_all.fit(tf_train, Y_train)
sgd_model1_2.fit(tf_train1_2, Y_train12)

In [25]:
# Source-domain tag (1 or 2) of every training row; target of the domain classifier.
train_domain = train.loc[:, 'domain']

In [26]:
# Domain classifier: predicts from the TF-IDF features which training domain
# a document belongs to.
lr = LogisticRegression()
lr.fit(X=tf_train, y=train_domain)

In [27]:
# Encode the test text once per domain, because each network was trained with
# its own vectorization layer.
test_text1, test_text2 = (
    encoder(tokenized_texts_test).numpy()
    for encoder in (vectorize_layer1, vectorize_layer2)
)

In [31]:
# Route every test sample to a model according to the domain classifier `lr`:
#   * confident domain 1 -> model1 (network trained on domain 1)
#   * confident domain 2 -> model2 (network trained on domain 2)
#   * not confident      -> weighted blend of the two SGD models
# Each model is called once on its whole group instead of once per sample,
# which avoids thousands of separate `predict` calls and their progress output.

# Probability the domain classifier must exceed before a domain-specific
# network is used.
DOMAIN_CONFIDENCE_THRESHOLD = 0.55
# Weight applied to sgd_model1_2's probabilities when blending the SGD models.
SGD_MODEL1_2_WEIGHT = 0.48

domain_pred = lr.predict_proba(tf_test)
max_prob = domain_pred.max(axis=1)
# Map the winning column to its actual class label rather than assuming that
# the labels are exactly column index + 1.
domain = lr.classes_[domain_pred.argmax(axis=1)]

confident = max_prob > DOMAIN_CONFIDENCE_THRESHOLD
rows_model1 = np.flatnonzero(confident & (domain == 1))
rows_model2 = np.flatnonzero(confident & (domain != 1))
rows_sgd = np.flatnonzero(~confident)

# All predictions are stored as integers (0 or 1) regardless of which model
# produced them.
final_predictions = np.zeros(tf_test.shape[0], dtype=int)

if rows_model1.size:
    scores1 = model1.predict(test_text1[rows_model1], verbose=0)
    final_predictions[rows_model1] = (scores1.reshape(-1) >= .5).astype(int)

if rows_model2.size:
    scores2 = model2.predict(test_text2[rows_model2], verbose=0)
    final_predictions[rows_model2] = (scores2.reshape(-1) >= .5).astype(int)

if rows_sgd.size:
    sample = tf_test[rows_sgd]
    prob1 = sgd_model_all.predict_proba(sample)
    prob2 = sgd_model1_2.predict_proba(sample)
    prob = (prob1 + prob2 * SGD_MODEL1_2_WEIGHT) / 2
    # Both SGD models were fitted on labels 0/1, so the column index is the label.
    final_predictions[rows_sgd] = np.argmax(prob, axis=1)

# Number of predictions produced (one per test row).
len(final_predictions)





















































4000

In [32]:
# Put the predictions into the `class` column of the sample submission and
# write the result next to the notebook.
sub = pd.read_csv('sample.csv').assign(**{'class': final_predictions})
sub.to_csv('submission.csv', index=False)