## **Preprocessing**

In [1]:
#**Preprocessing**

import json
import pandas as pd

def load_domain_from_json(path):
    domain = []
    with open(path, "r") as file:
        for instance in file:
            domain.append(json.loads(instance))
    # print(domain[0])

    domain_label = [instance["label"] for instance in domain]
    domain_id = [instance["id"] for instance in domain]
    domain_text = [instance["text"] for instance in domain]
    for i in range(len(domain_text)):
        text = list(map(str, domain_text[i]))
        domain_text[i] = " ".join(text)
    # print(domain_text[0])
    domain_df = pd.DataFrame({
        "id": domain_id,
        "text": domain_text,
        "label": domain_label
    })
    return domain_df

domain1 = load_domain_from_json("data/domain1_train_data.json")
domain2 = load_domain_from_json("data/domain2_train_data.json")
print(domain1.head())
print(domain2.head())

test_data = []
with open("data/test_data.json", "r") as file:
    for instance in file:
        test_data.append(json.loads(instance))
id = [instance["id"] for instance in test_data]
text = [instance["text"] for instance in test_data]
for i in range(len(text)):
    text_str = list(map(str, text[i]))
    text[i] = " ".join(text_str)
test_data_df = pd.DataFrame({
    "id": id,
    "text": text,
})
print(test_data_df.head())
# print(len(test_data_df))


   id                                               text  label
0   0  16 231 543 5 15 43 8282 94 231 1129 31 34 32 9...      1
1   1  16 4046 138 10 2 1809 2007 3763 14 40113 13 90...      1
2   2  1108 16550 3 6168 3 160 284 19 49 464 5333 8 4...      1
3   3  1802 27 16 25 48 451 632 3 2 2164 25 2380 34 7...      1
4   4  16 19 302 93 97 43 952 118 1 16 528 2 26528 10...      1
     id                                               text  label
0  5000  12 920 7 1266 28 9884 1640 116 11 1342 1533 28...      1
1  5001  783 397 253 5797 9379 22 793 11838 10 607 6324...      1
2  5002  888 14851 323 9 27 1377 584 195 3 137 10 2732 ...      1
3  5003  228 1161 5815 379 9 941 10 2 316 4 2693 594 87...      1
4  5004  736 19 37 813 45 6723 27 626 8 2 3446 4 564 34...      1
   id                                               text
0   0  0 10839 1083 2881 12159 2356 0 3 1426 18 21776...
1   1  12 858 7 1179 944 1485 10 2 4532 12245 499 254...
2   2  12 155 8 15 71 7 183 29 3884 2 654 2162 4 

In [2]:
import pandas as pd
import nltk
import os
from gensim.models import Word2Vec

sentences_list = []
for text in domain1['text'].tolist() + domain2['text'].tolist():
    sentences_list.append([str(num) for num in text])

num_features = 300  
min_word_count = 40   
num_workers = 4
context = 10

# 训练模型
model = Word2Vec(sentences_list, workers=num_workers, vector_size=num_features, min_count=min_word_count, window=context)

# 保存模型
model_directory = '../models'
model_filename = 'word2vec_model'
full_path = os.path.join(model_directory, model_filename)
if not os.path.exists(model_directory):
    os.makedirs(model_directory)
model.save(full_path)
print(f"Model will be saved to: {full_path}")

Model will be saved to: ../models/word2vec_model


In [3]:
from gensim.models import Word2Vec

# 加载模型
model = Word2Vec.load("../models/word2vec_model")

# 查询词向量
vector = model.wv["4"]

# 打印词向量
print(vector)


[ 1.33269839e-02  1.14546455e-02 -3.14167365e-02 -5.50579987e-02
  6.58401800e-03  5.24431292e-04  2.97510978e-02 -1.68141443e-02
  1.16222037e-03 -1.84915736e-02  4.40486036e-02 -4.46196422e-02
  2.89697275e-02 -5.91346771e-02  7.72717297e-02  7.22385272e-02
  4.38336991e-02 -2.90793902e-03 -4.53712866e-02 -1.72197504e-03
  3.18829380e-02  3.13817896e-02  7.85139799e-02  1.74378276e-01
 -4.63725701e-02 -3.99334468e-02 -1.07455291e-02  1.28525989e-02
 -9.77565348e-03 -1.87990144e-02  1.93790682e-02 -4.18375770e-04
 -6.61442475e-03 -5.04643284e-02  3.36600579e-02 -3.90937626e-02
  6.35188892e-02 -3.52367200e-02 -4.31952393e-03  1.45692180e-03
 -4.08509225e-02 -8.16713367e-03 -1.07330130e-02 -3.99538726e-02
  1.06108626e-02  1.41397193e-02 -3.76557335e-02 -1.06225279e-03
  7.51047730e-02  4.57606018e-02 -2.36028340e-02 -2.59279720e-02
 -5.02808541e-02 -2.92668422e-03 -4.91114380e-03  1.23294005e-02
  5.65096028e-02  3.96795711e-03  1.39909219e-02  5.01996279e-02
  9.98333842e-03  2.73952

In [4]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

label_counts = domain2['label'].value_counts()
majority_label = label_counts[label_counts == label_counts.max()].index[0]
minority_label = label_counts[label_counts == label_counts.min()].index[0]

domain2_majority = domain2[domain2['label'] == majority_label]
domain2_minority = domain2[domain2['label'] == minority_label]

domain2_majority_underampled = resample(domain2_majority,
                                        replace=False,
                                        n_samples=len(domain2_minority),
                                        random_state=42)

domain2_undersampled = pd.concat([domain2_majority_underampled, domain2_minority])

print(domain2_undersampled['label'].value_counts())

combined_data = pd.concat([domain1, domain2_undersampled], ignore_index=True)

X = combined_data['text']
y = combined_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train data shape: {X_train.shape}")

label
0    1500
1    1500
Name: count, dtype: int64
Train data shape: (6400,)


In [5]:
w2v_model = Word2Vec.load("../models/word2vec_model")
from sklearn.model_selection import train_test_split


def text_to_sequences(text_data, word2vec_model):
    sequences = []
    for text in text_data:
        words = text.split()  # 如果你的文本数据是以空格分隔的单词，请根据实际情况选择分词方法
        word_vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
        sequences.append(word_vectors)
    return sequences

# 将训练集和测试集转换为词向量序列
X_train_sequences = text_to_sequences(X_train, w2v_model)
X_test_sequences = text_to_sequences(X_test, w2v_model)

In [6]:
#w2v into LSTM
#已经是X_train, X_test
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score

def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(units=64, input_shape=input_shape, return_sequences=True))
    model.add(Dropout(rate=0.2))
    model.add(LSTM(units=32, return_sequences=False))
    model.add(Dropout(rate=0.2))
    model.add(Dense(units=1, activation='sigmoid'))
    optimizer = Adam(lr=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# # Assuming X_train, X_test, y_train, y_test are already prepared

# # Define the maximum number of features for BOW
# max_features = 10000

# # Create and compile the model
# model = create_lstm_model(X_train.shape[1:], max_features)
# history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# # Evaluate the model
# y_pred = model.predict(X_test)
# y_pred_binary = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions
# accuracy = accuracy_score(y_test, y_pred_binary)
# print("LSTM Accuracy:", accuracy)

# 获取词向量的维度
embedding_dim = w2v_model.vector_size

# 获取LSTM模型的输入形状
input_shape = (None, embedding_dim)  # 使用 None 表示可变长度的序列


from keras.callbacks import Callback

class CustomCallback(Callback):
    def on_epoch_end(self, epoch, logs=None):
        print("Epoch:", epoch+1)
        print(" - Loss:", logs['loss'])
        print(" - Accuracy:", logs['accuracy'])

# 创建模型
lstm_model = create_lstm_model(input_shape)

# 定义回调函数
custom_callback = CustomCallback()

# 训练模型，并且使用回调函数
# 训练模型
history = lstm_model.fit(np.array(X_train_sequences), y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[custom_callback])

# 评估模型
y_pred = lstm_model.predict(np.array(X_test_sequences))
y_pred_binary = (y_pred > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred_binary)
print("LSTM Accuracy:", accuracy)

In [None]:

# 可能需要对序列进行填充



## **Baseline(BOW + NaiveBayes)**

In [2]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

combined_domain = pd.concat([domain1, domain2], ignore_index=True)
vectorizer_BOW = CountVectorizer()
X = combined_domain['text']
y = combined_domain['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train = vectorizer_BOW.fit_transform(X_train)
X_test = vectorizer_BOW.transform(X_test)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train, y_train)

y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Baseline Accuracy:", accuracy)

Baseline Accuracy: 0.6188888888888889


# **Undersampling**

In [8]:


# **Undersampling + BOW**

from sklearn.utils import resample

label_counts = domain2['label'].value_counts()
majority_label = label_counts[label_counts == label_counts.max()].index[0]
minority_label = label_counts[label_counts == label_counts.min()].index[0]

domain2_majority = domain2[domain2['label'] == majority_label]
domain2_minority = domain2[domain2['label'] == minority_label]

domain2_majority_underampled = resample(domain2_majority,
                                        replace=False,
                                        n_samples=len(domain2_minority),
                                        random_state=42)

domain2_undersampled = pd.concat([domain2_majority_underampled, domain2_minority])

print(domain2_undersampled['label'].value_counts())

from sklearn.feature_extraction.text import CountVectorizer

combined_data = pd.concat([domain1, domain2_undersampled], ignore_index=True)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(combined_data['text'])
y = combined_data['label']

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mnb_classifier = MultinomialNB()
mnb_classifier.fit(X_train, y_train)

y_pred = mnb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

## **Undersampling + TFIDF**

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(combined_data['text'])
y = combined_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# more models
#SVM
from sklearn.svm import SVC

svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)

#LR
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)

y_pred_lr = lr_classifier.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", accuracy_lr)

#XGB
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)
y_pred_xgb = xgb_classifier.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy:", accuracy_xgb)

#RF
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

0    1500
1    1500
Name: label, dtype: int64
Accuracy: 0.713125
Accuracy: 0.633125
SVM Accuracy: 0.778125
Logistic Regression Accuracy: 0.74625
XGBoost Accuracy: 0.73125
Random Forest Accuracy: 0.780625


## **Oversampling**

In [10]:

### SMOTE

from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.naive_bayes import BernoulliNB

combined_domain = pd.concat([domain1, domain2], ignore_index=True)
# print(combined_domain)
accuracy_df = pd.DataFrame(index=['SMOTE', 'ADASYN'], columns=['BOW', 'TFIDF'])

########### BOW
vectorizer_BOW = CountVectorizer()
X = combined_domain['text']
y = combined_domain['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train_BOW = vectorizer_BOW.fit_transform(X_train)
X_test_BOW = vectorizer_BOW.transform(X_test)

X_train_BOW_resampled_SMOTE, y_train_BOW_resampled_SMOTE = SMOTE(sampling_strategy='auto', random_state=35, k_neighbors=5, n_jobs=None).fit_resample(X_train_BOW, y_train)
# print(X_train_resampled.shape)
# print(y_train_resampled.shape)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_BOW_resampled_SMOTE, y_train_BOW_resampled_SMOTE)

y_pred_BOW_SMOTE = nb_classifier.predict(X_test_BOW)
accuracy = accuracy_score(y_test, y_pred_BOW_SMOTE)
accuracy_df.loc['SMOTE', 'BOW'] = accuracy

########## TFIDF

vectorizer_TFIDF = TfidfVectorizer()
X_train_TFIDF = vectorizer_TFIDF.fit_transform(X_train)
X_test_TFIDF = vectorizer_TFIDF.transform(X_test)
x = test_data_df["text"]
assert(len(x)==4000)
X_test_oversampling = vectorizer_TFIDF.transform(x)
print(X_test_oversampling.shape)

X_train_TFIDF_resampled_SMOTE, y_train_TFIDF_resampled_SMOTE = SMOTE(sampling_strategy='auto', random_state=35, k_neighbors=5, n_jobs=None).fit_resample(X_train_TFIDF, y_train)
# print(X_train_resampled.shape)
# print(y_train_resampled.shape)
nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_TFIDF_resampled_SMOTE, y_train_TFIDF_resampled_SMOTE)

y_pred_TFIDF_SMOTE = nb_classifier.predict(X_test_TFIDF)
accuracy = accuracy_score(y_test, y_pred_TFIDF_SMOTE)
accuracy_df.loc['SMOTE', 'TFIDF'] = accuracy


### ADASYN


from imblearn.over_sampling import SMOTE, ADASYN
X_train_TFIDF_resampled_ADA, y_train_TFIDF_resampled_ADA = ADASYN(sampling_strategy='auto', random_state=35, n_neighbors=5, n_jobs=None).fit_resample(X_train_TFIDF, y_train)
# print(X_train_resampled.shape)
# print(y_train_resampled.shape)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_TFIDF_resampled_ADA, y_train_TFIDF_resampled_ADA)

y_pred_TFIDF_ADA = nb_classifier.predict(X_test_TFIDF)
accuracy = accuracy_score(y_test, y_pred_TFIDF_ADA)
accuracy_df.loc['ADASYN', 'TFIDF'] = accuracy


X_train_BOW_resampled_ADA, y_train_BOW_resampled_ADA = ADASYN(sampling_strategy='auto', random_state=35, n_neighbors=5, n_jobs=None).fit_resample(X_train_BOW, y_train)
# print(X_train_resampled.shape)
# print(y_train_resampled.shape)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_BOW_resampled_ADA, y_train_BOW_resampled_ADA)

y_pred_BOW_ADA = nb_classifier.predict(X_test_BOW)
accuracy = accuracy_score(y_test, y_pred_BOW_ADA)
accuracy_df.loc['ADASYN', 'BOW'] = accuracy

print(accuracy_df)

svm_classifier = SVC()
svm_classifier.fit(X_train_TFIDF_resampled_ADA, y_train_TFIDF_resampled_ADA)

y_pred_svm_ADA = svm_classifier.predict(X_test_TFIDF)
accuracy_svm_ADA = accuracy_score(y_test, y_pred_svm_ADA)
print("SVM Accuracy:", accuracy_svm_ADA)

(4000, 73066)
             BOW     TFIDF
SMOTE   0.588611  0.776944
ADASYN  0.583611  0.776944
SVM Accuracy: 0.8591666666666666


## **Word2Vec**

pandas.core.series.Series

## **DaNN**

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function

class Extractor(nn.Module):
    def __init__(self):
        super(Extractor, self).__init__()
        self.extractor = nn.Sequential(
            nn.Linear(in_features=len(vectorizer_TFIDF.vocabulary_), out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=128),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.extractor(x)
        x = x.view(-1, 3 * 28 * 28)
        return x


class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=128, out_features=32),
            nn.ReLU(),
            nn.Linear(in_features=32, out_features=1),
            nn.sigmoid()
        )

    def forward(self, x):
        x = self.classifier(x)
        return x


class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.discriminator = nn.Sequential(
            nn.Linear(in_features=128, out_features=16),
            nn.ReLU(),
            nn.Linear(in_features=16, out_features=2)
            nn.Softmax(dim=1)
        )

    def forward(self, input_feature, alpha):
        reversed_input = ReverseLayerF.apply(input_feature, alpha)
        x = self.discriminator(reversed_input)
        return x

class ReverseLayerF(Function):

    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha

        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha

        return output, None


In [None]:
import torch
import numpy as np
import utils
import torch.optim as optim
import torch.nn as nn

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(domain1['text'], domain1['label'].values, test_size=0.2, random_state=42, stratify=domain1['label'].values)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(domain2['text'], domain2['label'].values, test_size=0.2, random_state=42, stratify=domain2['label'].values)

combined_X_train = pd.concat([X_train_1, X_train_2])
vectorizer = TfidfVectorizer()
vectorizer.fit(combined_X_train)

X_train_1_TFIDF = vectorizer.transform(X_train_1)
X_train_2_TFIDF = vectorizer.transform(X_train_2)

X_train_2_TFIDF, y_train_2 = ADASYN(sampling_strategy='auto', random_state=35, n_neighbors=5, n_jobs=None).fit_resample(X_train_2, y_train_2)

X_train_1 = torch.from_numpy(X_train_1_TFIDF.toarray())
y_train_1 = torch.from_numpy(y_train_1)
X_train_2 = torch.from_numpy(X_train_2_TFIDF.toarray())
y_train_2 = torch.from_numpy(y_train_2)

# Create iterable dataset in Torch format
train_1_ds = torch.utils.data.TensorDataset(X_train_1, y_train_1)
train_1_loader = torch.utils.data.DataLoader(train_1_ds, batch_size=32)
train_2_ds = torch.utils.data.TensorDataset(X_train_2, y_train_2)
train_2_loader = torch.utils.data.DataLoader(train_2_ds, batch_size=32)


feature_extractor = Extractor()
label_classifier = Classifier()
domain_classifier = Discriminator()

label_classification_criterion = nn.CrossEntropyLoss()
domain_classification_criterion = nn.CrossEntropyLoss()

DaNN_params = list(feature_extractor.paramters()) + list(label_classifier.paramters()) + list(domain_classifier.parameters())
DaNN_optimizer = optim.Adam(DaNN_params, lr=0.001)

n_epochs = 100
for epoch in range(n_epochs):
    print(f"Epoch: {epoch}")
    
    feature_extractor.train()
    label_classifier.train()
    domain_classifier.train()
    for i, (data1, data2) in enumerate(zip(train_1_loader, train_2_loader)):
        



## **Test Result**

In [29]:
def get_predict_csv(clfs, X_test, id):
    
    prediction = clfs.predict(X_test)
    results_csv = pd.DataFrame({
        "id": id,
        "class": prediction,
    })
    return results_csv

test_results = get_predict_csv(svm_classifier, X_test_oversampling, test_data_df["id"])
print(test_results)
test_results.to_csv('test_results.csv', index=False)

        id  class
0        0      1
1        1      0
2        2      0
3        3      0
4        4      0
...    ...    ...
3995  3995      0
3996  3996      0
3997  3997      0
3998  3998      0
3999  3999      1

[4000 rows x 2 columns]


In [7]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [45]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(domain1['text'], domain1['label'].values, test_size=0.2, random_state=42, stratify=domain1['label'].values)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(domain2['text'], domain2['label'].values, test_size=0.2, random_state=42, stratify=domain2['label'].values)

combined_X_train = pd.concat([X_train_1, X_train_2])
vectorizer = TfidfVectorizer()
vectorizer.fit(combined_X_train)

X_train_1_TFIDF = vectorizer.transform(X_train_1)
X_train_2_TFIDF = vectorizer.transform(X_train_2)

X_train_2_TFIDF, y_train_2 = ADASYN(sampling_strategy='auto', random_state=35, n_neighbors=5, n_jobs=None).fit_resample(X_train_2_TFIDF, y_train_2)

X_train_1 = torch.from_numpy(X_train_1_TFIDF.toarray())
y_train_1 = torch.from_numpy(y_train_1)
X_train_2 = torch.from_numpy(X_train_2_TFIDF.toarray())
y_train_2 = torch.from_numpy(y_train_2)

# Create iterable dataset in Torch format
train_1_ds = torch.utils.data.TensorDataset(X_train_1, y_train_1)
train_1_loader = torch.utils.data.DataLoader(train_1_ds, batch_size=32)
train_2_ds = torch.utils.data.TensorDataset(X_train_2, y_train_2)
train_2_loader = torch.utils.data.DataLoader(train_2_ds, batch_size=32)

print(len(train_1_loader))
print(len(train_2_loader))


n_epochs = 10
for epoch in range(n_epochs):
    print(f"Epoch: {epoch}")
    for i, (data1, data2) in enumerate(zip(train_1_loader, train_2_loader)):
        print("Batch id: ", i)
        print("data1: ", data1)
        print("data2: ", data2)
        print("------------------------\n")

125
584
Epoch: 0
Batch id:  0
data1:  [tensor([[0.0461, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0780, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0698, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0449, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0387, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       dtype=torch.float64), tensor([1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
        0, 1, 1, 1, 0, 0, 1, 1])]
data2:  [tensor([[0.0322, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0258, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0070, 0.0000, 0.0258,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0233, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0479, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0207, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       dtype=