## **Preprocessing**

In [1]:
#**Preprocessing**

import json
import pandas as pd

def load_domain_from_json(path):
    domain = []
    with open(path, "r") as file:
        for instance in file:
            domain.append(json.loads(instance))
    # print(domain[0])

    domain_label = [instance["label"] for instance in domain]
    domain_id = [instance["id"] for instance in domain]
    domain_text = [instance["text"] for instance in domain]
    for i in range(len(domain_text)):
        text = list(map(str, domain_text[i]))
        domain_text[i] = " ".join(text)
    # print(domain_text[0])
    domain_df = pd.DataFrame({
        "id": domain_id,
        "text": domain_text,
        "label": domain_label
    })
    return domain_df

domain1 = load_domain_from_json("data/domain1_train_data.json")
domain2 = load_domain_from_json("data/domain2_train_data.json")
print(domain1.head())
print(domain2.head())

test_data = []
with open("data/test_data.json", "r") as file:
    for instance in file:
        test_data.append(json.loads(instance))
id = [instance["id"] for instance in test_data]
text = [instance["text"] for instance in test_data]
for i in range(len(text)):
    text_str = list(map(str, text[i]))
    text[i] = " ".join(text_str)
test_data_df = pd.DataFrame({
    "id": id,
    "text": text,
})
# print(test_data_df.head())
print(len(test_data_df))


   id                                               text  label
0   0  16 231 543 5 15 43 8282 94 231 1129 31 34 32 9...      1
1   1  16 4046 138 10 2 1809 2007 3763 14 40113 13 90...      1
2   2  1108 16550 3 6168 3 160 284 19 49 464 5333 8 4...      1
3   3  1802 27 16 25 48 451 632 3 2 2164 25 2380 34 7...      1
4   4  16 19 302 93 97 43 952 118 1 16 528 2 26528 10...      1
     id                                               text  label
0  5000  12 920 7 1266 28 9884 1640 116 11 1342 1533 28...      1
1  5001  783 397 253 5797 9379 22 793 11838 10 607 6324...      1
2  5002  888 14851 323 9 27 1377 584 195 3 137 10 2732 ...      1
3  5003  228 1161 5815 379 9 941 10 2 316 4 2693 594 87...      1
4  5004  736 19 37 813 45 6723 27 626 8 2 3446 4 564 34...      1
4000


## **Baseline(BOW + NaiveBayes)**

In [4]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer

combined_domain = pd.concat([domain1, domain2], ignore_index=True)
vectorizer_BOW = CountVectorizer()
X = combined_domain['text']
y = combined_domain['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train = vectorizer_BOW.fit_transform(X_train)
X_test = vectorizer_BOW.transform(X_test)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train, y_train)

y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Baseline Accuracy:", accuracy)

Baseline Accuracy: 0.6188888888888889


# **Undersampling**

In [8]:


# **Undersampling + BOW**

from sklearn.utils import resample

label_counts = domain2['label'].value_counts()
majority_label = label_counts[label_counts == label_counts.max()].index[0]
minority_label = label_counts[label_counts == label_counts.min()].index[0]

domain2_majority = domain2[domain2['label'] == majority_label]
domain2_minority = domain2[domain2['label'] == minority_label]

domain2_majority_underampled = resample(domain2_majority,
                                        replace=False,
                                        n_samples=len(domain2_minority),
                                        random_state=42)

domain2_undersampled = pd.concat([domain2_majority_underampled, domain2_minority])

print(domain2_undersampled['label'].value_counts())

from sklearn.feature_extraction.text import CountVectorizer

combined_data = pd.concat([domain1, domain2_undersampled], ignore_index=True)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(combined_data['text'])
y = combined_data['label']

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mnb_classifier = MultinomialNB()
mnb_classifier.fit(X_train, y_train)

y_pred = mnb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

## **Undersampling + TFIDF**

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(combined_data['text'])
y = combined_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# more models
#SVM
from sklearn.svm import SVC

svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)

#LR
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)

y_pred_lr = lr_classifier.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", accuracy_lr)

#XGB
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)
y_pred_xgb = xgb_classifier.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy:", accuracy_xgb)

#RF
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

0    1500
1    1500
Name: label, dtype: int64
Accuracy: 0.713125
Accuracy: 0.633125
SVM Accuracy: 0.778125
Logistic Regression Accuracy: 0.74625
XGBoost Accuracy: 0.73125
Random Forest Accuracy: 0.780625


## **Oversampling**

In [10]:

### SMOTE

from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.naive_bayes import BernoulliNB

combined_domain = pd.concat([domain1, domain2], ignore_index=True)
# print(combined_domain)
accuracy_df = pd.DataFrame(index=['SMOTE', 'ADASYN'], columns=['BOW', 'TFIDF'])

########### BOW
vectorizer_BOW = CountVectorizer()
X = combined_domain['text']
y = combined_domain['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train_BOW = vectorizer_BOW.fit_transform(X_train)
X_test_BOW = vectorizer_BOW.transform(X_test)

X_train_BOW_resampled_SMOTE, y_train_BOW_resampled_SMOTE = SMOTE(sampling_strategy='auto', random_state=35, k_neighbors=5, n_jobs=None).fit_resample(X_train_BOW, y_train)
# print(X_train_resampled.shape)
# print(y_train_resampled.shape)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_BOW_resampled_SMOTE, y_train_BOW_resampled_SMOTE)

y_pred_BOW_SMOTE = nb_classifier.predict(X_test_BOW)
accuracy = accuracy_score(y_test, y_pred_BOW_SMOTE)
accuracy_df.loc['SMOTE', 'BOW'] = accuracy

########## TFIDF

vectorizer_TFIDF = TfidfVectorizer()
X_train_TFIDF = vectorizer_TFIDF.fit_transform(X_train)
X_test_TFIDF = vectorizer_TFIDF.transform(X_test)
x = test_data_df["text"]
assert(len(x)==4000)
X_test_oversampling = vectorizer_TFIDF.transform(x)
print(X_test_oversampling.shape)

X_train_TFIDF_resampled_SMOTE, y_train_TFIDF_resampled_SMOTE = SMOTE(sampling_strategy='auto', random_state=35, k_neighbors=5, n_jobs=None).fit_resample(X_train_TFIDF, y_train)
# print(X_train_resampled.shape)
# print(y_train_resampled.shape)
nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_TFIDF_resampled_SMOTE, y_train_TFIDF_resampled_SMOTE)

y_pred_TFIDF_SMOTE = nb_classifier.predict(X_test_TFIDF)
accuracy = accuracy_score(y_test, y_pred_TFIDF_SMOTE)
accuracy_df.loc['SMOTE', 'TFIDF'] = accuracy


### ADASYN


from imblearn.over_sampling import SMOTE, ADASYN
X_train_TFIDF_resampled_ADA, y_train_TFIDF_resampled_ADA = ADASYN(sampling_strategy='auto', random_state=35, n_neighbors=5, n_jobs=None).fit_resample(X_train_TFIDF, y_train)
# print(X_train_resampled.shape)
# print(y_train_resampled.shape)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_TFIDF_resampled_ADA, y_train_TFIDF_resampled_ADA)

y_pred_TFIDF_ADA = nb_classifier.predict(X_test_TFIDF)
accuracy = accuracy_score(y_test, y_pred_TFIDF_ADA)
accuracy_df.loc['ADASYN', 'TFIDF'] = accuracy


X_train_BOW_resampled_ADA, y_train_BOW_resampled_ADA = ADASYN(sampling_strategy='auto', random_state=35, n_neighbors=5, n_jobs=None).fit_resample(X_train_BOW, y_train)
# print(X_train_resampled.shape)
# print(y_train_resampled.shape)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_BOW_resampled_ADA, y_train_BOW_resampled_ADA)

y_pred_BOW_ADA = nb_classifier.predict(X_test_BOW)
accuracy = accuracy_score(y_test, y_pred_BOW_ADA)
accuracy_df.loc['ADASYN', 'BOW'] = accuracy

print(accuracy_df)

svm_classifier = SVC()
svm_classifier.fit(X_train_TFIDF_resampled_ADA, y_train_TFIDF_resampled_ADA)

y_pred_svm_ADA = svm_classifier.predict(X_test_TFIDF)
accuracy_svm_ADA = accuracy_score(y_test, y_pred_svm_ADA)
print("SVM Accuracy:", accuracy_svm_ADA)

(4000, 73066)
             BOW     TFIDF
SMOTE   0.588611  0.776944
ADASYN  0.583611  0.776944
SVM Accuracy: 0.8591666666666666


## **Word2Vec**

In [16]:



data = pd.concat([domain1, domain2], ignore_index=True)

import nltk
import numpy as np

nltk.download('punkt')
sentences_list = [nltk.word_tokenize(text) for text in data['text']]
y = data['label'].values


from gensim.models import Word2Vec
model = Word2Vec(sentences_list, vector_size=300, window=5, min_count=1, workers=4)

def document_vector(model, doc):
    doc = [word for word in doc if word in model.wv.key_to_index]
    return np.mean(model.wv[doc], axis=0) if doc else np.zeros(model.vector_size)
    
test_sentences_list = [nltk.word_tokenize(text) for text in test_data_df['text']]
feature_vectors = np.array([document_vector(model, doc) for doc in sentences_list])
test_feature_vectors = np.array([document_vector(model, doc) for doc in test_sentences_list])
from sklearn.model_selection import train_test_split
X_train_W2V, X_test_W2V, y_train, y_test = train_test_split(feature_vectors, y, test_size=0.2, random_state=42)
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=35)
X_train_W2V_resampled, y_train_W2V_resampled = smote.fit_resample(X_train_W2V, y_train)
from sklearn.svm import SVC
svm_classifier = SVC()
svm_classifier.fit(X_train_W2V_resampled, y_train_W2V_resampled)
y_pred_W2V = svm_classifier.predict(X_test_W2V)
y_pred_test = svm_classifier.predict(test_feature_vectors)
accuracy_W2V = accuracy_score(y_test, y_pred_W2V)
print(accuracy_W2V)
results_csv = pd.DataFrame({
        "id": id,
        "class":y_pred_test,
    })
results_csv.to_csv('test_results.csv', index=False)

pandas.core.series.Series

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

tf.reset_default_graph()
batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 50000
numDimensions = 300
maxSeqLength = 250 #Maximum length of sentence
import tensorflow as tf
from random import randint
data = pd.concat([domain1, domain2], ignore_index=True)
max_length = 250
sequences_padded = pad_sequences(test_data_df['text'], maxlen=max_length, padding='post', truncating='post', value=0)

import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

train_data['text'] = train_data['text'].apply(lambda x: list(map(int, x.split())))


max_length = 250
sequences_padded = pad_sequences(train_data['text'], maxlen=max_length, padding='post', truncating='post', value=0)

indices = np.random.permutation(len(sequences_padded))
texts_shuffled = sequences_padded[indices]
labels_shuffled = train_data['label'].values[indices]

batch_size = 24
num_batches = len(texts_shuffled) // batch_size
text_batches = [texts_shuffled[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]
label_batches = [labels_shuffled[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]

if len(texts_shuffled) % batch_size != 0:
    extra_texts = texts_shuffled[num_batches * batch_size:]
    extra_labels = labels_shuffled[num_batches * batch_size:]
    text_batches.append(extra_texts)
    label_batches.append(extra_labels)
def get_random_batch(text_batches, label_batches):
    batch_index = np.random.randint(len(text_batches))
    return text_batches[batch_index], label_batches[batch_index]


test_data ['text'] = test_data ['text'].apply(lambda x: list(map(int, x.split())))


max_length = 250
sequences_padded1 = pad_sequences(test_data ['text'], maxlen=max_length, padding='post', truncating='post', value=0)

indices1 = np.random.permutation(len(sequences_padded1))
texts_shuffled1 = sequences_padded[indices1]
labels_shuffled1 = train_data['label'].values[indices1]

batch_size = 24
num_batches1 = len(texts_shuffled1) // batch_size
text_batches1 = [texts_shuffled1[i * batch_size:(i + 1) * batch_size] for i in range(num_batches1)]
label_batches1 = [labels_shuffled1[i * batch_size:(i + 1) * batch_size] for i in range(num_batches1)]

if len(texts_shuffled1) % batch_size != 0:
    extra_texts1 = texts_shuffled1[num_batches1 * batch_size:]
    extra_labels1 = labels_shuffled1[num_batches1 * batch_size:]
    text_batches1.append(extra_texts1)
    label_batches1.append(extra_labels1)
def get_test_batch(text_batches1, label_batches1):
    batch_index1 = np.random.randint(len(text_batches1))
    return text_batches1[batch_index1], label_batches1[batch_index1]

import numpy as np
import gensim
import numpy as np

batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 20001
numDimensions = 300
maxSeqLength = 250  # 

embedding_matrix = model.wv.vectors
embedding_tensor = tf.Variable(initial_value=embedding_matrix, trainable=False, dtype=tf.float32)


input_data = tf.placeholder(tf.int32, [None, maxSeqLength])
labels = tf.placeholder(tf.float32, [None, numClasses])


data = tf.nn.embedding_lookup(embedding_tensor, input_data)


lstmCell = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, reuse=tf.AUTO_REUSE)
lstmCell = tf.nn.rnn_cell.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
outputs, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)


weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(outputs, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = tf.matmul(last, weight) + bias
correctPred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)




sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
for i in range(iterations):
    nextBatch, nextBatchLabels = get_random_batch(text_batches, label_batches)
    
    # Convert labels to one-hot encoding
    nextBatchLabels_one_hot = tf.one_hot(nextBatchLabels, depth=numClasses)
    nextBatchLabels_one_hot = sess.run(nextBatchLabels_one_hot)  # evaluate the one-hot tensor
    
    # Run the optimization
    sess.run(optimizer, feed_dict={input_data: nextBatch, labels: nextBatchLabels_one_hot})
    
    if (i % 1000 == 0 and i != 0):
        loss_ = sess.run(loss, feed_dict={input_data: nextBatch, labels: nextBatchLabels_one_hot})
        accuracy_ = sess.run(accuracy, feed_dict={input_data: nextBatch, labels: nextBatchLabels_one_hot})
        
        print("iteration {}/{}...".format(i+1, iterations),
              "loss {}...".format(loss_),
              "accuracy {}...".format(accuracy_))
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)

In [None]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('models'))
iterations = 10
for i in range(iterations):
    nextBatch, nextBatchLabels = get_test_batch(text_batches1, label_batches1)
    nextBatchLabels_one_hot = np.eye(numClasses)[nextBatchLabels]
    print("Accuracy for this batch:", (sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels_one_hot})) * 100)

predicted_probabilities = sess.run(prediction, feed_dict={input_data: sequences_padded0})
predicted_labels = np.argmax(predicted_probabilities, axis=1)



print(len(predicted_labels))
id_list = range(len(predicted_labels))
result_df = pd.DataFrame({"id": id_list, "predicted_labels": predicted_labels})
result_df.to_csv("predicted_results.csv", index=False)


## **DaNN**

In [83]:
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function

class Extractor(nn.Module):
    def __init__(self, feature_dim=len(vectorizer.vocabulary_)):
        super(Extractor, self).__init__()
        self.extractor = nn.Sequential(
            nn.Linear(in_features=feature_dim, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=128),
            nn.ReLU(),
            nn.Dropout(0.5)
        )

    def forward(self, x):
        x = x.float()
        x = self.extractor(x)
        return x


class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=128, out_features=32),
            nn.ReLU(),
            nn.Linear(in_features=32, out_features=1),
            
        )

    def forward(self, x):
        x = self.classifier(x)
        return x.squeeze()


class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.discriminator = nn.Sequential(
            nn.Linear(in_features=128, out_features=16),
            nn.ReLU(),
            nn.Linear(in_features=16, out_features=2),
            nn.Softmax(dim=1)
        )

    def forward(self, input_feature, alpha):
        reversed_input = ReverseLayerF.apply(input_feature, alpha)
        x = self.discriminator(reversed_input)
        return x

class ReverseLayerF(Function):

    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha

        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha

        return output, None


In [None]:
import torch
import numpy as np
import torch.optim as optim
import torch.nn as nn

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(domain1['text'], domain1['label'].values, test_size=0.2, random_state=42, stratify=domain1['label'].values)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(domain2['text'], domain2['label'].values, test_size=0.2, random_state=42, stratify=domain2['label'].values)

combined_X_train = pd.concat([X_train_1, X_train_2])
vectorizer = TfidfVectorizer()
vectorizer.fit(combined_X_train)

X_train_1_TFIDF = vectorizer.transform(X_train_1)
X_train_2_TFIDF = vectorizer.transform(X_train_2)

X_train_2_TFIDF, y_train_2 = ADASYN(sampling_strategy='auto', random_state=35, n_neighbors=5, n_jobs=None).fit_resample(X_train_2_TFIDF, y_train_2)

X_train_1 = torch.from_numpy(X_train_1_TFIDF.toarray())
y_train_1 = torch.from_numpy(y_train_1).to(dtype=torch.float)
X_train_2 = torch.from_numpy(X_train_2_TFIDF.toarray())
y_train_2 = torch.from_numpy(y_train_2).to(dtype=torch.float)

# Create iterable dataset in Torch format
train_1_ds = torch.utils.data.TensorDataset(X_train_1, y_train_1)
train_1_loader = torch.utils.data.DataLoader(train_1_ds, batch_size=32)
train_2_ds = torch.utils.data.TensorDataset(X_train_2, y_train_2)
train_2_loader = torch.utils.data.DataLoader(train_2_ds, batch_size=32)


feature_extractor = Extractor(feature_dim=len(vectorizer.vocabulary_))
label_classifier = Classifier()
domain_classifier = Discriminator()

label_classification_criterion = nn.CrossEntropyLoss()
domain_classification_criterion = nn.CrossEntropyLoss()

DaNN_params = list(feature_extractor.parameters()) + list(label_classifier.parameters()) + list(domain_classifier.parameters())
DaNN_optimizer = optim.Adam(DaNN_params, lr=0.001)

alpha = 0.5
n_epochs = 100
for epoch in range(n_epochs):
    
    feature_extractor.train()
    label_classifier.train()
    domain_classifier.train()
    for i, (data_1, data_2) in enumerate(zip(train_1_loader, train_2_loader)):
        text_1, label_1 = data_1
        text_2, label_2 = data_2
        combined_text = torch.cat((text_1, text_2), 0)
        
        feature_1 = feature_extractor(text_1)
        feature_2 = feature_extractor(text_2)
        combined_feature = feature_extractor(combined_text)
        
        label_prediction_1 = label_classifier(feature_1)
        label_prediction_2 = label_classifier(feature_2)
        # print(label_prediction_1)
        # print(label_1)
        label_loss_1 = label_classification_criterion(label_prediction_1, label_1)
        label_loss_2 = label_classification_criterion(label_prediction_2, label_2)

        domain_prediction = domain_classifier(combined_feature, alpha)
        domain_combined_label = torch.cat((torch.zeros(text_1.shape[0]).long(), torch.ones(text_2.shape[0]).long()), 0)

        domain_loss = domain_classification_criterion(domain_prediction, domain_combined_label)
        
        total_loss = label_loss_1 + label_loss_2 + domain_loss
        DaNN_optimizer.zero_grad()
        total_loss.backward()
        DaNN_optimizer.step()
        
    
    print(f"Epoch: {epoch+1}, Total Loss: {total_loss.item()}, Label Loss: {(label_loss_1+label_loss_2).item()}, Domain Loss: {domain_loss.item()}")



Epoch: 1, Total Loss: 42.52740478515625, Label Loss: 41.89863586425781, Domain Loss: 0.6287685632705688


## **Test Result**

In [29]:
def get_predict_csv(clfs, X_test, id):
    
    prediction = clfs.predict(X_test)
    results_csv = pd.DataFrame({
        "id": id,
        "class": prediction,
    })
    return results_csv

test_results = get_predict_csv(svm_classifier, X_test_oversampling, test_data_df["id"])
print(test_results)
test_results.to_csv('test_results.csv', index=False)

        id  class
0        0      1
1        1      0
2        2      0
3        3      0
4        4      0
...    ...    ...
3995  3995      0
3996  3996      0
3997  3997      0
3998  3998      0
3999  3999      1

[4000 rows x 2 columns]


In [7]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [80]:
import torch
import numpy as np
import torch.optim as optim
import torch.nn as nn

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(domain1['text'], domain1['label'].values, test_size=0.2, random_state=42, stratify=domain1['label'].values)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(domain2['text'], domain2['label'].values, test_size=0.2, random_state=42, stratify=domain2['label'].values)

combined_X_train = pd.concat([X_train_1, X_train_2])
vectorizer = TfidfVectorizer()
vectorizer.fit(combined_X_train)

X_train_1_TFIDF = vectorizer.transform(X_train_1)
X_train_2_TFIDF = vectorizer.transform(X_train_2)

X_train_2_TFIDF, y_train_2 = ADASYN(sampling_strategy='auto', random_state=35, n_neighbors=5, n_jobs=None).fit_resample(X_train_2_TFIDF, y_train_2)

X_train_1 = torch.from_numpy(X_train_1_TFIDF.toarray()).to(dtype=torch.float)
y_train_1 = torch.from_numpy(y_train_1).to(dtype=torch.float)
X_train_2 = torch.from_numpy(X_train_2_TFIDF.toarray()).to(dtype=torch.float)
y_train_2 = torch.from_numpy(y_train_2).to(dtype=torch.float)

# Create iterable dataset in Torch format
train_1_ds = torch.utils.data.TensorDataset(X_train_1, y_train_1)
train_1_loader = torch.utils.data.DataLoader(train_1_ds, batch_size=32)
train_2_ds = torch.utils.data.TensorDataset(X_train_2, y_train_2)
train_2_loader = torch.utils.data.DataLoader(train_2_ds, batch_size=32)


feature_extractor = Extractor(feature_dim=len(vectorizer.vocabulary_))
label_classifier = Classifier()
domain_classifier = Discriminator()

label_classification_criterion = nn.CrossEntropyLoss()
domain_classification_criterion = nn.CrossEntropyLoss()

DaNN_params = list(feature_extractor.parameters()) + list(label_classifier.parameters()) + list(domain_classifier.parameters())
DaNN_optimizer = optim.Adam(DaNN_params, lr=0.001)

alpha = 0.5
n_epochs = 100
for epoch in range(n_epochs):
    print(f"Epoch: {epoch}")
    
    feature_extractor.train()
    label_classifier.train()
    domain_classifier.train()

    feature_1 = feature_extractor(X_train_1)
    feature_2 = feature_extractor(X_train_2)
    X_train_combined = torch.cat((X_train_1, X_train_2), 0)
    combined_feature = feature_extractor(X_train_combined)

    label_prediction_1 = label_classifier(feature_1)
    label_prediction_2 = label_classifier(feature_2)
    label_loss_1 = label_classification_criterion(label_prediction_1, y_train_1)
    label_loss_2 = label_classification_criterion(label_prediction_2, y_train_2)
    
    domain_prediction = domain_classifier(combined_feature, alpha)
    domain_combined_label = torch.cat((torch.zeros(X_train_1.shape[0]).long(), torch.ones(X_train_2.shape[0]).long()), 0)
    domain_loss = domain_classification_criterion(domain_prediction, domain_combined_label)
        
    total_loss = label_loss_1 + label_loss_2 + domain_loss
    DaNN_optimizer.zero_grad()
    total_loss.backward()
    DaNN_optimizer.step()
        
    if (epoch+1) % 10 == 0:
        print(f"Epoch: {epoch+1}, Total Loss: {total_loss.item()}, Label Loss: {(label_loss_1+label_loss_2).item()}, Domain Loss: {domain_loss.item()}")

Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10, Total Loss: 108765.1796875, Label Loss: 108764.5390625, Domain Loss: 0.6370952129364014
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20, Total Loss: 104948.5234375, Label Loss: 104947.953125, Domain Loss: 0.573634684085846
Epoch: 20
Epoch: 21
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30, Total Loss: 102620.3515625, Label Loss: 102619.859375, Domain Loss: 0.49597883224487305
Epoch: 30
Epoch: 31
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40, Total Loss: 102241.40625, Label Loss: 102240.921875, Domain Loss: 0.4878908395767212
Epoch: 40
Epoch: 41
Epoch: 42
Epoch: 43
Epoch: 44


KeyboardInterrupt: 