In [2]:
import pandas as pd
import random
import math
import matplotlib.pyplot as plt
import numpy as np
from statistics import mean

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim


In [3]:
seed = 12
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [13]:
# load datasets
domain1 = pd.read_json('data/domain1_train.json', lines=True)
domain2 = pd.read_json('data/domain2_train.json', lines=True)
test = pd.read_json('data/test_set.json', lines=True)

In [11]:
# random sample without replacement from label 0
# label_0_rate: the rate of label 0 in the sampled dataframe, can be any number

def under_sample(df, label_0_rate):

    sampled_label_1 = df[df['label'] == 1]
    label_1_count = len(df[df['label'] == 1])

    #calculate the number of label 0 samples
    sampled_label_0_count = int(label_0_rate * label_1_count)

    total_label_0 = df[df['label'] == 0]

    model_count = sampled_label_0_count // 7
    sampled_label_0 = pd.DataFrame()

    # use for loop to sample from each model
    for i in range(7):
        model = total_label_0[total_label_0['model'] == i]
        sampled_model = model.sample(n=model_count, replace=False)
        sampled_label_0 = pd.concat([sampled_label_0, sampled_model])
    
    # domain2_machine = train_data2[train_data2['label'] == 8].sample(4388)
    # domain2_human = train_data2[train_data2['label'] == 1].sample(4300, replace = True)
    # print(len(doamin2_machine), len(domain2_human))
    # train_data = pd.concat(train_datal, domain2_machine[['text,'label']],domain2 human[['text','label']]]).sample(frac = 1)

    # concatenate the sampled label 0 and sampled label 1
    sampled_df = pd.concat([sampled_label_1, sampled_label_0])
    
    return sampled_df

In [14]:
newdomain2 = under_sample(domain2,1)
newdomain2 = newdomain2.sort_index()

In [15]:
domain2 = domain2[['text','label']]

In [16]:
# split data for validation
train_domain1, valid_domain1 = train_test_split(domain1,test_size=0.2,random_state=12)
train_domain2, valid_domain2 = train_test_split(domain2,test_size=0.2,random_state=12)

In [17]:
## input length should be the same while training neural network
max_length = 100 
class MyDataset(Dataset):
  def __init__(self, df):
    self.df = df

  def __len__(self):
    return len(self.df)

  def __getitem__(self,id):
    txt = self.df.iloc[id,:]['text'][:max_length]
    text_len = len(txt)
    txt = torch.tensor(txt)
    if text_len < max_length:
      txt = F.pad(txt,(0,max_length - text_len),'constant', 5000)

    label = torch.tensor(self.df.iloc[id,:]['label']).to(torch.int64)
    return txt, label

In [35]:
batch_size = 256, # 
train_dm2 = MyDataset(train_domain1)
train_loader = DataLoader(train_dm2, batch_size = 256, shuffle = True) 
valid_dm2 = MyDataset(valid_domain1)
valid_loader = DataLoader(valid_dm2, batch_size = 256, shuffle = True) 

In [31]:
class MLP(nn.Module):
  def __init__(self, dict_size, emb_size, hidden_size):
    super(MLP,self).__init__()
    ## embedding layer =
    self.embedding = nn.Embedding(dict_size, emb_size) # dict_size = 5000

    self.classifier = nn.Sequential(
        nn.Linear(emb_size, hidden_size),
        nn.ReLU(), 
        nn.Linear(hidden_size, hidden_size), 
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size), 
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size,1) ## real value
    )

  def forward(self,text):
    text_emb = self.embedding(text) # (batch_size, max_length, emb_size)
    text_emb = text_emb.mean(dim = 1) # (batch_size, emb_size) mean
    output = self.classifier(text_emb)
    return output.squeeze() 


In [32]:
GPU = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
model = MLP(dict_size = 5001, emb_size = 256, hidden_size = 256).to(GPU)
optimizer = optim.Adam(model.parameters(), lr=0.005)
criterion = nn.BCEWithLogitsLoss()

In [20]:
def train_one_epoch(model, train_loader):
  model.train()

  batch_loss = 0.0
  train_pred = []
  train_targets = []

  for batch in train_loader:

    text, label = batch[0].to(GPU), batch[1].to(device=GPU, dtype=torch.long)

    x_output = model.forward(text) 

    loss = criterion(x_output, label.float())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    batch_loss += loss.item()
    train_pred += torch.round(torch.sigmoid(x_output)).tolist()
    train_targets += label.tolist()

    batch_loss /= len(train_loader)

    train_fi = f1_score(train_targets, train_pred, average = 'macro')
    return train_fi, batch_loss

In [37]:
def valid_one_epoch(model, valid_loader):
  model.eval() # not update gradients
  batch_loss = 0.0
  valid_pred = []
  valid_target = []

  with torch.no_grad():
    for batch in valid_loader:
      txt, label = batch[0].to(device=GPU, dtype=torch.long), batch[1].to(device=GPU, dtype=torch.long)

      x_valid = model.forward(txt)
      loss = criterion(x_valid, label.float())

      batch_loss += loss.item()
      valid_pred += torch.round(torch.sigmoid(x_valid)).tolist()
      valid_target += label.tolist()

      batch_loss /= len(valid_loader)
      return batch_loss, valid_pred, valid_target


In [36]:
num_epochs = 40
valid_f1s = []
for epoch in range(num_epochs):
  train_f1, train_loss = train_one_epoch(model, train_loader)
  valid_loss, valid_targets, valid_pred = valid_one_epoch(model, valid_loader)
  valid_f1 = f1_score(valid_targets, valid_pred, average = 'macro')
  valid_f1s.append(valid_f1)
  print(f"epoch {epoch + 1}, train loss: {train_loss}.4f, train f1: {train_f1}.4f, valid_loss: {valid_loss}.4f, valid_f1: {valid_f1}.4f")

epoch 1, train loss: 0.015151219289810931.4f, train f1: 0.3385012919896641.4f, valid_loss: 0.04376106336712837.4f, valid_f1: 0.33506493506493507.4f
epoch 2, train loss: 0.011586904525756836.4f, train f1: 0.33159268929503916.4f, valid_loss: 0.04274410009384155.4f, valid_f1: 0.3402061855670103.4f
epoch 3, train loss: 0.011197035429907626.4f, train f1: 0.3385012919896641.4f, valid_loss: 0.043246038258075714.4f, valid_f1: 0.32275132275132273.4f
epoch 4, train loss: 0.011151848269290611.4f, train f1: 0.3469387755102041.4f, valid_loss: 0.042859211564064026.4f, valid_f1: 0.3402061855670103.4f
epoch 5, train loss: 0.01122124077843838.4f, train f1: 0.3435897435897436.4f, valid_loss: 0.0437910333275795.4f, valid_f1: 0.29863013698630136.4f
epoch 6, train loss: 0.011261973224702428.4f, train f1: 0.34190231362467866.4f, valid_loss: 0.0429520383477211.4f, valid_f1: 0.34190231362467866.4f
epoch 7, train loss: 0.011298893905076825.4f, train f1: 0.3402061855670103.4f, valid_loss: 0.04320754483342171.4f

In [None]:
mean(valid_f1s)