In [29]:
import os.path
from typing import Union, List, Tuple

import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix


In [30]:
import spacy

In [31]:
df_sample = pd.read_csv('data.csv')

# balanced train and test data in each df

In [32]:
ratio=0.8
df_human= df_sample[df_sample['source']=='Human']
df_llm = df_sample[df_sample['source']!='Human']
df_llm.source='LLM'
cut_idx_l=int(np.ceil(len(df_llm)*ratio))
cut_idx_h=int(np.ceil(len(df_human)*ratio))
df_train_human = df_human[:cut_idx_h]
df_test_human = df_human[cut_idx_h:]
df_train_llm = df_llm[:cut_idx_l]
df_test_llm = df_llm[cut_idx_l:]
df_train=pd.concat([df_train_human,df_train_llm])
df_test=pd.concat([df_test_human,df_test_llm])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_llm.source='LLM'


In [33]:
df_train.source.value_counts()


source
LLM      352984
Human    278154
Name: count, dtype: int64

In [34]:

df_test.source.value_counts()

source
LLM      88246
Human    69538
Name: count, dtype: int64

In [35]:
df_train.head(10)

Unnamed: 0,text,source,prompt_id,text_length,word_count
156512,"There is a saying in my home that goes, ""You t...",Human,0,1524,268
156513,Williams-Sonoma: Strategies and Future Prospec...,Human,0,5985,904
156514,"Shristi, your first two paragraphs are irrelev...",Human,0,1124,187
156515,Extended Lifespan and Its Great Danger Essay\n...,Human,0,7578,1261
156516,Imperialism and Nationalism in Middle Eastern ...,Human,0,9350,1441
156517,The Power of Free Speech Essay\n\nTable of Con...,Human,0,8629,1408
156518,Accounting and Auditing Practices Essay (Artic...,Human,0,4103,649
156519,Civil War Effect on Medicine and Public Health...,Human,0,2455,383
156520,Simple Stimulus Learning: Habituation and Perc...,Human,0,8773,1315
156521,Core Ethical Values of a Lawyer Essay\n\nTable...,Human,0,5500,857


In [36]:
df_train.shape

(631138, 5)

In [37]:
df_train.isna().sum()

text           0
source         0
prompt_id      0
text_length    0
word_count     0
dtype: int64

In [38]:
df_test.shape

(157784, 5)

In [39]:
df_test.isna().sum()

text           0
source         0
prompt_id      0
text_length    0
word_count     0
dtype: int64

In [40]:
!python3 -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [41]:
nlp = spacy.load("en_core_web_lg")
doc = nlp("Apple shares rose on the news. Apple pie is delicious.").vector

In [42]:
doc

array([-1.18700646e-01, -1.41039336e+00, -2.14910340e+00,  8.83527994e-01,
        5.56492472e+00, -9.87121642e-01, -1.14053094e+00,  4.43576670e+00,
       -3.72914225e-01, -1.41204226e+00,  4.56952858e+00,  4.27760839e-01,
       -2.31775141e+00,  6.92854226e-01,  1.15528846e+00,  2.31034160e+00,
        2.57655549e+00, -6.75784111e-01,  1.13154316e+00, -2.44445491e+00,
        1.47608435e+00, -1.68519959e-01, -7.40572512e-01, -1.11758280e+00,
       -3.36654162e+00, -7.73094893e-01, -1.00135839e+00, -1.16324246e+00,
       -7.35800922e-01,  1.05197847e+00, -3.64668399e-01, -1.64117500e-01,
       -3.30009460e+00, -3.08980751e+00, -1.81026840e+00,  5.23460805e-01,
       -1.47361326e+00,  3.91166568e+00,  1.94414079e+00,  1.41275465e+00,
        9.01624143e-01,  2.62060571e+00, -1.04173517e-03, -9.47884858e-01,
        3.97454977e-01,  1.39844847e+00,  2.00074172e+00, -1.80079424e+00,
        8.34936619e-01,  9.27045763e-01, -7.79365063e-01,  1.24467003e+00,
       -1.04951508e-01, -

### do not forget to define labels vector

In [43]:
from sklearn.preprocessing import LabelEncoder

In [44]:
le = LabelEncoder()
df_train['target']=le.fit_transform(df_train['source'])
df_train.target.value_counts()

target
1    352984
0    278154
Name: count, dtype: int64

In [45]:
df_test['target']=le.transform(df_test['source'])
df_test.target.value_counts()

target
1    88246
0    69538
Name: count, dtype: int64

In [46]:
df_train=df_train.drop('source', axis=1)
df_test=df_test.drop('source',axis=1)

In [47]:
df_train=df_train.drop(['prompt_id','text_length','word_count'], axis =1)
df_test=df_test.drop(['prompt_id','text_length','word_count'], axis =1)

In [48]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 631138 entries, 156512 to 700675
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    631138 non-null  object
 1   target  631138 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 14.4+ MB


In [49]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 157784 entries, 434666 to 788921
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    157784 non-null  object
 1   target  157784 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.6+ MB


In [50]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${torch.__version__}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${torch.__version__}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

#from torch_geometric

2.1.2+cu121


In [51]:
#df_train.to_csv('./train.csv', sep=',')

In [52]:
#df_train = pd.read_csv('train.csv', sep=',')

In [53]:
class Tokenizer(object):
    def __init__(self, word2idx=None, nlp_model="en_core_web_lg"):
        self.nlp= spacy.load(nlp_model)
        #self.word2idx=word2idx
        #self.idx=len(word2idx)
        if word2idx is None:
            self.word2idx = {}
            self.idx2word = {}
            self.idx = 0
            self.word2idx['<pad>'] = self.idx
            self.idx2word[self.idx] = '<pad>'
            self.idx += 1
            self.word2idx['<unk>'] = self.idx
            self.idx2word[self.idx] = 'unk'
            self.idx += 1
        else:
            self.word2idx = word2idx
            self.idx2word = { v:k for k,v in word2idx.items()}
        
    def fit_on_doc(self, doc:spacy.tokens.doc.Doc):
        for word in doc:
            word= str(word).lower()
            if word not in self.word2idx:
                self.word2idx[word]=self.idx
                self.idx2word[self.idx]=word
                self.idx += 1
    def text_to_doc(self, text):
        return self.nlp(text)
    def doc_to_sequence(self, doc:spacy.tokens.doc.Doc):
        sequence = []
        for w in doc:
            w = str(w).lower()
            word_id = self.word2idx.get(w,-1)
            if word_id == -1:
                word_id = self.word2idx['<unk>']
            sequence.append(word_id)
        if len(sequence) == 0:
            sequence = [0]
        return np.array(sequence, dtype=np.int32)
    def doc_to_adj(self, doc: spacy.tokens.doc.Doc):
        matrix = np.zeros((len(doc),len(doc))).astype('int32')
        for token in doc:
            for child in token.children:
                matrix[token.i][child.i] = 1
                matrix[child.i][token.i] = 1
        return matrix

In [54]:
tokenizer = Tokenizer()

In [55]:
tokenizer.word2idx

{'<pad>': 0, '<unk>': 1}

In [56]:
from scipy.sparse import coo_matrix

In [57]:
from tqdm import tqdm

In [58]:
df_train.isna().sum()

text      0
target    0
dtype: int64

In [59]:
df_train=df_train[:434666]

In [None]:
train_idx2graph = {}

for i in tqdm(range(len(df_train))):
    text = df_train.text[i].lower().replace("/n", "").strip()
    doc = tokenizer.text_to_doc(text)
    tokenizer.fit_on_doc(doc)
    adj_matrix = tokenizer.doc_to_adj(doc)
    coo = coo_matrix(adj_matrix)
    train_idx2graph[i] = np.array([coo.row, coo.col])

  3%|▎         | 11945/434666 [15:52<11:46:29,  9.97it/s]

In [None]:
len(train_idx2graph)

In [None]:
test_idx2graph = {}
for i in tqdm(range(len(df_test))):
    #doc=pd.concat([[df_train.text_tkn[i]],[df_test.prompt_tkn[i]])
    tokenizer.fit_on_doc(doc)
    adj_matrix = tokenizer.doc_to_adj(doc)
    coo = coo_matrix(adj_matrix)
    test_idx2graph[i] = np.array([coo.row, coo.col], dtype=np.int32)

In [None]:
df_train.to_csv('train.csv')

In [None]:
df_test.to_csv('test.csv')

In [None]:
result=train_idx2graph.items()
da = list(result)
nparr=np.array(da)
train = pd.DataFrame(nparr)

In [None]:
train.head()

In [None]:
train.to_csv('train_idx2graph.csv')

In [None]:
df_idx2graph=pd.read_csv('train_idx2graph.csv')

In [None]:
np_train_idx2graph=df_idx2graph.to_numpy()

In [None]:
result=test_idx2graph.items()
da = list(result)
nparr=np.array(da)
test = pd.DataFrame(nparr)

In [None]:
test.to_csv('test_idx2graph.csv')

In [None]:
df_test_idx2graph=pd.read_csv('test_idx2graph.csv')

In [None]:
np_test_idx2graph=df_test_idx2graph.to_numpy()

In [None]:
len(Tokenizer.word2idx)

In [None]:
def load_word_vec(path, word2idx=None, embed_dim=300):
    fin = open(path, 'r', encoding='utf8', newline='\n', errors='ignore')
    word_vec = {}
    for line in fin:
        tokens = line.rstrip().split()
        word, vec = ' '.join(tokens[:-embed_dim]), tokens[-embed_dim:]
        if word in word2idx.keys():
            word_vec[word] = np.array(vec, dtype=np.float32)
    return word_vec

In [None]:
from torchtext.vocab import GloVe
def build_embedding_matrix(word2idx, embed_dim=300):
    embedding_matrix = np.zeros((len(word2idx), embed_dim))
    embedding_matrix[1, :] = np.random.uniform(-1/np.sqrt(embed_dim), 1/np.sqrt(embed_dim), (1, embed_dim))

    glob_vector='./glove.840B.300d.txt'
    word_vec = load_word_vec(glob_vector, word2idx=word2idx, embed_dim=embed_dim)

    for word, i in word2idx.items():
        vec = word_vec.get(word)
        if vec is not None:
            embedding_matrix[i] = vec
    return embedding_matrix

In [None]:
embedding_matrix = build_embedding_matrix(tokenizer.word2idx, 300)

In [None]:
from torch_geometric.data import InMemoryDataset, Data


class TrainGraphFactoryDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)

        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_dir(self):
        return "./"

    @property
    def processed_dir(self):
        return os.path.join(self.root, "train_processed")

    @property
    def raw_file_names(self):
        return ['train.csv']

    @property
    def processed_file_names(self):
        return ['train-graph.pt']

    def download(self):
        pass

    def process(self):

        data_list = self.read_data()

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

    def read_data(self):
        df_train = pd.read_csv(self.raw_paths[0])
        all_data = []
        for i in tqdm(range(df_train.shape[0])):
            text = df_train.text[i].lower().replace("\n", "").strip()
            doc = tokenizer.text_to_doc(text)
            input_ids = tokenizer.doc_to_sequence(doc)
            label = df_train.loc[i, ["target"]].to_list()

            x = torch.tensor(input_ids.reshape(-1, 1), dtype=torch.int32)
            edge_index  = torch.tensor(train_idx2graph[i], dtype=torch.long)
            y = torch.tensor(np.array(label), dtype=torch.float32).reshape(-1, 1)
            data = Data(x=x, edge_index=edge_index, y=y)

            all_data.append(data)
        return all_data

In [None]:
dataset = TrainGraphFactoryDataset(root='./')

In [None]:
print(dataset.raw_paths, dataset.processed_paths)

In [None]:
dataset.data

In [None]:
dataset[0]

In [None]:
dataset[0].y

In [None]:
len(dataset)

In [None]:
dataset = dataset.shuffle()
ratio_cut= 0.8
train_len = int(ratio_cut*len(dataset))
train_dataset= dataset[:train_len]
val_dataset= dataset[train_len:]
print(len(train_dataset),len(val_dataset))


In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.optim import lr_scheduler
from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, TransformerConv, GATv2Conv, ChebConv, ResGatedGraphConv
import torch_geometric.nn as pyg_nn
from torch_geometric.loader import DataLoader

In [None]:
from torch_geometric.data import DataLoader

train_loader = DataLoader(train_dataset,batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

In [None]:
print(embedding_matrix)

In [None]:
class FeedbackModel(nn.Module):
    def __init__(self, embedding_matrix):

        super(FeedbackModel, self).__init__()

        self.embed = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float), freeze=False)
        # GCNConv SAGEConv ResGatedGraphConv GraphConv(300, 128) 
        # TransformerConv GATv2Conv GATConv(300, 128, heads=4) ChebConv(300, 128, K=2)
        # GCNConv SAGEConv ResGatedGraphConv GraphConv(128, 64) 
        # TransformerConv  GATv2Conv GATConv(4*128, 64) ChebConv(128, 64, K=2)
        #         self.gru = nn.GRU(256, 256, num_layers=1, 
        #                           dropout=0, batch_first=True,
        #                           bidirectional=False)          # RNN, GRU
        # output: (N, L, D∗Hout), D = 2 if bidirectional=True otherwise 1
        # h_n: (D∗num_layers, N, Hout)
        self.gc1   = GATv2Conv(300, 128)
        self.pool1 = pyg_nn.TopKPooling(128, ratio=0.8)
        self.gc2   = GCNConv(128, 128)
        self.pool2 = pyg_nn.TopKPooling(128, ratio=0.8)
        self.lin1  = nn.Linear(256, 64)
        self.lin2  = nn.Linear(64, 1)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = x.squeeze(1)
        x = self.embed(x)

        x = F.relu(self.gc1(x, edge_index))
        x, edge_index, edge_attr, batch, perm, score = self.pool1(x, edge_index, None, batch)
        x1 = torch.cat([pyg_nn.global_max_pool(x, batch), pyg_nn.global_mean_pool(x, batch)], dim=1)

        x = F.relu(self.gc2(x, edge_index))
        x, edge_index, edge_attr, batch, perm, score = self.pool2(x, edge_index, None, batch)
        x2 = torch.cat([pyg_nn.global_max_pool(x, batch), pyg_nn.global_mean_pool(x, batch)], dim=1)

        x = x1 + x2
        # x, hn = self.gru(x, None)
        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        output = F.relu(self.lin2(x))

        return output



In [None]:
model = FeedbackModel(embedding_matrix)
model

In [None]:
epochs = 60

criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=1e-5)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=6)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion.to(device)

In [None]:


total_loss = []
for epoch_num in range(epochs):

    model.train()
    total_loss_train = 0
    print(f'epoch: {epoch_num} in progress...')
    print(f'train computing')
    for j, sample_batched in tqdm(enumerate(train_loader)):
        sample_batched = sample_batched.to(device)
        optimizer.zero_grad()
        outputs = model(sample_batched)
        label = sample_batched.y.to(device)
        loss = criterion(outputs, label)
        loss.backward()
        total_loss_train += loss.item()
        optimizer.step()

    model.eval()
    total_loss_val = 0
    print(f'train evaluating')
    with torch.no_grad():
        for k, sample_batched in tqdm(enumerate(val_loader)):
            sample_batched = sample_batched.to(device)
            outputs = model(sample_batched)
            label = sample_batched.y.to(device)
            loss = criterion(outputs, label)
            total_loss_val += loss.item()

    scheduler.step(total_loss_val / len(val_dataset))

    print(f'Epoch: %02.0f ended | Train Loss: {total_loss_train / len(train_dataset): .3f} | Val Loss: {total_loss_val / len(val_dataset): .3f}' % (epoch_num + 1))
    total_loss.append([total_loss_train / len(train_dataset), total_loss_val / len(val_dataset)])

