In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv
/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv


In [2]:
dataset = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")

In [3]:
import random

In [4]:
stoi = {"[PAD]": 0,
       "[UNK]": 1}
itos = {0 : "[PAD]",
       1 : "[UNK]"}
total_text = set()
for index, row in dataset.iterrows():
    if random.random() <= 0.1:
        total_text.update(row["text"].lower().split())
stoi.update({char:i+2 for i, char in enumerate(sorted(total_text))})
itos.update({i+2:char for i, char in enumerate(sorted(total_text))})

In [5]:
itos[stoi['the']]

'the'

In [6]:
vocab_size = len(stoi)
vocab_size

44608

In [7]:
itos[0]

'[PAD]'

In [8]:
class tokenizer:
    def __init__(self, stoi, itos, max_len=100):
        self.stoi = stoi
        self.itos = itos
        self.vocab_size = len(itos)
        self.max_len = max_len
    def encode(self, text):
        out = []
        text = text.lower().split()[:self.max_len]
        for word in text:
            if word in stoi:
                out.append(stoi[word])
            else:
                out.append(1)
        out = [0] * max(0, abs(len(out) - self.max_len)) + out
        return out
    def decode(self, ids):
        out = ""
        for token in ids:
            if token in itos:
                out += itos[token] + " "
        return out

In [9]:
context_len = 30

In [10]:
tokenizer = tokenizer(stoi, itos, max_len=context_len)

In [11]:
tokenizer.encode("Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built")

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 8354,
 8333,
 20122,
 6328,
 5073,
 36031,
 39812,
 6268,
 17258,
 21677,
 39624,
 2029,
 43382,
 20410,
 18119,
 11521,
 4468,
 7547]

In [12]:
tokenizer.decode(tokenizer.encode("Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built"))

'[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] cars. cars have been around since they became famous in the 1900s, when henry ford created and built '

In [13]:
import torch

In [14]:
def build_dataset():
    xs, ys = [], []
    for index, row in dataset.iterrows():
        xs.append(tokenizer.encode(row["text"]))
        ys.append(row["label"])
    xs = torch.tensor(xs)
    ys = torch.tensor(ys)
    return xs, ys
xs, ys = build_dataset()

In [15]:
tokenizer.decode(xs[0].numpy())

'phones modern humans today are always on their phone. they are always on their phone more than 5 hours a day no stop .all they do is text back and '

In [16]:
ys[0]

tensor(0)

In [17]:
emb_dim = 10

In [18]:
# initialize parameters
C = torch.randn(vocab_size, emb_dim) * 0.1
w1 = torch.randn(context_len * emb_dim, 100) * 0.1
b1 = torch.randn(100) * 0.01
w2 = torch.randn(100, 2) * 0.1
b2 = torch.randn(2) * 0.01

params = [C, w1, b1, w2, b2]
for p in params:
    p.requires_grad = True

In [19]:
num_examples = len(xs)

In [20]:
# training params
num_epochs = 150000
lr = 0.01
batch_size = 16

In [21]:
import torch.nn.functional as F

In [22]:
for epoch in range(num_epochs):
    selection = torch.randint(0, num_examples-1, (batch_size, ))
    ins = xs[selection]
    outs = ys[selection]
    emb = C[ins].view(-1, emb_dim * context_len)
    l1 = ((emb @ w1) + b1).tanh()
    logits = ((l1 @ w2) + b2).sigmoid()
    for p in params:
        p.grad = None

    loss = F.cross_entropy(logits, outs)
    if epoch % 1000 == 0:
        print("Epoch:", epoch, ", loss:", loss.item())
    loss.backward()
    
    for p in params:
        p.data -= p.grad * lr
    

Epoch: 0 , loss: 0.7018887996673584
Epoch: 1000 , loss: 0.6407331228256226
Epoch: 2000 , loss: 0.6564851403236389
Epoch: 3000 , loss: 0.6855296492576599
Epoch: 4000 , loss: 0.6256868243217468
Epoch: 5000 , loss: 0.646554172039032
Epoch: 6000 , loss: 0.6064630150794983
Epoch: 7000 , loss: 0.6537873148918152
Epoch: 8000 , loss: 0.5783604383468628
Epoch: 9000 , loss: 0.7226593494415283
Epoch: 10000 , loss: 0.5815320611000061
Epoch: 11000 , loss: 0.539470911026001
Epoch: 12000 , loss: 0.5919642448425293
Epoch: 13000 , loss: 0.5460279583930969
Epoch: 14000 , loss: 0.5537245273590088
Epoch: 15000 , loss: 0.48451903462409973
Epoch: 16000 , loss: 0.5546855926513672
Epoch: 17000 , loss: 0.5416461825370789
Epoch: 18000 , loss: 0.42463603615760803
Epoch: 19000 , loss: 0.39539268612861633
Epoch: 20000 , loss: 0.37280118465423584
Epoch: 21000 , loss: 0.438774049282074
Epoch: 22000 , loss: 0.48591169714927673
Epoch: 23000 , loss: 0.44220396876335144
Epoch: 24000 , loss: 0.4549175202846527
Epoch: 250

In [23]:
with torch.no_grad():
    selection = torch.randint(0, num_examples-1, (1,))
    ins = xs[selection]
    outs = ys[selection]
    emb = C[ins].view(-1, emb_dim * context_len)
    l1 = ((emb @ w1) + b1).tanh()
    logits = ((l1 @ w2) + b2).sigmoid()
    print(tokenizer.decode(ins[0].numpy()))
    print("prediction:", torch.argmax(logits).item())
    print("truth:", outs[0].item())

extending the school day can be both a blessing and a curse for students. on the one hand, four days of school can be fewer school days in a week 
prediction: 1
truth: 1


In [24]:
with torch.no_grad():
    while True:
        selection = torch.randint(0, num_examples-1, (1,))
        ins = xs[selection]
        outs = ys[selection]
        emb = C[ins].view(-1, emb_dim * context_len)
        l1 = ((emb @ w1) + b1).tanh()
        logits = ((l1 @ w2) + b2).sigmoid()
        if outs[0].item() == 1:
            print(tokenizer.decode(ins[0].numpy()))
            print("prediction:", torch.argmax(logits).item(), "confidence:", (logits[0][torch.argmax(logits).item()]/logits.sum()).item())
            print("truth:", outs[0].item())
            print("selection index:", selection.item())
            break

graduating early from high school can be a great opportunity for motivated and actively engaged students who are looking to take their education to the next level. while leaving the 
prediction: 1 confidence: 0.9999995231628418
truth: 1
selection index: 29703


In [25]:
def prompt(essay):
    with torch.no_grad():
        ins = torch.tensor(tokenizer.encode(essay))
        emb = C[ins].view(-1, emb_dim * context_len)
        l1 = ((emb @ w1) + b1).tanh()
        logits = ((l1 @ w2) + b2).sigmoid()
        return logits[0][1] / logits.sum()

In [26]:
prompt("i love cats always, they are my favoite animals ever and so fun. i have two cats at home")

tensor(1.1511e-06)

In [27]:
tokenizer.decode(xs[96].numpy())

'today many people are on their phone & driving at the same time. cell phone use while driving has led to 1.6 million crashes each year. nearly 390,000 injuries have '

In [28]:
ys[96]

tensor(0)

In [29]:
test_dataset = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")

In [30]:
import csv

with open('submission.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["id", "generated"]
    
    writer.writerow(field)
    for index, row in test_dataset.iterrows():
        writer.writerow([row["id"], prompt(row["text"]).item()])

In [31]:
answers = pd.read_csv("/kaggle/working/submission.csv")

In [32]:
for index, row in answers.iterrows():
    print(row)
    break

id           0059830c
generated    0.000004
Name: 0, dtype: object
