In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv
/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv


In [2]:
dataset = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")

In [3]:
import random

In [4]:
stoi = {"[PAD]": 0,
       "[UNK]": 1}
itos = {0 : "[PAD]",
       1 : "[UNK]"}
total_text = set()
for index, row in dataset.iterrows():
    if random.random() <= 0.1:
        total_text.update(row["text"].lower().split())
stoi.update({char:i+2 for i, char in enumerate(sorted(total_text))})
itos.update({i+2:char for i, char in enumerate(sorted(total_text))})

In [5]:
itos[stoi['the']]

'the'

In [6]:
vocab_size = len(stoi)
vocab_size

44818

In [7]:
itos[0]

'[PAD]'

In [8]:
class tokenizer:
    def __init__(self, stoi, itos, max_len=100):
        self.stoi = stoi
        self.itos = itos
        self.vocab_size = len(itos)
        self.max_len = max_len
    def encode(self, text):
        out = []
        text = text.lower().split()[:self.max_len]
        for word in text:
            if word in stoi:
                out.append(stoi[word])
            else:
                out.append(1)
        out = [0] * max(0, abs(len(out) - self.max_len)) + out
        return out
    def decode(self, ids):
        out = ""
        for token in ids:
            if token in itos:
                out += itos[token] + " "
        return out

In [9]:
context_len = 30

In [10]:
tokenizer = tokenizer(stoi, itos, max_len=context_len)

In [11]:
tokenizer.encode("Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built")

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 8509,
 8487,
 20260,
 6474,
 5223,
 36307,
 40055,
 6411,
 17417,
 21807,
 39868,
 2035,
 43620,
 20545,
 18296,
 11734,
 4598,
 7683]

In [12]:
tokenizer.decode(tokenizer.encode("Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built"))

'[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] cars. cars have been around since they became famous in the 1900s, when henry ford created and built '

In [13]:
import torch

In [14]:
def build_dataset():
    xs, ys = [], []
    for index, row in dataset.iterrows():
        xs.append(tokenizer.encode(row["text"]))
        ys.append(row["label"])
    xs = torch.tensor(xs)
    ys = torch.tensor(ys)
    return xs, ys
xs, ys = build_dataset()

In [15]:
tokenizer.decode(xs[0].numpy())

'phones modern humans today are always on their phone. they are always on their phone more than 5 hours a day no stop .all they do is text back and '

In [16]:
ys[0]

tensor(0)

In [17]:
emb_dim = 10

In [18]:
# initialize parameters
C = torch.randn(vocab_size, emb_dim) * 0.1
w1 = torch.randn(context_len * emb_dim, 100) * 0.1
b1 = torch.randn(100) * 0.01
w2 = torch.randn(100, 2) * 0.1
b2 = torch.randn(2) * 0.01

params = [C, w1, b1, w2, b2]
for p in params:
    p.requires_grad = True

In [19]:
num_examples = len(xs)

In [20]:
# training params
num_epochs = 150000
lr = 0.01
batch_size = 16

In [21]:
import torch.nn.functional as F

In [22]:
for epoch in range(num_epochs):
    selection = torch.randint(0, num_examples-1, (batch_size, ))
    ins = xs[selection]
    outs = ys[selection]
    emb = C[ins].view(-1, emb_dim * context_len)
    l1 = ((emb @ w1) + b1).tanh()
    logits = ((l1 @ w2) + b2).sigmoid()
    for p in params:
        p.grad = None

    loss = F.cross_entropy(logits, outs)
    if epoch % 1000 == 0:
        print("Epoch:", epoch, ", loss:", loss.item())
    loss.backward()
    
    for p in params:
        p.data -= p.grad * lr
    

Epoch: 0 , loss: 0.7041065096855164
Epoch: 1000 , loss: 0.666569709777832
Epoch: 2000 , loss: 0.7159237265586853
Epoch: 3000 , loss: 0.6696544289588928
Epoch: 4000 , loss: 0.6647873520851135
Epoch: 5000 , loss: 0.6194063425064087
Epoch: 6000 , loss: 0.6472158432006836
Epoch: 7000 , loss: 0.6610148549079895
Epoch: 8000 , loss: 0.5692396759986877
Epoch: 9000 , loss: 0.5215916037559509
Epoch: 10000 , loss: 0.5241203904151917
Epoch: 11000 , loss: 0.612557053565979
Epoch: 12000 , loss: 0.6103710532188416
Epoch: 13000 , loss: 0.6535864472389221
Epoch: 14000 , loss: 0.5833619832992554
Epoch: 15000 , loss: 0.4598546028137207
Epoch: 16000 , loss: 0.5573556423187256
Epoch: 17000 , loss: 0.49386683106422424
Epoch: 18000 , loss: 0.38247013092041016
Epoch: 19000 , loss: 0.3662448525428772
Epoch: 20000 , loss: 0.3510403335094452
Epoch: 21000 , loss: 0.5207529067993164
Epoch: 22000 , loss: 0.4289911687374115
Epoch: 23000 , loss: 0.39424988627433777
Epoch: 24000 , loss: 0.4177905321121216
Epoch: 25000

In [23]:
with torch.no_grad():
    selection = torch.randint(0, num_examples-1, (1,))
    ins = xs[selection]
    outs = ys[selection]
    emb = C[ins].view(-1, emb_dim * context_len)
    l1 = ((emb @ w1) + b1).tanh()
    logits = ((l1 @ w2) + b2).sigmoid()
    print(tokenizer.decode(ins[0].numpy()))
    print("prediction:", torch.argmax(logits).item())
    print("truth:", outs[0].item())

finding a rock-like formation on mars was one of the biggest dicscovories nasa has made. they first say it in a picture taken by one of their satlights called viking 
prediction: 0
truth: 0


In [24]:
with torch.no_grad():
    while True:
        selection = torch.randint(0, num_examples-1, (1,))
        ins = xs[selection]
        outs = ys[selection]
        emb = C[ins].view(-1, emb_dim * context_len)
        l1 = ((emb @ w1) + b1).tanh()
        logits = ((l1 @ w2) + b2).sigmoid()
        if outs[0].item() == 1:
            print(tokenizer.decode(ins[0].numpy()))
            print("prediction:", torch.argmax(logits).item(), "confidence:", (logits[0][torch.argmax(logits).item()]/logits.sum()).item())
            print("truth:", outs[0].item())
            print("selection index:", selection.item())
            break

cell phones are a great way to keep in touch with the outside world, but they have no place inside a school building during class hours. allowing students to bring 
prediction: 0 confidence: 0.9970309138298035
truth: 1
selection index: 38134


In [25]:
def prompt(essay):
    with torch.no_grad():
        ins = torch.tensor(tokenizer.encode(essay))
        emb = C[ins].view(-1, emb_dim * context_len)
        l1 = ((emb @ w1) + b1).tanh()
        logits = ((l1 @ w2) + b2).sigmoid()
        return round((logits[0][1] / logits[0].sum()).item(), 1)

In [26]:
prompt("furthermore, the introduction of cars into modern society assisted with means of transportation")

1.0

In [27]:
tokenizer.decode(xs[96].numpy())

'today many people are on their phone & driving at the same time. cell phone use while driving has led to 1.6 million crashes each year. nearly 390,000 injuries have '

In [28]:
ys[96]

tensor(0)

In [29]:
test_dataset = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")

In [30]:
import csv

with open('submission.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["id", "generated"]
    
    writer.writerow(field)
    for index, row in test_dataset.iterrows():
        writer.writerow([row["id"], prompt(row["text"])])

In [31]:
answers = pd.read_csv("/kaggle/working/submission.csv")

In [32]:
for index, row in answers.iterrows():
    print(row)

id           0000aaaa
generated         1.0
Name: 0, dtype: object
id           1111bbbb
generated         1.0
Name: 1, dtype: object
id           2222cccc
generated         1.0
Name: 2, dtype: object
