In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv


In [2]:
dataset = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")

In [3]:
stoi = {"[PAD]": 0,
       "[UNK]": 1}
itos = {0 : "[PAD]",
       1 : "[UNK]"}
total_text = set()
for index, row in dataset.iterrows():
    total_text.update(row["text"].lower().split())
stoi.update({char:i+2 for i, char in enumerate(sorted(total_text))})
itos.update({i+2:char for i, char in enumerate(sorted(total_text))})

In [4]:
itos[stoi['the']]

'the'

In [5]:
vocab_size = len(stoi)
vocab_size

27098

In [6]:
itos[0]

'[PAD]'

In [7]:
class tokenizer:
    def __init__(self, stoi, itos, max_len=100):
        self.stoi = stoi
        self.itos = itos
        self.vocab_size = len(itos)
        self.max_len = max_len
    def encode(self, text):
        out = []
        text = text.lower().split()[:self.max_len]
        for word in text:
            if word in stoi:
                out.append(stoi[word])
            else:
                out.append(1)
        out = [0] * max(0, abs(len(out) - self.max_len)) + out
        return out
    def decode(self, ids):
        out = ""
        for token in ids:
            if token in itos:
                out += itos[token] + " "
        return out

In [8]:
tokenizer = tokenizer(stoi, itos)

In [9]:
tokenizer.encode("Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built")

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5101,
 5078,
 12037,
 3689,
 3022,
 21877,
 23950,
 3649,
 10404,
 13049,
 23783,
 1057,
 26411,
 12229,
 10942,
 7047,
 2623,
 4481]

In [10]:
tokenizer.decode(tokenizer.encode("Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built"))

'[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] cars. cars have been around since they became famous in the 1900s, when henry ford created and built '

In [11]:
import torch

In [12]:
def build_dataset():
    xs, ys = [], []
    for index, row in dataset.iterrows():
        xs.append(tokenizer.encode(row["text"]))
        ys.append(row["generated"])
    xs = torch.tensor(xs)
    ys = torch.tensor(ys)
    return xs, ys
xs, ys = build_dataset()

In [13]:
tokenizer.decode(xs[0].numpy())

'cars. cars have been around since they became famous in the 1900s, when henry ford created and built the first modelt. cars have played a major role in our every day lives since then. but now, people are starting to question if limiting car usage would be a good thing. to me, limiting the use of cars might be a good thing to do. in like matter of this, article, "in german suburb, life goes on without cars," by elizabeth rosenthal states, how automobiles are the linchpin of suburbs, where middle class families from either shanghai or chicago tend to '

In [14]:
ys[0]

tensor(0)

In [15]:
emb_dim = 10
context_len = 100

In [16]:
# initialize parameters
C = torch.randn(vocab_size, emb_dim) * 0.1
w1 = torch.randn(context_len * emb_dim, 100) * 0.1
b1 = torch.randn(100) * 0.01
w2 = torch.randn(100, 2) * 0.1
b2 = torch.randn(2) * 0.01

params = [C, w1, b1, w2, b2]
for p in params:
    p.requires_grad = True

In [17]:
num_examples = len(xs)

In [18]:
# training params
num_epochs = 100000
lr = 0.001
batch_size = 32

In [19]:
import torch.nn.functional as F

In [20]:
for epoch in range(num_epochs):
    selection = torch.randint(0, num_examples-1, (batch_size, ))
    ins = xs[selection]
    outs = ys[selection]
    emb = C[ins].view(-1, emb_dim * context_len)
    l1 = ((emb @ w1) + b1).tanh()
    logits = ((l1 @ w2) + b2).sigmoid()
    for p in params:
        p.grad = None
        
    loss = F.cross_entropy(logits, outs)
    if epoch % 1000 == 0:
        print("Epoch:", epoch, ", loss:", loss.item())
    loss.backward()
    
    for p in params:
        p.data -= p.grad * lr
    

Epoch: 0 , loss: 0.6834402084350586
Epoch: 1000 , loss: 0.6344739198684692
Epoch: 2000 , loss: 0.574284017086029
Epoch: 3000 , loss: 0.5157482624053955
Epoch: 4000 , loss: 0.4966508746147156
Epoch: 5000 , loss: 0.4564148187637329
Epoch: 6000 , loss: 0.41934099793434143
Epoch: 7000 , loss: 0.39981362223625183
Epoch: 8000 , loss: 0.38611936569213867
Epoch: 9000 , loss: 0.37509405612945557
Epoch: 10000 , loss: 0.36387571692466736
Epoch: 11000 , loss: 0.35215264558792114
Epoch: 12000 , loss: 0.34922531247138977
Epoch: 13000 , loss: 0.34379836916923523
Epoch: 14000 , loss: 0.3394901156425476
Epoch: 15000 , loss: 0.3369530439376831
Epoch: 16000 , loss: 0.33599817752838135
Epoch: 17000 , loss: 0.33191531896591187
Epoch: 18000 , loss: 0.32918819785118103
Epoch: 19000 , loss: 0.33179453015327454
Epoch: 20000 , loss: 0.32788747549057007
Epoch: 21000 , loss: 0.32777413725852966
Epoch: 22000 , loss: 0.32558953762054443
Epoch: 23000 , loss: 0.32405173778533936
Epoch: 24000 , loss: 0.351763516664505

In [21]:
with torch.no_grad():
    selection = torch.randint(0, num_examples-1, (1,))
    ins = xs[selection]
    outs = ys[selection]
    emb = C[ins].view(-1, emb_dim * context_len)
    l1 = ((emb @ w1) + b1).tanh()
    logits = ((l1 @ w2) + b2).sigmoid()
    print(tokenizer.decode(ins[0].numpy()))
    print("prediction:", torch.argmax(logits).item())
    print("truth:", outs[0].item())

dear mr.mrs. senator, in light of previous research i've done on the electoral college, i would like the opportunity to voice my opinion on how the voting system should work in this country. based off of the information acquired and with the best interest of the country at heart, i believe that the system should be changed to the election by popular vote for the president of the united states. first and foremost, when voters vote, they aren't really voting for their candidate, but a slate of electors, which in turn elect the president. don't you think that this method 
prediction: 0
truth: 0


In [22]:
def prompt(essay):
    with torch.no_grad():
        ins = torch.tensor(tokenizer.encode(essay))
        emb = C[ins].view(-1, emb_dim * context_len)
        l1 = ((emb @ w1) + b1).tanh()
        logits = ((l1 @ w2) + b2).sigmoid()
        return logits[0][1] / logits.sum()

In [23]:
prompt("no")

tensor(0.0071)

In [24]:
tokenizer.decode(xs[96].numpy())

'when electing the president of the united states, you must vote in favor of the president you want. however, instead of coming down to popular vote, the system uses the electrolar college to decide. the electoral vote works in that the people of a certain states votes for their candidate and the majority of people who vote for a certain president give their votes to a slate of voters. what this means is that the popular vote within a state wins the electoral votes of the electors who then vote for the president. many people dislike this system including myself. '

In [25]:
ys[96]

tensor(0)

In [26]:
test_dataset = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")

In [27]:
import csv

with open('submission.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["id", "generated"]
    
    writer.writerow(field)
    writer.writerow(["Oladele Damilola", "40", "Nigeria"])