In [3]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [41]:
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk import pos_tag
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
import string
import re

# Extracting Data from Dataset 'COLING_2025_MGT_en'

In [42]:
data = pd.read_csv("/content/train_df (2).csv")

In [43]:
data

Unnamed: 0,text,label
0,"Well, gosh, that's a mighty interesting questi...",1
1,Sure! When a company like Google makes changes...,1
2,Sleep plays a vital role in our overall health...,1
3,Two important properties of a high-quality has...,0
4,the genetics of hair color variation is a comp...,1
...,...,...
9326,My interpretation is that it was a story about...,0
9327,Manga is a style of Japanese comics and graphi...,1
9328,sometimes a post on reddit has comments that a...,1
9329,The use of the dragon as a symbol of power and...,1


In [44]:
df_0 = data[data["label"] == 0].sample(1000)
df_1 = data[data["label"] == 1].sample(1000)

# Combine both subsets
df_selected = pd.concat([df_0, df_1])

# Keep only the 'label' and 'text' columns
df = df_selected[["label", "text"]]

# n-gram analysis

In [45]:
def remove_punctuation(tokens):
    punctuation = set(string.punctuation)  # Get punctuation characters
    return [word for word in tokens if word not in punctuation and not word.isdigit() and word != '...']

In [46]:
def clean_text(text):
  patext = re.sub(r'[^a-z\s]', '', text)
  text = ' '.join(text.split())
  return text

In [47]:
def generate_ngrams(text, n):
    text = clean_text(text.lower())
    tokens = word_tokenize(text.lower())
    tokens = remove_punctuation(tokens)
    # tagged_tokens = pos_tag(tokens)
    n_grams = list(ngrams(tokens, n))
    return Counter(n_grams)

In [56]:
human_text = df[df['label'] == 0]['text'].str.cat(sep=" ")
ai_text = df[df['label'] == 1]['text'].str.cat(sep=" ")

## Unigram

In [57]:
n = 1

ai_ngrams = generate_ngrams(ai_text, n)
human_ngrams = generate_ngrams(human_text, n)

print(f"Unigram analysis of AI-generated text:\n", ai_ngrams.most_common(10))
print(f"Unigram analysis ofHuman-written text:\n", human_ngrams.most_common(10))

Unigram analysis of AI-generated text:
 [(('the',), 11744), (('and',), 7709), (('of',), 6278), (('to',), 5943), (('a',), 5413), (('in',), 3861), (('that',), 3202), (('it',), 2733), (('is',), 2400), (('for',), 1781)]
Unigram analysis ofHuman-written text:
 [(('the',), 24541), (('to',), 11955), (('of',), 11905), (('a',), 10861), (('and',), 10553), (('in',), 7512), (('is',), 6648), (('that',), 6462), (('it',), 5829), (('you',), 4549)]


## Bigram

In [58]:
n = 2

ai_ngrams = generate_ngrams(ai_text, n)
human_ngrams = generate_ngrams(human_text, n)

print(f"Bigram analysis of AI-generated text:\n", ai_ngrams.most_common(10))
print(f"Bigram analysis ofHuman-written text:\n", human_ngrams.most_common(10))

Bigram analysis of AI-generated text:
 [(('of', 'the'), 1308), (('in', 'the'), 998), (('to', 'the'), 580), (('it', "'s"), 522), (('and', 'the'), 492), (('it', 'is'), 436), (('is', 'a'), 365), (('that', 'the'), 358), (('as', 'a'), 338), (('such', 'as'), 334)]
Bigram analysis ofHuman-written text:
 [(('of', 'the'), 2741), (('in', 'the'), 2043), (('to', 'the'), 1088), (('it', "'s"), 1019), (('on', 'the'), 784), (('and', 'the'), 762), (('to', 'be'), 761), (('this', 'is'), 636), (('if', 'you'), 635), (('do', "n't"), 582)]


## Trigram

In [59]:
n = 3

ai_ngrams = generate_ngrams(ai_text, n)
human_ngrams = generate_ngrams(human_text, n)

print(f"Trigram analysis of AI-generated text:\n", ai_ngrams.most_common(10))
print(f"Trigram analysis ofHuman-written text:\n", human_ngrams.most_common(10))

Trigram analysis of AI-generated text:
 [(('the', 'united', 'states'), 126), (('a', 'lot', 'of'), 105), (('one', 'of', 'the'), 100), (('it', '’', 's'), 87), (('it', "'s", 'important'), 77), (("'s", 'important', 'to'), 75), (('such', 'as', 'the'), 70), (('due', 'to', 'the'), 69), (('i', 'do', "n't"), 69), (('that', "'s", 'a'), 66)]
Trigram analysis ofHuman-written text:
 [(('a', 'lot', 'of'), 361), (('--', '--', '--'), 209), (('one', 'of', 'the'), 164), (('be', 'able', 'to'), 109), (('it', "'s", 'not'), 107), (('it', "'s", 'a'), 100), (('i', 'do', "n't"), 99), (('in', 'order', 'to'), 97), (('you', 'do', "n't"), 96), (('as', 'well', 'as'), 92)]


In [60]:
n = 4

ai_ngrams = generate_ngrams(ai_text, n)
human_ngrams = generate_ngrams(human_text, n)

print(f"4-grams analysis of AI-generated text:\n", ai_ngrams.most_common(10))
print(f"4-grams analysis ofHuman-written text:\n", human_ngrams.most_common(10))

4-grams analysis of AI-generated text:
 [(('it', "'s", 'important', 'to'), 75), (('the', 'paragraph', 'with', 'synonyms'), 52), (('in', 'the', 'united', 'states'), 51), (('certainly', 'here', '’', 's'), 49), (('here', '’', 's', 'the'), 49), (('important', 'to', 'note', 'that'), 49), (('it', 'is', 'important', 'to'), 47), (('’', 's', 'the', 'paragraph'), 45), (('s', 'the', 'paragraph', 'with'), 45), (('one', 'of', 'the', 'most'), 41)]
4-grams analysis ofHuman-written text:
 [(('--', '--', '--', '--'), 182), (('the', 'end', 'of', 'the'), 45), (('at', 'the', 'same', 'time'), 44), (('a', 'lot', 'of', 'people'), 42), (('the', 'speed', 'of', 'light'), 36), (('on', 'the', 'other', 'hand'), 34), (('the', 'rest', 'of', 'the'), 34), (('to', 'be', 'able', 'to'), 34), (('at', 'the', 'end', 'of'), 34), (('if', 'you', 'want', 'to'), 30)]


In [61]:
n = 5

ai_ngrams = generate_ngrams(ai_text, n)
human_ngrams = generate_ngrams(human_text, n)

print(f"5-grams analysis of AI-generated text:\n", ai_ngrams.most_common(10))
print(f"5-grams analysis ofHuman-written text:\n", human_ngrams.most_common(10))

5-grams analysis of AI-generated text:
 [(('certainly', 'here', '’', 's', 'the'), 46), (('here', '’', 's', 'the', 'paragraph'), 45), (('’', 's', 'the', 'paragraph', 'with'), 45), (('s', 'the', 'paragraph', 'with', 'synonyms'), 45), (('it', "'s", 'important', 'to', 'note'), 27), (("'s", 'important', 'to', 'note', 'that'), 25), (('the', 'paragraph', 'with', 'synonyms', 'replacing'), 25), (('let', 'me', 'know', 'if', 'you'), 23), (('me', 'know', 'if', 'you', 'have'), 23), (('know', 'if', 'you', 'have', 'any'), 23)]
5-grams analysis ofHuman-written text:
 [(('--', '--', '--', '--', '--'), 157), (('at', 'the', 'end', 'of', 'the'), 18), (('√', 'zw', '√', 'z', '√'), 16), (('zw', '√', 'z', '√', 'w'), 16), (('there', 'are', 'a', 'lot', 'of'), 13), (('on', 'the', 'other', 'side', 'of'), 12), (('the', 'other', 'side', 'of', 'the'), 10), (('identity', '√', 'zw', '√', 'z'), 10), (('as', 'far', 'as', 'i', 'know'), 9), (('in', 'the', 'same', 'way', 'that'), 9)]
