# Construção do Dataset
Primeiramente, pretendemos agrupar diversos datasets, utilizando duas colunas: "text", texto plano, sendo a segunda coluna a label "source" que categoriza o texto como "human" ou "ai".
O segundo passo é extrair as _features_ necessárias para treinar os modelos

In [4]:
import numpy as np
import pandas as pd
import os
import random
def set_seed(seed: int):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo

set_seed(25)

# Montar o Dataset

In [4]:
dataframes = []

## Primeiro Dataset

In [3]:
from datasets import load_dataset
ds = load_dataset("artem9k/ai-text-detection-pile")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 1392522/1392522 [00:05<00:00, 237601.89 examples/s]


In [4]:
df_1 = pd.DataFrame(ds['train'])
df_1.drop(columns=['id'], inplace=True)

df_1 = df_1.iloc[:, [1,0]]
dataframes.append(df_1)
df_1.head()

Unnamed: 0,text,source
0,12 Years a Slave: An Analysis of the Film Essa...,human
1,20+ Social Media Post Ideas to Radically Simpl...,human
2,2022 Russian Invasion of Ukraine in Global Med...,human
3,533 U.S. 27 (2001) Kyllo v. United States: The...,human
4,A Charles Schwab Corporation Case Essay\n\nCha...,human


## Segundo Dataset

In [5]:
from datasets import load_dataset

ds2 = load_dataset("dmitva/human_ai_generated_text")

Generating train split: 100%|██████████| 1000000/1000000 [00:37<00:00, 27021.05 examples/s]


In [6]:
df_2 = pd.DataFrame(ds2['train'])

# Create a DataFrame for human text
df_human = df_2[['human_text']].copy()
df_human = df_human.rename(columns={'human_text': 'text'})
df_human['source'] = 'human'

# Create a DataFrame for AI text
df_ai = df_2[['ai_text']].copy()
df_ai = df_ai.rename(columns={'ai_text': 'text'})
df_ai['source'] = 'ai'

# Combine the two DataFrames into one
new_df_2 = pd.concat([df_human, df_ai], ignore_index=True)
dataframes.append(new_df_2)
new_df_2.head()

Unnamed: 0,text,source
0,Also they feel more comfortable at home. Some ...,human
1,"I can get another job to work on the weekends,...",human
2,parents and school should agree on the desicio...,human
3,"Base in my experiences I'm growing, I try hard...",human
4,Many people around the world have different ch...,human


In [7]:
# Sanity check
print(new_df_2['source'].value_counts())

source
human    1000000
ai       1000000
Name: count, dtype: int64


## Terceiro Dataset

In [9]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds_abs = load_dataset("NicolaiSivesind/human-vs-machine", "research_abstracts_labeled")
ds_wiki = load_dataset("NicolaiSivesind/human-vs-machine", "wiki_labeled")

df_3_1_1 = pd.DataFrame(ds_abs['train'])
df_3_1_2 = pd.DataFrame(ds_abs['validation'])
df_3_1_3 = pd.DataFrame(ds_abs['test'])

df_3_2_1 = pd.DataFrame(ds_wiki['train'])
df_3_2_2 = pd.DataFrame(ds_wiki['validation'])
df_3_2_3 = pd.DataFrame(ds_wiki['test'])

df_3 = pd.concat([df_3_1_1, df_3_1_2, df_3_1_3, df_3_2_1, df_3_2_2, df_3_2_3], ignore_index=True)

Generating train split: 100%|██████████| 14000/14000 [00:00<00:00, 52253.38 examples/s]
Generating test split: 100%|██████████| 3000/3000 [00:00<00:00, 99985.79 examples/s]
Generating validation split: 100%|██████████| 3000/3000 [00:00<00:00, 162519.53 examples/s]
Generating train split: 100%|██████████| 210000/210000 [00:00<00:00, 341620.08 examples/s]
Generating test split: 100%|██████████| 45000/45000 [00:00<00:00, 452801.90 examples/s]
Generating validation split: 100%|██████████| 45000/45000 [00:00<00:00, 471568.01 examples/s]


In [10]:
# Create a mapping for the label values
label_to_source = {
    0: "human",
    1: "ai"
}

# Apply the mapping to create the "source" column
df_3['source'] = df_3['label'].map(label_to_source)

# Select only the desired columns: "text" and "source"
new_df_3 = df_3[['text', 'source']].copy()

dataframes.append(new_df_3)
new_df_3.head()


Unnamed: 0,text,source
0,Coupling losses were studied in composite tape...,human
1,"In this study, we investigate the coupling los...",ai
2,Let $\mathsf M_{\mathsf S}$ denote the strong ...,human
3,"In this paper, we investigate Weighted Solyani...",ai
4,In 2019 October Betelgeuse began a decline in ...,human


## Quarto Dataset

In [11]:
df_4 = pd.read_csv("LLM.csv")
df_4.rename(columns = {"Text": "text", "Label": "source"}, inplace=True)

# Create a mapping for the label values
label_to_source = {
    "ai": "ai",
    "student": "human"
}

# Apply the mapping to create the "source" column
df_4['source'] = df_4['source'].map(label_to_source)
df_4.head()

Unnamed: 0,text,source
0,y r u always l8 to the meetings?,human
1,The project team embraced a user-centric desig...,ai
2,"i dont like dealing with risks, it's too stres...",human
3,"i dont worry about reliability, it's good enough",human
4,"i dont care about human-centered design, just ...",human


## Quinto Dataset

In [12]:
df_5 = pd.read_csv("data_set.csv")
df_5.rename(columns = {"abstract": "text", "is_ai_generated": "source"}, inplace=True)
df_5.drop(columns=['title','ai_generated'], inplace=True)

# Create a mapping for the label values
label_to_source = {
    1: "ai",
    0: "human"
}

# Apply the mapping to create the "source" column
df_5['source'] = df_5['source'].map(label_to_source)
print(df_5['source'].value_counts())
dataframes.append(df_5)
df_5.head()

source
human    2100
ai       1953
Name: count, dtype: int64


Unnamed: 0,text,source
0,Advanced electromagnetic potentials are indi...,human
1,This research paper investigates the question ...,ai
2,We give an algorithm for finding network enc...,human
3,The paper presents an efficient centralized bi...,ai
4,We introduce an exponential random graph mod...,human


## Sexto Dataset

In [13]:
df_6_news_gpt = pd.read_pickle("en_news_gpt_features_df.pkl")
df_6_news_human = pd.read_pickle("en_news_human_features_df.pkl")
df_6_wiki_gpt = pd.read_pickle("en_wiki_gpt_features_df.pkl")
df_6_wiki_human = pd.read_pickle("en_wiki_human_features_df.pkl")

df_6_news_gpt = df_6_news_gpt[['text']]
df_6_news_gpt['source'] = 'ai'

df_6_news_human = df_6_news_human[['text']]
df_6_news_human['source'] = 'human'

df_6_wiki_gpt = df_6_wiki_gpt[['text']]
df_6_wiki_gpt['source'] = 'ai'

df_6_wiki_human = df_6_wiki_human[['text']]
df_6_wiki_human['source'] = 'human'

df_6 = pd.concat([df_6_news_gpt, df_6_news_human, df_6_wiki_gpt, df_6_wiki_human], ignore_index=True)
dataframes.append(df_6)
print(df_6['source'].value_counts())


source
ai       800
human    200
Name: count, dtype: int64


## Juntar tudo

In [14]:
df = pd.concat(dataframes, ignore_index=True)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3717575 entries, 0 to 3717574
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   text    object
 1   source  object
dtypes: object(2)
memory usage: 56.7+ MB


In [None]:
df = df.drop_duplicates()
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 3400859 entries, 0 to 3717574
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   text    object
 1   source  object
dtypes: object(2)
memory usage: 77.8+ MB


In [20]:
df.to_csv("human_or_ai_dataset.csv", index=False)

# Segunda etapa, extração

In [19]:
from sklearn.feature_extraction.text import HashingVectorizer

# Initialize the HashingVectorizer
vectorizer = HashingVectorizer(n_features=5000, binary=True, lowercase=True)
X = vectorizer.transform(df['text'])

# Create a sparse DataFrame; since HashingVectorizer doesn't store feature names,
# we generate generic column names.
column_names = [f"feature_{i}" for i in range(X.shape[1])]
df_encoded = pd.DataFrame.sparse.from_spmatrix(X, columns=column_names, dtype="int16")


MemoryError: Unable to allocate 16.0 GiB for an array with shape (2147483648,) and data type float64

In [None]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3400859 entries, 0 to 3400858
Columns: 1000 entries, 000 to yourself
dtypes: int16(1000)
memory usage: 6.3 GB


In [None]:
df_encoded.to_csv("human_or_ai_dataset.csv", index=False)


## Tentar com outras libs

In [4]:
df = pd.read_csv("human_or_ai_dataset.csv")  # Change this to your actual data source

In [None]:
# Load large dataset into Dask DataFrame (assuming a CSV file)
from sklearn.feature_extraction.text import CountVectorizer,HashingVectorizer

print(df.info())
# Initialize CountVectorizer
vectorizer = CountVectorizer(max_features=10000,binary=True)

# Fit and transform the text data
X = vectorizer.fit_transform(df['text'].head(1500000))

# Convert to DataFrame with words as feature names
df_encoded = pd.DataFrame(
    X.astype("int8").toarray(), 
    columns=vectorizer.get_feature_names_out()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3400859 entries, 0 to 3400858
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   text    object
 1   source  object
dtypes: object(2)
memory usage: 51.9+ MB
None


ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
df_encoded.head()

Unnamed: 0,00,000,01,02,03,04,05,06,07,08,...,youth,youthful,youths,youtube,zealand,zero,zombie,zombies,zone,zones
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
