# Segunda etapa, extração

In [4]:
import numpy as np
import pandas as pd
import os
import random
def set_seed(seed: int) -> None:
    """Seed every random number source this notebook relies on.

    Args:
        seed: Integer used to seed Python's ``random`` module and NumPy's
            legacy global generator; its string form is also stored in the
            ``PYTHONHASHSEED`` environment variable.
    """
    # NOTE(review): the interpreter presumably reads PYTHONHASHSEED only at
    # startup, so this would affect processes launched afterwards rather than
    # the running kernel - confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)  # operating-system level setting
    np.random.seed(seed)  # NumPy global generator, the one sklearn uses
    random.seed(seed)  # Python standard-library generator

set_seed(25)

In [5]:
import pandas as pd

# Load the raw corpus; the following cells read its "text" and "source" columns.
DATASET_PATH = "human_or_ai_dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [6]:
# Preview the first rows of the loaded frame to confirm it parsed as expected.
df.head()

Unnamed: 0,text,source
0,12 Years a Slave: An Analysis of the Film Essa...,human
1,20+ Social Media Post Ideas to Radically Simpl...,human
2,2022 Russian Invasion of Ukraine in Global Med...,human
3,533 U.S. 27 (2001) Kyllo v. United States: The...,human
4,A Charles Schwab Corporation Case Essay\n\nCha...,human


## Tentar com outras libs

In [7]:
# Encode the source column: "human" -> 0, every other label (e.g. "ai") -> 1.
# The guard keeps this cell idempotent: once the column is numeric, running the
# cell again must not re-encode it (0 != "human" would turn every row into 1).
if not pd.api.types.is_numeric_dtype(df["source"]):
    # Vectorised comparison instead of a per-row Python lambda.
    df["source"] = (df["source"] != "human").astype("int64")

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# `df` is the frame loaded from the CSV, with "source" already encoded as 0/1.
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that only appended a stray "None" line to the output).
df.info()

# Binary bag-of-words: each feature records presence/absence of a token, with
# the vocabulary capped at 10,000 features.
vectorizer = CountVectorizer(max_features=10000, binary=True)
print(df["source"].value_counts())

# Stratified split: df_tail keeps 98.7% of the rows and df_head the remaining
# 1.3% (test_size=0.013); only df_head is vectorised below.
df_tail, df_head = train_test_split(df, test_size=0.013, random_state=25, stratify=df["source"])

print(df_tail["source"].value_counts())
print(df_head["source"].value_counts())

# Rename "source" to "targetLabel"
df_head = df_head.rename(columns={"source": "targetLabel"})

# Fit the vocabulary on df_head and transform its text into a sparse matrix.
# NOTE(review): the vocabulary is fitted on all of df_head, i.e. before the
# train/validation/test split performed later in this notebook, so held-out
# rows influence feature selection - consider fitting on the training part only.
X = vectorizer.fit_transform(df_head['text'])

# Convert the sparse matrix to a dense int8 DataFrame, one column per token.
df_encoded = pd.DataFrame(
    X.astype("int8").toarray(),
    columns=vectorizer.get_feature_names_out()
)

# Add the "targetLabel" column at the last position. `.values` drops df_head's
# shuffled index so the labels are assigned positionally to the new RangeIndex.
df_encoded["targetLabel"] = df_head["targetLabel"].values

# Final check (info() called directly for the same reason as above).
df_encoded.info()
print(df_encoded.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3400868 entries, 0 to 3400867
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   text    object
 1   source  int64 
dtypes: int64(1), object(1)
memory usage: 51.9+ MB
None
source
0    2002394
1    1398474
Name: count, dtype: int64
source
0    1976362
1    1380294
Name: count, dtype: int64
source
0    26032
1    18180
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44212 entries, 0 to 44211
Columns: 10001 entries, 00 to targetLabel
dtypes: int64(1), int8(10000)
memory usage: 422.0 MB
None
   00  000  01  02  03  04  05  06  07  08  ...  youth  youths  youtube  \
0   0    0   0   0   0   0   0   0   0   0  ...      0       0        0   
1   0    0   0   0   0   0   0   0   0   0  ...      0       0        0   
2   0    0   0   0   0   0   0   0   0   0  ...      0       0        0   
3   0    0   0   0   0   0   0   0   0   0  ...      0       0        0   
4   0    0   0   0   0   0  

In [9]:
# Re-inspect the encoded frame: structural summary first, then the first rows.
df_encoded.info()
df_encoded.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44212 entries, 0 to 44211
Columns: 10001 entries, 00 to targetLabel
dtypes: int64(1), int8(10000)
memory usage: 422.0 MB


Unnamed: 0,00,000,01,02,03,04,05,06,07,08,...,youth,youths,youtube,zealand,zero,zombie,zone,zones,zoo,targetLabel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# Class balance of the encoded sample (row count per targetLabel value).
print(df_encoded["targetLabel"].value_counts())

targetLabel
0    26032
1    18180
Name: count, dtype: int64


In [11]:
# Hold out 20% of the encoded rows as the test set, stratified by label.
df_train_val, df_test = train_test_split(
    df_encoded,
    test_size=0.2,
    random_state=25,
    stratify=df_encoded["targetLabel"],
)

# Carve the validation set out of the remaining 80%: 12.5% of that remainder
# equals 10% of the original rows, which leaves 70% for training.
df_train, df_val = train_test_split(
    df_train_val,
    test_size=0.125,
    random_state=25,
    stratify=df_train_val["targetLabel"],
)

# Report the class balance of each split, in the order train, test, validation.
for split_frame in (df_train, df_test, df_val):
    print(split_frame["targetLabel"].value_counts())

targetLabel
0    18221
1    12726
Name: count, dtype: int64
targetLabel
0    5207
1    3636
Name: count, dtype: int64
targetLabel
0    2604
1    1818
Name: count, dtype: int64


In [14]:
# Persist every split to its own CSV file, leaving the index column out.
for csv_name, frame_to_save in (
    ('train.csv', df_train),
    ('validation.csv', df_val),
    ('test.csv', df_test),
):
    frame_to_save.to_csv(csv_name, index=False)

# Stor

In [28]:
# Load the tab-separated inputs file and discard its "ID" column in one
# expression, so no in-place mutation of the frame is needed.
df_in = pd.read_csv("dataset1_inputs.csv", sep="\t").drop(columns=["ID"])
df_in.head()

Unnamed: 0,Text
0,"The cell cycle, or cell-division cycle, is the..."
1,The cell cycle is the process by which a cell ...
2,"Photons, in many atomic models in physics, are..."
3,A photon is a fundamental particle of light an...
4,"According to the theory of plate tectonics, Ea..."


In [29]:
# Summarise the input frame: row count, columns, non-null counts and dtypes.
df_in.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    30 non-null     object
dtypes: object(1)
memory usage: 372.0+ bytes


In [30]:
# Rename "Text" to "text" so the column name matches the one used when the
# vectorizer was fitted.
df_in = df_in.rename(columns={"Text": "text"})

# Transform only (no fitting): the already-fitted vectorizer's vocabulary is
# reused, so the resulting columns match the features it produced earlier.
X_in = vectorizer.transform(df_in['text'])

# Convert the sparse matrix to a dense int8 DataFrame, one column per token.
df_in_encoded = pd.DataFrame(
    X_in.astype("int8").toarray(),
    columns=vectorizer.get_feature_names_out()
)

# Final check. info() prints its own report and returns None, so it is called
# directly rather than through print() (which appended a stray "None" line).
df_in_encoded.info()
print(df_in_encoded.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Columns: 10000 entries, 00 to zoo
dtypes: int8(10000)
memory usage: 293.1 KB
None
   00  000  01  02  03  04  05  06  07  08  ...  youself  youth  youths  \
0   0    0   0   0   0   0   0   0   0   0  ...        0      0       0   
1   0    0   0   0   0   0   0   0   0   0  ...        0      0       0   
2   0    0   0   0   0   0   0   0   0   0  ...        0      0       0   
3   0    0   0   0   0   0   0   0   0   0  ...        0      0       0   
4   0    0   0   0   0   0   0   0   0   0  ...        0      0       0   

   youtube  zealand  zero  zombie  zone  zones  zoo  
0        0        0     0       0     0      0    0  
1        0        0     0       0     0      0    0  
2        0        0     1       0     0      0    0  
3        0        0     0       0     0      0    0  
4        0        0     0       0     0      0    0  

[5 rows x 10000 columns]


In [31]:
# Save the encoded inputs to CSV, omitting the index column.
df_in_encoded.to_csv('input_prof.csv', index=False)