# 0.0 IMPORTS

In [None]:
import pandas as pd
import tiktoken

# Widen text columns in rendered DataFrames so long values are readable.
pd.options.display.max_colwidth = 200

: 

## 0.1 Helper Functions

In [None]:
def chunk_text(
    text: str,
    chunk_size: int = 1000,
    overlap: int = 150,
    encoding_name: str = "cl100k_base"
):
    """Split ``text`` into overlapping chunks measured in tokens.

    The text is tokenized with the tiktoken encoding ``encoding_name``; a
    window of ``chunk_size`` tokens then slides over the token list, advancing
    ``chunk_size - overlap`` tokens per step, and each window is decoded back
    to a string.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.
        encoding_name: Name of the tiktoken encoding to use.

    Returns:
        List of decoded chunk strings, in document order. Empty when ``text``
        encodes to no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    # Validate before touching the tokenizer: overlap >= chunk_size would make
    # the window never advance (endless loop), and a negative overlap would
    # silently skip tokens between chunks.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    enc = tiktoken.get_encoding(encoding_name)
    tokens = enc.encode(text)

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        end = start + chunk_size
        chunks.append(enc.decode(tokens[start:end]))
        # Once a window reaches the end of the text, any further window would
        # contain only tokens already covered by this one, so stop here.
        if end >= len(tokens):
            break

    return chunks

def chunk_text_by_tokens(text, max_tokens=1000, encoding_name="cl100k_base"):
    """Split ``text`` into consecutive, non-overlapping chunks of tokens.

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        encoding_name: Name of the tiktoken encoding used to tokenize and
            decode. Defaults to the encoding used elsewhere in this notebook.

    Returns:
        List of decoded chunk strings, in document order. Empty when ``text``
        encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not a positive integer.
    """
    # A non-positive size would either fail inside range() or silently
    # produce no chunks at all; reject it explicitly.
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    # Resolve the encoder here instead of relying on a notebook-level `enc`
    # variable that is only created in a later cell.
    # NOTE(review): tiktoken is understood to cache encodings by name, so this
    # lookup should be cheap on repeated calls — confirm against its docs.
    encoder = tiktoken.get_encoding(encoding_name)
    tokens = encoder.encode(text)

    return [
        encoder.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

## 0.2 Loading Data

In [None]:
# Read the raw training CSV into a DataFrame and preview its first rows.
df_raw = pd.read_csv(filepath_or_buffer="../data/raw/train.csv")

df_raw.head(n=5)

# 1.0 DESCRIÇÃO DOS DADOS

In [None]:
# Work on an independent copy so the raw frame stays untouched.
df1 = df_raw.copy(deep=True)

In [None]:
# Show the column labels of the working copy.
df1.columns

## 1.1 Data Dimensions

In [None]:
# Report the number of rows and columns of df1.
n_rows, n_cols = df1.shape
print(f'Numero de Linhas: {n_rows}')
print(f'Numero de Colunas: {n_cols}')

## 1.2 Data Types

In [None]:
# Print the pandas summary of df1 (DataFrame.info).
df1.info()

## 1.3 Check NA

In [None]:
# Count missing values per column.
df1.isna().sum()

# 2.0 ANÁLISE EXPLORATÓRIA DE DADOS

In [None]:
# Start the exploratory stage from an independent copy of df1.
df2 = df1.copy(deep=True)

## 2.1 Análise do Texto

In [None]:
# Show the 'text' value of the first row (positional lookup).
df2['text'].iloc[0]

In [None]:
# Show the 'text' value of the second row (positional lookup).
df2['text'].iloc[1]

In [None]:
# Number of characters in each row's 'text', stored as 'text_length'.
df2['text_length'] = df2['text'].str.len()

# Summary statistics of the character counts.
df2['text_length'].describe()

In [None]:
# Number of words per row: count the non-overlapping matches of \w+ directly
# rather than building the list of matches and measuring it.
df2['word_count'] = df2['text'].str.count(r'\w+')

# Summary statistics of the word counts.
df2['word_count'].describe()

# 3.0 LIMPEZA DOS DADOS

In [None]:
# Start the cleaning stage from an independent copy of df2.
df3 = df2.copy(deep=True)

In [None]:
## Invisible characters and typographic punctuation
# These are handled in a single pass BEFORE whitespace is collapsed. U+FEFF
# is not matched by \s, so removing it after the collapse/strip step could
# leave double spaces ("a \ufeff b" -> "a  b") or a leading/trailing space.
# str.translate does literal per-character mapping, so the result does not
# depend on the pandas-version-specific default of `regex` in str.replace.
# NOTE(review): the left single quote (U+2018) and the em dash (U+2014) are
# not mapped, mirroring the original cell — confirm whether they should be.
_CHAR_FIXES = str.maketrans({
    '\u00a0': ' ',   # non-breaking space -> regular space
    '\ufeff': None,  # BOM / zero-width no-break space -> removed
    '“': '"',
    '”': '"',
    '’': "'",
    '–': '-',
})
df3['text'] = df3['text'].str.translate(_CHAR_FIXES)

## Extra spaces and line breaks
# Collapse every whitespace run into a single space and trim both ends.
df3['text'] = (
    df3['text'].str.replace(r'\s+', ' ', regex=True)
               .str.strip()
)
df3.head()

# 4.0 TOKENS

In [None]:
# Start the tokenization stage from an independent copy of df3.
df4 = df3.copy(deep=True)

In [None]:
# Notebook-level encoder; later cells reuse `enc` to count tokens.
enc = tiktoken.get_encoding("cl100k_base")

# Token count of each row's text, stored as 'n_tokens'.
df4['n_tokens'] = df4['text'].map(lambda text: len(enc.encode(text)))

# Summary statistics of the per-row token counts.
df4['n_tokens'].describe()

In [None]:
# Take the text of the row whose index label is 1 as a sample.
sample_text = df4.loc[1, 'text']

# Split it with chunk_text using that function's default arguments.
chunks = chunk_text(sample_text)

# Number of chunks produced for the sample.
len(chunks)

In [None]:
# First 500 characters of the first chunk.
chunks[0][:500]

In [None]:
# First 500 characters of the last chunk.
chunks[-1][:500]

In [None]:
# Token counts of the first five chunks, obtained by re-encoding each with `enc`.
[len(enc.encode(c)) for c in chunks[:5]]

In [None]:
# Build one record per (article, chunk): every text in df4 is split into
# consecutive chunks of at most 1000 tokens, and each chunk is stored with
# the index label of its source row and its position within that article.
chunks_data = []

# Only the 'text' column is needed, so iterate over it directly instead of
# materialising a full row Series for every article.
for article_id, text in df4['text'].items():
    # The loop variable is named `chunk` rather than `chunk_text` so that it
    # does not overwrite the chunk_text() helper in the notebook namespace.
    for chunk_id, chunk in enumerate(chunk_text_by_tokens(text, max_tokens=1000)):
        chunks_data.append({
            'article_id': article_id,
            'chunk_id': chunk_id,
            'chunk_text': chunk,
            # Re-encode the decoded chunk to record its token count.
            'n_tokens': len(enc.encode(chunk)),
        })

# Explicit columns keep the expected schema even when no chunk was produced.
df_chunks = pd.DataFrame(
    chunks_data,
    columns=['article_id', 'chunk_id', 'chunk_text', 'n_tokens'],
)

In [None]:
# Preview the first rows of the chunk table.
df_chunks.head()

In [None]:
# Summary statistics of the token count per chunk.
df_chunks['n_tokens'].describe()

In [None]:
# Summary statistics of the number of chunks per article_id.
df_chunks.groupby('article_id').size().describe()

In [None]:
from pathlib import Path

# Persist the chunk table. The path is anchored to the same "../data" root
# that the raw CSV is read from, so input and output share one data tree.
# NOTE(review): assumes the notebook runs from a directory that is a sibling
# of data/ — confirm the intended working directory.
output_path = Path("../data/processed/articles_chunks.parquet")

# Create the target directory if it is missing, so the write does not fail
# on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

df_chunks.to_parquet(
    output_path,
    index=False
)