In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datetime import datetime
import time
import gc
import os
import json
from tqdm import tqdm
import psutil

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
print("=== VERIFICAÇÃO DE RECURSOS COM PYTHON ===")

# Verificar se PyTorch vê as GPUs
print(f"GPUs disponíveis: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"  Memória: {torch.cuda.get_device_properties(i).total_memory / 1e9:.1f} GB")

# Verificar CPU
print(f"\nNúcleos CPU: {os.cpu_count()}")

# Verificar memória
import psutil
print(f"Memória RAM Total: {psutil.virtual_memory().total / 1e9:.1f} GB")
print(f"Memória RAM Disponível: {psutil.virtual_memory().available / 1e9:.1f} GB")

=== VERIFICAÇÃO DE RECURSOS COM PYTHON ===
GPUs disponíveis: 3
GPU 0: NVIDIA L40S
  Memória: 47.7 GB
GPU 1: NVIDIA L40S
  Memória: 47.7 GB
GPU 2: NVIDIA L40S
  Memória: 47.7 GB

Núcleos CPU: 64
Memória RAM Total: 540.3 GB
Memória RAM Disponível: 496.4 GB


In [2]:
# Execute este código em uma máquina COM internet
from transformers import AutoTokenizer, AutoModel
import torch

model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # ou use o modelo 7B se preferir
token = "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"

# Baixe o tokenizer e modelo
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
model = AutoModel.from_pretrained(model_id, token=token, torch_dtype=torch.bfloat16)

print("✅ Download completo! Arquivos salvos em cache.")

KeyboardInterrupt: 

In [2]:
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# ✅ Carrega tokenizer em modo offline
tokenizer = AutoTokenizer.from_pretrained(
    model_id, 
    local_files_only=True
)

# ✅ Carrega modelo em modo offline
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    local_files_only=True
)

topic = "artificial intelligence in education"
n_tweets = 5000
batch_size = 5
save_every = 20
csv_filename = f"Authorship-attribution/df_pronto/tweets_ai_education_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
checkpoint_file = f"checkpoint_{timestamp}.json"

print("✅ Modelo carregado com sucesso em modo offline!")

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:13<00:00,  1.38it/s]
Some parameters are on the meta device because they were offloaded to the cpu.


✅ Modelo carregado com sucesso em modo offline!


In [3]:
def load_checkpoint():
    """Carrega progresso anterior se existir"""
    if os.path.exists(checkpoint_file):
        try:
            with open(checkpoint_file, 'r') as f:
                data = json.load(f)
            print(f"📖 Checkpoint carregado: {data['generated']} tweets")
            return data['all_tweets'], data['generated']
        except:
            pass
    return [], 0

def save_checkpoint(all_tweets, generated_count):
    """Salva progresso atual"""
    checkpoint_data = {
        'all_tweets': all_tweets,
        'generated': generated_count,
        'last_update': datetime.now().isoformat()
    }
    with open(checkpoint_file, 'w') as f:
        json.dump(checkpoint_data, f)

def save_to_csv(all_tweets, topic, final=False):
    """Salva tweets em CSV"""
    if not all_tweets:
        return
    
    df = pd.DataFrame({
        'tweet_id': range(1, len(all_tweets) + 1),
        'text': all_tweets,
        'topic': topic,
        'length': [len(tweet) for tweet in all_tweets],
        'batch_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'characters_remaining': [280 - len(tweet) for tweet in all_tweets]
    })
    
    # Modo append se não for final
    mode = 'w' if final else 'a'
    header = final or not os.path.exists(csv_filename)
    
    df.to_csv(csv_filename, mode=mode, header=header, index=False, encoding='utf-8')

def generate_batch(tokenizer, model, num_tweets, topic):
    """Gera um lote de tweets"""
    try:
        # Prompt otimizado
        messages = [
            {"role": "user", "content": f"Generate {num_tweets} unique, engaging tweets about '{topic}'. Each must be under 280 characters, sound natural, and be separated by new lines. No numbering."}
        ]

        formatted_prompt = tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
        )

        # Tokenização
        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

        # Geração com parâmetros conservadores
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=800,  # ✅ Conservador
                do_sample=True,
                temperature=0.8,     # ✅ Mais determinístico
                top_p=0.85,
                repetition_penalty=1.2,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Processamento da resposta
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = generated_text.replace(formatted_prompt, "").strip()
        
        # Extrai tweets
        batch_tweets = []
        for tweet in response.split('\n'):
            tweet = tweet.strip()
            if tweet and len(tweet) > 15 and len(tweet) <= 280:  # Filtros rigorosos
                batch_tweets.append(tweet)
                
        return batch_tweets[:num_tweets]  # Garante não exceder
        
    except Exception as e:
        print(f"❌ Erro na geração: {e}")
        return []

In [5]:
all_tweets, already_generated = load_checkpoint()
print(f"📊 Progresso inicial: {already_generated}/{n_tweets} tweets")

# Calcula número de lotes necessários
n_batches = (n_tweets + batch_size - 1) // batch_size

# Barra de progresso
pbar = tqdm(total=n_tweets, initial=already_generated, desc="Gerando tweets", unit="tweet")

start_time = time.time()
last_save_time = time.time()

try:
    for batch in range(n_batches):
        if len(all_tweets) >= n_tweets:
            break
            
        tweets_needed = min(batch_size, n_tweets - len(all_tweets))
        
        print(f"📦 Lote {batch + 1}/{n_batches}: Gerando {tweets_needed} tweets...")
        
        # Gera lote
        new_tweets = generate_batch(tokenizer, model, tweets_needed, topic)
        
        if new_tweets:
            all_tweets.extend(new_tweets)
            pbar.update(len(new_tweets))
            pbar.set_postfix({'Total': len(all_tweets), 'Lote': len(new_tweets)})
            
            # Salvamento periódico
            current_time = time.time()
            if (len(all_tweets) % save_every == 0 or 
                current_time - last_save_time > 300):  # 5 minutos
                
                save_to_csv(all_tweets, topic)
                save_checkpoint(all_tweets, len(all_tweets))
                last_save_time = current_time
                print(f"💾 Checkpoint salvo: {len(all_tweets)} tweets")
        
        # Pausa estratégica
        time.sleep(2)
        
        # Limpeza de memória
        if len(all_tweets) % 20 == 0:
            gc.collect()
            torch.cuda.empty_cache()

except KeyboardInterrupt:
    print("\n⏹️  Interrompido pelo usuário")
except Exception as e:
    print(f"❌ Erro durante a geração: {e}")

finally:
    # Finalização
    pbar.close()
    
    # Limita ao número exato desejado
    all_tweets = all_tweets[:n_tweets]
    
    # Salvamento final
    save_to_csv(all_tweets,topic, final=True)
    save_checkpoint(all_tweets, len(all_tweets))
    
    # Estatísticas
    total_time = time.time() - start_time
    tweets_per_hour = (len(all_tweets) / total_time) * 3600 if total_time > 0 else 0
    
    print("\n" + "=" * 60)
    print("📊 RELATÓRIO FINAL")
    print("=" * 60)
    print(f"✅ Tweets gerados: {len(all_tweets)}/{n_tweets}")
    print(f"⏰ Tempo decorrido: {total_time/60:.1f} minutos")
    print(f"📈 Velocidade: {tweets_per_hour:.1f} tweets/hora")
    print(f"💾 Arquivo salvo: {csv_filename}")
    
    if all_tweets:
        avg_length = sum(len(t) for t in all_tweets) / len(all_tweets)
        print(f"📏 Comprimento médio: {avg_length:.1f} caracteres")
        
        # Exemplo dos primeiros tweets
        print("\n🎯 Primeiros tweets gerados:")
        for i, tweet in enumerate(all_tweets[:3], 1):
            print(f"{i}. {tweet}")

# Exibe o DataFrame final
if all_tweets:
    df_tweets = pd.DataFrame({
        'tweet_id': range(1, len(all_tweets) + 1),
        'text': all_tweets,
        'topic': topic,
        'length': [len(tweet) for tweet in all_tweets]
    })
    
    print(f"\n📋 DataFrame final ({len(df_tweets)} tweets):")
    print(df_tweets.head())
else:
    print("❌ Nenhum tweet foi gerado")

📖 Checkpoint carregado: 16 tweets
📊 Progresso inicial: 16/5000 tweets


Gerando tweets:   0%|▍                                                                                                                        | 16/5000 [00:00<?, ?tweet/s]

📦 Lote 1/1000: Gerando 5 tweets...


Gerando tweets:   0%|▎                                                                                           | 20/5000 [03:02<62:59:31, 45.54s/tweet, Total=20, Lote=4]

💾 Checkpoint salvo: 20 tweets
📦 Lote 2/1000: Gerando 5 tweets...


Gerando tweets:   0%|▍                                                                                           | 24/5000 [05:53<60:50:43, 44.02s/tweet, Total=24, Lote=4]

📦 Lote 3/1000: Gerando 5 tweets...


Gerando tweets:   1%|▌                                                                                           | 28/5000 [09:04<63:05:19, 45.68s/tweet, Total=28, Lote=4]

💾 Checkpoint salvo: 28 tweets
📦 Lote 4/1000: Gerando 5 tweets...


Gerando tweets:   1%|▌                                                                                           | 32/5000 [11:49<60:34:15, 43.89s/tweet, Total=32, Lote=4]

📦 Lote 5/1000: Gerando 5 tweets...


Gerando tweets:   1%|▋                                                                                           | 36/5000 [15:25<65:33:50, 47.55s/tweet, Total=36, Lote=4]

💾 Checkpoint salvo: 36 tweets
📦 Lote 6/1000: Gerando 5 tweets...


Gerando tweets:   1%|▋                                                                                           | 36/5000 [18:46<77:40:44, 56.33s/tweet, Total=36, Lote=4]


⏹️  Interrompido pelo usuário

📊 RELATÓRIO FINAL
✅ Tweets gerados: 36/5000
⏰ Tempo decorrido: 18.8 minutos
📈 Velocidade: 115.0 tweets/hora
💾 Arquivo salvo: Authorship-attribution/df_pronto/tweets_ai_education_20250907_111642.csv
📏 Comprimento médio: 175.1 caracteres

🎯 Primeiros tweets gerados:
1. "Artificial Intelligence is no longer just science fiction; it's reshaping our schools as we know them. Personalized learning paths for every student? Yes, please! #FutureOfEducation"
2. "From intelligent tutoring systems to predictive analytics, artificial intelligence is empowering educators with tools to enhance teaching methods. The future looks bright! #AIinEdu"
3. "Imagine an assistant that can grade thousands of essays instantly, freeing up teachers' time for more meaningful interactions with students. That's the power of AI! #EdTech"

📋 DataFrame final (36 tweets):
   tweet_id                                               text  \
0         1  "Artificial Intelligence is no longer jus




In [2]:
import pandas as pd
mistral_df = pd.read_csv("~/Authorship-attribution/df_pronto/tweets_ai_education_20250824_154410.csv")


In [4]:
mistral_df.reset_index()

Unnamed: 0,index,tweet_id,text,topic,length,batch_timestamp,characters_remaining
0,0,1,"""Imagine an educator that's always available f...",artificial intelligence in education,156,2025-08-24 19:11:51,124
1,1,2,"""The fusion of human creativity and emotional ...",artificial intelligence in education,148,2025-08-24 19:11:51,132
2,2,3,"""Artificial Intelligence is no longer a thing ...",artificial intelligence in education,153,2025-08-24 19:11:51,127
3,3,4,"""From predictive analytics identifying at-risk...",artificial intelligence in education,241,2025-08-24 19:11:51,39
4,4,5,"""Artificial Intelligence is not just for sci-f...",artificial intelligence in education,164,2025-08-24 19:11:51,116
...,...,...,...,...,...,...,...
2296,26009,2316,"""No more one-size-fits-all teaching methods! W...",artificial intelligence in education,215,2025-08-24 19:41:04,65
2297,26010,2317,"""Artificial Intelligence is no longer just a s...",artificial intelligence in education,206,2025-08-24 19:41:04,74
2298,26011,2318,"""The rise of AI in classrooms means more inclu...",artificial intelligence in education,178,2025-08-24 19:41:04,102
2299,26012,2319,"""Imagine an exam corrector that doesn't just s...",artificial intelligence in education,187,2025-08-24 19:41:04,93


In [3]:
mistral_df = mistral_df.drop_duplicates(subset='text')
mistral_df

Unnamed: 0,tweet_id,text,topic,length,batch_timestamp,characters_remaining
0,1,"""Imagine an educator that's always available f...",artificial intelligence in education,156,2025-08-24 19:11:51,124
1,2,"""The fusion of human creativity and emotional ...",artificial intelligence in education,148,2025-08-24 19:11:51,132
2,3,"""Artificial Intelligence is no longer a thing ...",artificial intelligence in education,153,2025-08-24 19:11:51,127
3,4,"""From predictive analytics identifying at-risk...",artificial intelligence in education,241,2025-08-24 19:11:51,39
4,5,"""Artificial Intelligence is not just for sci-f...",artificial intelligence in education,164,2025-08-24 19:11:51,116
...,...,...,...,...,...,...
26009,2316,"""No more one-size-fits-all teaching methods! W...",artificial intelligence in education,215,2025-08-24 19:41:04,65
26010,2317,"""Artificial Intelligence is no longer just a s...",artificial intelligence in education,206,2025-08-24 19:41:04,74
26011,2318,"""The rise of AI in classrooms means more inclu...",artificial intelligence in education,178,2025-08-24 19:41:04,102
26012,2319,"""Imagine an exam corrector that doesn't just s...",artificial intelligence in education,187,2025-08-24 19:41:04,93


In [5]:
mistral_df.to_csv("~/Authorship-attribution/df_pronto/tweets_ai_education_20250824_154410.csv", index=False)

In [None]:
# 30 twwets -> 80 min 
# 100000 tweets -> 