In [1]:
from ntscraper import Nitter
import requests
import json
import random

# API endpoints
login_url = 'http://localhost:3000/auth/login'
url = 'http://localhost:3000/usuarios'

# Seconds to wait for the API before giving up, so the cell cannot hang forever
REQUEST_TIMEOUT_S = 30

# User credentials
# NOTE(review): credentials are hardcoded; consider reading them from the
# environment (or getpass) before sharing this notebook.
credentials = {
    'usuario': 'admin',
    'senha': 'admin'
}

# The result lists are created up front so that later cells can always
# reference them; they simply stay empty when the login or the GET below fails.
perfis = []
ids_usuarios = []
nomes_perfis = []

# POST the credentials to obtain the JWT token
login_response = requests.post(login_url, json=credentials, timeout=REQUEST_TIMEOUT_S)

if login_response.status_code == 200:
    token = login_response.json()['token']

    # Send the JWT token in the Authorization header
    headers = {
        'Authorization': "Bearer " + token
    }

    # GET the protected endpoint
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_S)

    if response.status_code == 200:
        # The loop below expects the JSON body to be a list of user objects
        data = response.json()

        # One entry per (user, profile) pair; the three lists stay index-aligned
        for usuario in data:
            # "or []" also covers a user whose 'perfis' field is an explicit null
            for perfil in usuario.get('perfis') or []:
                ids_usuarios.append(usuario['_id'])
                nomes_perfis.append(perfil['usuario'])
                perfis.append(perfil['usuario'])

        # Show what was collected
        print("IDs de Usuários:", ids_usuarios)
        print("Nomes de Perfis:", nomes_perfis)
        print("Perfis:", perfis)
    else:
        print("Erro ao acessar a API. Código de status:", response.status_code)
else:
    print("Erro ao fazer login. Código de status:", login_response.status_code)

IDs de Usuários: ['66772a2c3e11c4f746ec936b', '66772a2c3e11c4f746ec936b', '66772a2c3e11c4f746ec936b', '66772a2c3e11c4f746ec936b', '66772a2c3e11c4f746ec936b', '6678c37f5285516dbef87718']
Nomes de Perfis: ['Germancanoofi', 'jhonariasa', 'fredgol9', 'phganso', 'tsilva3', 'vigiamenteteste']
Perfis: ['Germancanoofi', 'jhonariasa', 'fredgol9', 'phganso', 'tsilva3', 'vigiamenteteste']


In [29]:
from pprint import pprint
import requests
import json
import random
import ntscraper
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Download the NLTK resources needed by the tokenizer, stop-word list and lemmatizer
for _nltk_resource in ('omw-1.4', 'punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Load the labelled tweet dataset that the classifier is trained on
data = pd.read_excel('Expanded_Dataset_Tweets.xlsx')

def preprocess_text(text):
    """Normalize a tweet into a space-joined string of cleaned tokens.

    Steps: lowercase, delete everything that is not a letter or whitespace,
    tokenize, drop Portuguese stop words, lemmatize, and join with spaces.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN coming from an empty
            spreadsheet cell) are treated as empty text.

    Returns:
        str: The processed tokens joined by single spaces ('' when nothing is left).
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Delete punctuation, digits and underscores but keep every Unicode letter.
    # An ASCII-only class such as [^a-zA-Z\s] would strip accented letters
    # ("não" -> "no"), corrupting words and preventing accented stop words
    # from ever matching.
    text = re.sub(r'[^\w\s]|[\d_]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('portuguese'))
    tokens = [word for word in tokens if word not in stop_words]
    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it
    # probably leaves most Portuguese tokens unchanged — confirm whether a
    # Portuguese stemmer/lemmatizer is wanted here.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    processed_text = ' '.join(tokens)
    return processed_text

# Clean every tweet, then split into train/test sets (fixed seed so the split is reproducible)
data['processed_tweets'] = data['tweets'].apply(preprocess_text)
X_train, X_test, y_train, y_test = train_test_split(
    data['processed_tweets'],
    data['classificador'],
    test_size=0.55,
    random_state=42,
)

# TF-IDF features feeding a linear SVM classifier
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Hyper-parameter grid; keys use the "<step name>__<parameter>" form
param_grid = dict(
    tfidf__max_df=[0.8, 0.9, 1.0],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    clf__C=[0.1, 1, 10],
)

# 5-fold cross-validated search scored by accuracy; keep the best fitted pipeline
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_pipeline = grid_search.best_estimator_

# Scraping: Nitter instance URLs that the scraping helper below picks from
instances = [
    "https://nitter.lucabased.xyz",
    "https://nitter.privacydev.net"
]

def get_tweets_with_instance(scraper, username, number, instances):
    """Fetch tweets for ``username``, trying the given instances in random order.

    Each instance is tried at most once. An instance is discarded when the
    scraper raises or returns a result without any tweets.

    Args:
        scraper: Object exposing ``get_tweets(username, mode=, number=, instance=)``.
        username: Profile name to fetch tweets for.
        number: Number of tweets to request from the scraper.
        instances: Iterable of instance URLs to try. It is NOT modified; the
            function works on its own copy.

    Returns:
        The first scraper result whose ``'tweets'`` entry is non-empty.

    Raises:
        Exception: If no instance yields tweets (including an empty ``instances``).
    """
    # Private copy so failed instances can be dropped without touching the caller's list
    remaining = list(instances)
    while remaining:
        # Pick a random untried instance and remove it so it is never retried
        instance = remaining.pop(random.randrange(len(remaining)))
        try:
            tweets = scraper.get_tweets(username, mode="user", number=number, instance=instance)
            if tweets and tweets.get('tweets'):
                return tweets
            print(f"Nenhum tweet encontrado usando a instância {instance}")
        except Exception as e:
            print(f"Erro ao usar a instância {instance}: {e}")
    raise Exception("Não foi possível obter tweets de nenhuma instância.")

scraper = ntscraper.Nitter()

# API authentication
# NOTE(review): credentials are hardcoded; consider reading them from the
# environment (or getpass) before sharing this notebook.
login_url = 'http://localhost:3000/auth/login'
credentials = {
    'usuario': 'admin',
    'senha': 'admin'
}

# The timeout keeps the cell from hanging forever if the API is unreachable
login_response = requests.post(login_url, json=credentials, timeout=30)
# Stop here with a clear HTTP error on a failed login instead of an obscure
# KeyError/JSON error when reading the token below
login_response.raise_for_status()
token = login_response.json()['token']
headers = {
    'Content-Type': 'application/json',
    'Authorization': "Bearer " + token
}

# Base URL of the API
api_url = "http://localhost:3000"
# Função principal
def process_and_send_tweets(ids_usuarios, nomes_perfis, tweets_por_perfil=1, timeout=30):
    """Scrape, classify and upload tweets for each (user id, profile name) pair.

    For every pair, tweets are fetched with ``get_tweets_with_instance``,
    classified with ``best_pipeline`` and POSTed one by one to
    ``{api_url}/tweets``. Progress and errors are printed; a profile whose
    tweets cannot be fetched, or a tweet whose POST fails, is skipped.

    Args:
        ids_usuarios: User ids, index-aligned with ``nomes_perfis``.
        nomes_perfis: Profile names to scrape.
        tweets_por_perfil: Number of tweets requested per profile (default 1).
        timeout: Seconds to wait for each POST to the API (default 30).

    Returns:
        None.
    """
    for usuarioId, nomePerfil in zip(ids_usuarios, nomes_perfis):
        try:
            # A copy is passed so the module-level instance list is never altered
            tweets = get_tweets_with_instance(scraper, nomePerfil, tweets_por_perfil, instances.copy())
        except Exception as e:
            print(f"Erro ao obter tweets para o perfil {nomePerfil}: {e}")
            continue

        # Phase 1: classify every fetched tweet and build the API payloads
        tweets_processados = []
        for tweet in tweets['tweets']:
            tweet_text = tweet['text']
            processed_text = preprocess_text(tweet_text)
            predicted_label = best_pipeline.predict([processed_text])[0]
            # NOTE(review): the label literal (with two spaces) presumably
            # mirrors the dataset's label exactly — confirm before editing it.
            # bool() guarantees a plain Python bool so json.dumps never
            # receives a numpy boolean.
            is_suicida = bool(predicted_label == 'Post  com Potencial Suicida')

            tweets_processados.append({
                'link': tweet['link'],
                'texto': tweet_text,
                'usuarioId': usuarioId,
                'nomePerfil': nomePerfil,
                'isSuicida': is_suicida
            })

        # Phase 2: upload each payload
        for tweet in tweets_processados:
            dados = json.dumps(tweet)
            try:
                response = requests.post(f"{api_url}/tweets", data=dados, headers=headers, timeout=timeout)
            except requests.RequestException as e:
                # A network failure on one tweet should not abort the whole batch
                print("Erro ao inserir tweet:", e)
                continue
            if response.status_code == 201:
                print("Tweet inserido com sucesso!")
                id_inserido = response.json()['tweet']['_id']
                print("ID do tweet inserido:", id_inserido)
            else:
                print("Erro ao inserir tweet:", response.text)

# Run the process for the user ids / profile names collected in the first cell
process_and_send_tweets(ids_usuarios, nomes_perfis)


23-Jun-24 17:20:19 - NumExpr defaulting to 8 threads.


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Testing instances:  92%|██████████████████████████████████████████████████████████     | 71/77 [02:12<00:08,  1.45s/it]

23-Jun-24 17:22:37 - Certificate did not match expected hostname: nt.ggtyler.dev. Certificate: {'subject': ((('commonName', '4g.ggtyler.dev'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', 'R3'),)), 'version': 3, 'serialNumber': '03A4459CE6DA31CFF555DB78FF3F35BFEA88', 'notBefore': 'May 14 10:55:44 2024 GMT', 'notAfter': 'Aug 12 10:55:43 2024 GMT', 'subjectAltName': (('DNS', '4g.ggtyler.dev'),), 'OCSP': ('http://r3.o.lencr.org',), 'caIssuers': ('http://r3.i.lencr.org/',)}


Testing instances:  94%|██████████████████████████████████████████████████████████▉    | 72/77 [02:12<00:05,  1.20s/it]

23-Jun-24 17:22:38 - Certificate did not match expected hostname: nitter.uni-sonia.com. Certificate: {'subject': ((('commonName', '*.xserver.jp'),),), 'issuer': ((('countryName', 'JP'),), (('organizationName', 'CloudSecure Corporation'),), (('commonName', 'CloudSecure RSA Domain Validation Secure Server CA 2'),)), 'version': 3, 'serialNumber': 'ACA67AD2030638EE2DCE8E845B8299A6', 'notBefore': 'Mar 11 00:00:00 2024 GMT', 'notAfter': 'Apr 11 23:59:59 2025 GMT', 'subjectAltName': (('DNS', '*.xserver.jp'), ('DNS', 'xserver.jp')), 'OCSP': ('http://ocsp.sectigo.com',), 'caIssuers': ('http://crt.sectigo.com/CloudSecureRSADomainValidationSecureServerCA2.crt',)}


Testing instances:  99%|██████████████████████████████████████████████████████████████▏| 76/77 [02:18<00:01,  1.45s/it]

23-Jun-24 17:22:45 - Certificate did not match expected hostname: nitter.tinfoil-hat.net. Certificate: {'subject': ((('commonName', 'jelly.tinfoil-hat.de'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', 'R3'),)), 'version': 3, 'serialNumber': '044FDE3E7089FB997C3D8AFDE2412CE51554', 'notBefore': 'May 15 09:29:23 2024 GMT', 'notAfter': 'Aug 13 09:29:22 2024 GMT', 'subjectAltName': (('DNS', 'jelly.tinfoil-hat.de'),), 'OCSP': ('http://r3.o.lencr.org',), 'caIssuers': ('http://r3.i.lencr.org/',)}


Testing instances: 100%|███████████████████████████████████████████████████████████████| 77/77 [02:20<00:00,  1.83s/it]


23-Jun-24 17:22:53 - Current stats for Germancanoofi: 1 tweets, 0 threads...
Tweet inserido com sucesso!
ID do tweet inserido: 6678841d479ed5327c41099b
23-Jun-24 17:22:55 - Fetching error: Instance has been rate limited.Use another instance or try again later.
Nenhum tweet encontrado usando a instância https://nitter.lucabased.xyz
23-Jun-24 17:23:04 - Current stats for jhonariasa: 1 tweets, 0 threads...
Erro ao inserir tweet: {"message":"O tweet não foi cadastrado porque o link já existe"}
23-Jun-24 17:23:05 - Fetching error: Instance has been rate limited.Use another instance or try again later.
Nenhum tweet encontrado usando a instância https://nitter.lucabased.xyz
23-Jun-24 17:23:14 - Current stats for fredgol9: 1 tweets, 0 threads...
Erro ao inserir tweet: {"message":"O tweet não foi cadastrado porque o link já existe"}
23-Jun-24 17:23:17 - Fetching error: Instance has been rate limited.Use another instance or try again later.
Nenhum tweet encontrado usando a instância https://nitt