In [1]:
from mastodon import Mastodon, StreamListener
from kafka import KafkaProducer, KafkaAdminClient
from kafka.admin import NewTopic
import json
import time
import random
from datetime import datetime
import os

# Mastodon API client; the access token is read from the MASTODON_KEY
# environment variable.
mastodon = Mastodon(
    api_base_url='https://mastodon.social',  # replace with your own instance URL if needed
    access_token=os.getenv('MASTODON_KEY'),
)

# Kafka admin client, used below only to list and create topics.
# NOTE(review): this client targets 'host.docker.internal:9092' while the
# producer defined further down targets 'kafka:9092' -- confirm both addresses
# reach the same broker.
admin_client = KafkaAdminClient(
    bootstrap_servers='host.docker.internal:9092',
    client_id='my_client',
)

# Settings of the topic that receives the streamed statuses.
topic_name = "mastodon_stream"
num_partitions = 1
replication_factor = 1

# Create the topic unless the broker already lists it.
existing_topics = admin_client.list_topics()
if topic_name in existing_topics:
    print(f"Topic '{topic_name}' already exists.")
else:
    topic = NewTopic(
        name=topic_name,
        num_partitions=num_partitions,
        replication_factor=replication_factor,
    )
    admin_client.create_topics(new_topics=[topic], validate_only=False)
    print(f"Topic '{topic_name}' created.")

def json_serializer(obj):
    """Return a JSON-compatible representation for date/time objects.

    Meant to be passed as the ``default=`` hook of ``json.dumps``, which calls
    it for objects it cannot encode on its own.

    Args:
        obj: The object that could not be serialized natively.

    Returns:
        The ISO 8601 string produced by ``obj.isoformat()`` when ``obj`` is a
        ``datetime``, ``date`` or ``time`` instance.

    Raises:
        TypeError: If ``obj`` is of any other type; the message names the
            offending type to make the failing field easier to find.
    """
    # Imported locally under an alias: at module level this file binds the
    # bare names ``datetime`` (the class) and ``time`` (the module).
    import datetime as _dt

    # datetime is a subclass of date, so this check also covers datetimes.
    if isinstance(obj, (_dt.date, _dt.time)):
        return obj.isoformat()
    raise TypeError(f"Type non sérialisable: {type(obj).__name__}")

# Kafka producer used to publish the statuses.
# NOTE(review): the admin client earlier in this file connects to
# 'host.docker.internal:9092' whereas this producer uses 'kafka:9092' --
# confirm both addresses reach the same broker.
def _serialize_value(value):
    """Encode a message value as UTF-8 JSON, using json_serializer as fallback."""
    return json.dumps(value, default=json_serializer).encode('utf-8')


producer = KafkaProducer(
    bootstrap_servers='kafka:9092',
    value_serializer=_serialize_value,
)

class MyListener(StreamListener):
    """Stream listener that forwards every received status to a Kafka topic.

    Each status passed to ``on_update`` is reduced to a flat dictionary (see
    ``_build_toot``), printed, and sent with ``producer.send``. A counter
    pauses the handler once ``request_limit`` statuses have been forwarded
    within ``window_time`` seconds.
    """

    def __init__(self, producer, topic_name, keywords, request_limit=300, window_time=300):
        """Store the Kafka target and the throttling settings.

        Args:
            producer: Object exposing ``send(topic, value)``.
            topic_name: Name of the topic the statuses are sent to.
            keywords: Keyword list kept on the instance; ``on_update`` does
                not filter on it.
            request_limit: Number of forwarded statuses after which the
                handler pauses.
            window_time: Length of the throttling window, in seconds.
        """
        super().__init__()
        self.producer = producer
        self.topic_name = topic_name
        self.keywords = keywords
        self.request_count = 0
        self.start_time = time.time()
        self.request_limit = request_limit
        self.window_time = window_time

    def _throttle(self):
        """Pause when the per-window limit is reached, then start a new window."""
        if self.request_count < self.request_limit:
            return
        elapsed_time = time.time() - self.start_time
        if elapsed_time < self.window_time:
            # Block until the current window has fully elapsed.
            time.sleep(self.window_time - elapsed_time)
        # The counter and window start are only reset here, i.e. once the
        # limit has been reached.
        self.request_count = 0
        self.start_time = time.time()

    @staticmethod
    def _count_or_random(status, field):
        """Return the counter ``field`` of ``status``, or a random stand-in.

        A missing or null counter is treated as 0. As in the original code,
        any non-positive counter is replaced by ``random.randint(1, 100)``.
        NOTE(review): this means zero counts are never forwarded as such --
        confirm that fabricated values are intended downstream.
        """
        count = status.get(field) or 0
        return count if count > 0 else random.randint(1, 100)

    def _build_toot(self, status):
        """Build the flat dictionary that is sent to Kafka for ``status``."""
        # Treat a missing or null list the same as an empty one so that a
        # single incomplete status does not abort the stream handler.
        tags = status.get('tags') or []
        attachments = status.get('media_attachments') or []

        return {
            'id': status['id'],
            'created_at': str(status['created_at']),
            'content': status['content'],
            'username': status['account']['username'],
            'replies_count': self._count_or_random(status, 'replies_count'),
            'reblogs_count': self._count_or_random(status, 'reblogs_count'),
            'favourites_count': self._count_or_random(status, 'favourites_count'),
            # Hashtag names are joined into one comma-separated string.
            'hashtags': ', '.join(tag['name'] for tag in tags),
            'language': status.get('language'),
            'url': status['url'],
            'media_attachments': [
                {
                    'url': media['url'],
                    'preview_url': media['preview_url'],
                    'description': media.get('description'),
                }
                for media in attachments
            ],
        }

    def on_update(self, status):
        """Forward one status to the Kafka topic, without any filtering.

        Args:
            status: Dict-like status; must provide ``id``, ``created_at``,
                ``content``, ``account['username']`` and ``url``.
        """
        self._throttle()
        toot = self._build_toot(status)

        print(f"Sending toot to Kafka: {toot}")
        self.producer.send(self.topic_name, toot)
        self.request_count += 1

# Optional keyword list handed to the listener. The listener stores it, but
# on_update does not filter on it, so every received status is forwarded.
keywords = []  # add keywords here if needed
listener = MyListener(producer, topic_name, keywords)

# Stream public statuses in real time. The call runs until the stream ends or
# is interrupted (e.g. by KeyboardInterrupt).
try:
    mastodon.stream_public(listener)
finally:
    # producer.send() only queues records; flush so that whatever is still
    # buffered is handed to the broker before control leaves this cell. The
    # original exception, if any, still propagates afterwards.
    producer.flush()

Topic 'mastodon_stream' created.
Sending toot to Kafka: {'id': 113273969270337460, 'created_at': '2024-10-08 21:24:51+00:00', 'content': '<p>I feel like tech writers get to a point of disconnection with their audience. They will try and relate by using what they call a “cheaper” or a “of-the-people” model of a device. But then they turn around and talk about a lack of features for a niche gadget that only few can afford.</p>', 'username': 'techuisite', 'replies_count': 68, 'reblogs_count': 15, 'favourites_count': 13, 'hashtags': '', 'language': 'en', 'url': 'https://www.threads.net/@techuisite/post/DA4M4y-ovRp', 'media_attachments': []}
Sending toot to Kafka: {'id': 113273969272226616, 'created_at': '2024-10-08 21:40:02+00:00', 'content': '<p>📖 🇬🇧 Others: Empire of Lie - Page 23 <a href="https://www.canterlotcomics.com/chap/en/others_empire_of_lie/others_empire_of_lie-9078#Page97965" rel="nofollow noopener noreferrer" translate="no" target="_blank"><span class="invisible">https://www.<


KeyboardInterrupt

