In [5]:
# Working Reservoir Sampling

from pyflink.common.serialization import DeserializationSchema, SimpleStringSchema
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer, FlinkKafkaProducer

import json
import random

class JsonDeserializationSchema(DeserializationSchema):
    def deserialize(self, message: bytes):
        try:
            json_data = json.loads(message.decode('utf-8'))
            return True, json_data
        except json.JSONDecodeError:
            return False, None

    def get_produced_type(self):
        return Types.PICKLED_BYTE_ARRAY()

def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.add_jars("file:///Users/spartan/Downloads/flink-sql-connector-kafka-1.17.2.jar")

    kafka_consumer = FlinkKafkaConsumer(
        topics='Big_Data_Project',
        deserialization_schema=SimpleStringSchema(),
        properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'Big_Data_Project_1'}
    )

    data_stream = env.add_source(kafka_consumer)

    def json_parse_map(value):
        json_data = json.loads(value)
        return json_data

    def reservoir_sampling(data, sample_size=15):
        sample = []
        for index, item in enumerate(data):
            if index < sample_size:
                sample.append(item)
            else:
                r = random.randint(0, index)
                if r < sample_size:
                    sample[r] = item
        return sample

    parsed_stream = data_stream.map(json_parse_map)
    sampled_stream = parsed_stream.map(lambda x: reservoir_sampling([x]))
    sampled_stream.print()
    #kafka_producer = FlinkKafkaProducer(
    #    topic='per_test',
    #    serialization_schema=SimpleStringSchema(),
    #    producer_config={'bootstrap.servers': 'localhost:9092'}
    #)
    print(sampled_stream)
    # sampled_stream.add_sink(kafka_producer)

    env.execute("Reservoir Sampling with Kafka and PyFlink")

if __name__ == "__main__":
    main()


<pyflink.datastream.data_stream.DataStream object at 0x1053d9e50>
[{'articleID': '02ae988b188249118a87bd721d2ac597', 'title': 'An energy salesperson came to my door. Will they actually save me money?', 'description': "Here's what to know before you choose whether to engage with a salesperson.", 'content': "Door-to-door energy salespeople may pitch you steep savings on your monthly bills. Experts say it's best to be wary. Read more\n\nChristopher Reynolds was putting on his shoes inside his Conshohocken ho... [5930 symbols]", 'publishedDate': '2024-04-20T09:00:00+00:00', 'url': 'https://www.inquirer.com/business/energy/third-party-energy-sales-soliciting-20240420.html'}]
[{'articleID': '3bbfd4646977400db17f1b3df7390743', 'title': 'Community efforts to fund teachers, students growing in Coastal Bend', 'description': 'Local education foundations formed to support school districts are popping up in the Coastal Bend, following example of existing groups.', 'content': 'Community dignitaries 

KeyboardInterrupt: 

In [9]:
# Working Min-Hash Algorithm

from pyflink.datastream import StreamExecutionEnvironment
from pyflink.common.serialization import SimpleStringSchema
import json
import random
import hashlib

def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.add_jars("file:///Users/spartan/Downloads/flink-sql-connector-kafka-1.17.2.jar")

    kafka_consumer = FlinkKafkaConsumer(
        topics='Big_Data_Project_LSH',
        deserialization_schema=SimpleStringSchema(),
        properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'Big_Data_Project_LSH_1'}
    )

    data_stream = env.add_source(kafka_consumer)

    def json_parse_map(value):
        try:
            json_data = json.loads(value)
            return json_data['title'], json_data['content']
        except Exception as e:
            print(f"Failed to parse JSON: {e}")
            return None
    def get_min_hash(text, num_hashes):
        min_hashes = [float('inf')] * num_hashes
        for word in set(text.split()):
            for i in range(num_hashes):
                hash_val = int(hashlib.sha256(f"{word}_{i}".encode()).hexdigest(), 16)
                min_hashes[i] = min(min_hashes[i], hash_val)
        return min_hashes
    
    def lsh_bucketing(title, description, num_hashes, num_bands):
        min_hashes = get_min_hash(description, num_hashes)
        buckets = []
        band_size = len(min_hashes) // num_bands
        for i in range(num_bands):
            band = min_hashes[i * band_size:(i + 1) * band_size]
            bucket_hash = hashlib.sha256(str(band).encode()).hexdigest()
            buckets.append((title, bucket_hash))  # Pair the title with the bucket hash
        return buckets

    parsed_stream = data_stream.map(json_parse_map)
    bucketed_stream = parsed_stream.flat_map(lambda x: lsh_bucketing(x[0], x[1], 10, 1))

    bucketed_stream.print()

    env.execute("Locality-Sensitive Hashing with Kafka and PyFlink")

if __name__ == "__main__":
    main()


('An energy salesperson came to my door. Will they actually save me money?', '076dbc4ed4904998ca6114afe2472cd388358751eb2823ed4b5726e41d407582')
('An energy salesperson came to my door. Will they actually save me money?', 'c56d0d619a5ab74c2e76c1b02ae480f6b5759a2009429161403994bcdcad63e5')
('An energy salesperson came to my door. Will they actually save me money?', '331d013660571dced150cf5eeb5f0994fab161f8915a45a629ad33098baabf11')
('An energy salesperson came to my door. Will they actually save me money?', '1c81974ed17a2b20ce56869b4aa57120f17ed81eabe14573d513ac01916f96d2')
('An energy salesperson came to my door. Will they actually save me money?', 'fb743d899cefe88f3e0c474dbd199a29264d1ac5e05c0e14e8e0553d518c3ee3')
('An energy salesperson came to my door. Will they actually save me money?', '245e682a6be5086b5a19ac3cebc2123f515112c5cb8260ded68bec0b5a327a6b')
('An energy salesperson came to my door. Will they actually save me money?', 'd960c6600d42a7a2b80c8142503bde68dc72460206744ff30a3b

KeyboardInterrupt: 