In [1]:
import glob
import json
import os
from typing import List
import pandas as pd

from cassandra.cluster import Cluster


# CQL executed by drop_tables(). IF EXISTS keeps the very first run (when the
# table has not been created yet) from failing with an "unconfigured table"
# error, mirroring the IF NOT EXISTS used by the CREATE statement below.
table_drop = "DROP TABLE IF EXISTS events"

# CQL executed by create_tables(). Every column is stored as text. In the
# compound primary key, eventid is the partition key; username and filenumber
# are clustering columns.
table_create = """
    CREATE TABLE IF NOT EXISTS events
    (
        eventid text,
        username text,
        filenumber text,
        filename text,
        comment text,
        PRIMARY KEY (
            eventid,
            username,
            filenumber
        )
    )
"""

# Statement lists iterated by create_tables() / drop_tables().
create_table_queries = [
    table_create,
]
drop_table_queries = [
    table_drop,
]

def drop_tables(session):
    """
    Description: Execute every statement in drop_table_queries on the session.

    Best-effort: a statement that fails is reported on stdout and the
    remaining statements are still attempted.
    """
    for query in drop_table_queries:
        try:
            # The result set of a DROP carries no data, so it is not kept.
            session.execute(query)
        except Exception as e:
            print(e)


def create_tables(session):
    """
    Description: Execute every statement in create_table_queries on the session.

    A failing statement is printed to stdout and does not stop the loop.
    """
    for statement in create_table_queries:
        try:
            session.execute(statement)
        except Exception as err:
            print(err)


def get_files(filepath: str) -> List[str]:
    """
    Description: Recursively list the ``*.json`` entries under a directory.

    Arguments:
        filepath: directory to walk (sub-directories are included).

    Returns:
        Absolute paths of every match. The number of matches is also
        printed to stdout.
    """

    all_files = []
    for root, _dirs, _names in os.walk(filepath):
        # Escape the directory part so that glob metacharacters in a folder
        # name (e.g. "run[1]") are matched literally instead of being read
        # as a pattern, which would silently skip that folder's files.
        pattern = os.path.join(glob.escape(root), "*.json")
        all_files.extend(os.path.abspath(match) for match in glob.glob(pattern))

    num_files = len(all_files)
    print(f"{num_files} files found in {filepath}")

    return all_files


def process(session, filepath):
    """
    Description: Read every JSON file under filepath and print a sample of
    each record it contains.

    Arguments:
        session: database session; not used yet, reserved for the inserts
            marked below.
        filepath: directory searched (recursively) by get_files().
    """
    # Get list of files from filepath
    all_files = get_files(filepath)

    for datafile in all_files:
        with open(datafile, "r", encoding="utf-8") as f:
            data = json.load(f)

        for each in data:
            # Print some sample data.
            # NOTE(review): this assumes each record is flat, with
            # "filenumber" and "filename" as sibling keys, which is how the
            # events table and insert_sample_data() treat them. The previous
            # each["filenumber"]["filename"] lookup fails on such records
            # because a scalar cannot be indexed by a string key. Confirm
            # against the real files.
            print(
                each["eventid"],
                each["username"],
                each["filenumber"],
                each["filename"],
            )

            # Insert data into tables here


def insert_sample_data(session):
    """
    Description: Load data/events.json with pandas and insert one row per
    record into the events table.

    Arguments:
        session: object exposing ``execute(query, parameters)``; here the
            Cassandra session created in main().
    """
    df = pd.read_json("data/events.json")

    columns = ("eventid", "username", "filenumber", "filename", "comment")
    # %s placeholders let the driver quote and escape each value. Building
    # the statement with an f-string breaks on values that contain a single
    # quote and lets the data file inject arbitrary CQL.
    query = """
        INSERT INTO events (eventid, username, filenumber, filename, comment)
        VALUES (%s, %s, %s, %s, %s)
        """

    for values in zip(*(df[name] for name in columns)):
        # Every column in the table is text, so bind each value as a string
        # (pandas may have parsed ids as integers).
        session.execute(query, tuple(str(value) for value in values))


def main():
    """
    Description: Run the whole pipeline against the Cassandra node at
    127.0.0.1: create and select the github_events keyspace, recreate the
    tables, process the files under /data, insert the sample data and print
    every row of the events table.

    Individual steps stay best-effort: a failing statement is printed and
    the pipeline continues.
    """
    cluster = Cluster(['127.0.0.1'])
    try:
        session = cluster.connect()

        # Create keyspace
        try:
            session.execute(
                """
            CREATE KEYSPACE IF NOT EXISTS github_events
            WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }
            """
            )
        except Exception as e:
            print(e)

        # Set keyspace
        try:
            session.set_keyspace("github_events")
        except Exception as e:
            print(e)

        drop_tables(session)
        create_tables(session)

        process(session, filepath="/data")
        insert_sample_data(session)

        # Select data in Cassandra and print them to stdout
        query = """
    SELECT * from events
    """
        try:
            rows = session.execute(query)
        except Exception as e:
            print(e)
        else:
            # Only iterate when the SELECT succeeded; otherwise `rows` is
            # never bound and the loop would raise NameError.
            for row in rows:
                print(row)
    finally:
        # Release the driver's connections and threads even if a step above
        # raised.
        cluster.shutdown()


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

0 files found in /data
Row(eventid='2', username='user2', filenumber='1', comment='image', filename='file4.jpg')
Row(eventid='1', username='user1', filenumber='1', comment='image', filename='file1.jpg')
Row(eventid='1', username='user2', filenumber='1', comment='image', filename='file2.jpg')
Row(eventid='1', username='user3', filenumber='1', comment='image', filename='file3.jpg')
