# DECOMPRESS

In [4]:
import csv
import subprocess

def decompress_zst_file(input_file, output_file):
    decompress_command = ['zstd', '--long=31', '-dc', input_file]
    with open(output_file, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        with subprocess.Popen(decompress_command, stdout=subprocess.PIPE, universal_newlines=True) as process:
            for line in process.stdout:
                row = line.strip().split('\t')
                writer.writerow(row)

# Beispielaufruf der Funktion
input_file = r'/Volumes/WD5TB/reddit/comments/RC_2005-12.zst'
output_file = r'/Volumes/WD5TB/Raw_Output_Comments.csv'
decompress_zst_file(input_file, output_file)



# SEARCH WITHIN ONE ZST FILE

In [1]:
import os
import zstandard as zstd
import json
import csv

def extract_zst_file(file_path, output_file_path, keyword):
    print("Dateipfad:", file_path)
    file_size = os.path.getsize(file_path)
    print("Dateigröße:", file_size)

    # Ergebnisse in eine CSV-Datei umleiten
    with open(output_file_path, 'w', newline='') as output_file:
        csv_writer = csv.writer(output_file)

        with open(file_path, 'rb') as compressed_file:
            dctx = zstd.ZstdDecompressor(max_window_size=2147483648)  # Max Window Size auf 4 GB erhöht
            reader = dctx.stream_reader(compressed_file)
            chunk_size = 10 * 1024 * 1024  # Chunk-Größe auf 10 MB festgelegt
            current_line = b''

            def line_generator():
                for chunk in iter(lambda: reader.read(chunk_size), b''):
                    nonlocal current_line
                    lines = (current_line + chunk).split(b'\n')
                    for line in lines[:-1]:
                        yield line
                    current_line = lines[-1]
                if current_line:
                    yield current_line

            # Counter hinzugefügt
            counter = 0
            for line in line_generator():
                counter += process_line(line, csv_writer, keyword)
            print("Anzahl der Zeilen, die das Schlüsselwort enthalten:", counter)

def process_line(line, csv_writer, keyword):
    decoded_line = line.decode('utf-8')
    decoded_line = json.loads(decoded_line)
    # Check if the keyword is in the "title" or "selftext"
    if keyword.lower() in decoded_line.get("title", "").lower() or keyword.lower() in decoded_line.get("selftext", "").lower():
        # Ergebnisse in die CSV-Datei schreiben
        csv_writer.writerow([decoded_line["created_utc"], decoded_line["title"], decoded_line.get("selftext", "")])
        return 1
    return 0

# Beispielaufruf der Funktion
zst_file_path = '/Volumes/WD5TB/reddit/submissions/RS_2021-01.zst'
output_file_path = '/Volumes/WD5TB/output202101.csv'
keyword = "$MSFT"
extract_zst_file(zst_file_path, output_file_path, keyword)


Dateipfad: /Volumes/WD5TB/reddit/submissions/RS_2021-01.zst
Dateigröße: 8698915333
Anzahl der Zeilen, die das Schlüsselwort enthalten: 62


# SEARCH WITHIN DIRECTION (ONE CSV)

In [1]:
import os
import zstandard as zstd
import json
import csv

def extract_zst_files(zst_directory, output_file_path, keyword):
    # CSV-Datei zum Schreiben öffnen
    with open(output_file_path, 'w', newline='') as output_file:
        csv_writer = csv.writer(output_file)

        # Durch alle .zst-Dateien im Verzeichnis iterieren
        for filename in os.listdir(zst_directory):
            if filename.endswith(".zst") and not filename.startswith("."):
                year = filename.split("_")[1][:4]
                if "2020" <= year <= "2022":
                    zst_file_path = os.path.join(zst_directory, filename)
                    print(zst_file_path)
                    extract_zst_file(zst_file_path, csv_writer, keyword)

def extract_zst_file(file_path, csv_writer, keyword):
    with open(file_path, 'rb') as compressed_file:
        dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
        reader = dctx.stream_reader(compressed_file)
        chunk_size = 100 * 1024 * 1024
        current_line = b''

        def line_generator():
            for chunk in iter(lambda: reader.read(chunk_size), b''):
                nonlocal current_line
                lines = (current_line + chunk).split(b'\n')
                for line in lines[:-1]:
                    yield line
                current_line = lines[-1]
            if current_line:
                yield current_line

        counter = 0
        for line in line_generator():
            counter += process_line(line, csv_writer, keyword)
        print("Anzahl der Zeilen, die das Schlüsselwort enthalten:", counter)

def process_line(line, csv_writer, keyword):
    decoded_line = line.decode('utf-8')
    decoded_line = json.loads(decoded_line)
    if keyword.lower() in decoded_line.get("title", "").lower() or keyword.lower() in decoded_line.get("selftext", "").lower():
        csv_writer.writerow([decoded_line["created_utc"], decoded_line["title"], decoded_line.get("selftext", "")])
        return 1
    return 0

# Beispielaufruf der Funktion für mehrere .zst-Dateien
zst_directory = '/Volumes/WD5TB/reddit/submissions/'
output_file_path = '/Volumes/WD5TB/all_results.csv'
keyword = "$TSLA"

extract_zst_files(zst_directory, output_file_path, keyword)


/Volumes/WD5TB/reddit/submissions/RS_2021-01.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 913
/Volumes/WD5TB/reddit/submissions/RS_2020-06.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 143
/Volumes/WD5TB/reddit/submissions/RS_2020-04.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 173
/Volumes/WD5TB/reddit/submissions/RS_2021-03.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 760
/Volumes/WD5TB/reddit/submissions/RS_2021-02.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 826
/Volumes/WD5TB/reddit/submissions/RS_2020-09.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 765
/Volumes/WD5TB/reddit/submissions/RS_2021-04.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 752
/Volumes/WD5TB/reddit/submissions/RS_2020-03.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 158
/Volumes/WD5TB/reddit/submissions/RS_2021-12.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 358
/Volumes/WD5TB/reddit/submissions/RS_2022-12.zst
Anzahl

KeyboardInterrupt: 

# SEARCH WITHIN DIRECTION (EACH CSV)

In [1]:
import os
import zstandard as zstd
import json
import csv

def extract_zst_files(zst_directory, output_directory, keyword):
    # Durch alle .zst-Dateien im Verzeichnis iterieren
    for filename in os.listdir(zst_directory):
        if filename.endswith(".zst") and not filename.startswith("."):
            year = filename.split("_")[1][:4]
            if "2022" <= year <= "2022":
                zst_file_path = os.path.join(zst_directory, filename)
                output_file_path = os.path.join(output_directory, filename.replace('.zst', '.csv'))
                print(zst_file_path)
                extract_zst_file(zst_file_path, output_file_path, keyword)

def extract_zst_file(file_path, output_file_path, keyword):
    # CSV-Datei zum Schreiben öffnen
    with open(output_file_path, 'w', newline='') as output_file:
        csv_writer = csv.writer(output_file)

        with open(file_path, 'rb') as compressed_file:
            dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
            reader = dctx.stream_reader(compressed_file)
            chunk_size = 100 * 1024 * 1024
            current_line = b''

            def line_generator():
                for chunk in iter(lambda: reader.read(chunk_size), b''):
                    nonlocal current_line
                    lines = (current_line + chunk).split(b'\n')
                    for line in lines[:-1]:
                        yield line
                    current_line = lines[-1]
                if current_line:
                    yield current_line

            counter = 0
            for line in line_generator():
                counter += process_line(line, csv_writer, keyword)
            print("Anzahl der Zeilen, die das Schlüsselwort enthalten:", counter)

def process_line(line, csv_writer, keyword):
    decoded_line = line.decode('utf-8')
    decoded_line = json.loads(decoded_line)
    if keyword.lower() in decoded_line.get("title", "").lower() or keyword.lower() in decoded_line.get("selftext", "").lower():
        csv_writer.writerow([decoded_line["created_utc"], decoded_line["title"], decoded_line.get("selftext", "")])
        return 1
    return 0

# Beispielaufruf der Funktion für mehrere .zst-Dateien
zst_directory = '/Volumes/WD5TB/reddit/submissions/'
output_directory = '/Volumes/WD5TB/'
keyword = "$TSLA"

extract_zst_files(zst_directory, output_directory, keyword)


/Volumes/WD5TB/reddit/submissions/RS_2022-12.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 749
/Volumes/WD5TB/reddit/submissions/RS_2022-08.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 476
/Volumes/WD5TB/reddit/submissions/RS_2022-10.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 352
/Volumes/WD5TB/reddit/submissions/RS_2022-02.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 350
/Volumes/WD5TB/reddit/submissions/RS_2022-01.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 559
/Volumes/WD5TB/reddit/submissions/RS_2022-03.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 393
/Volumes/WD5TB/reddit/submissions/RS_2022-07.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 554
/Volumes/WD5TB/reddit/submissions/RS_2022-09.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 241
/Volumes/WD5TB/reddit/submissions/RS_2022-11.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: 397
/Volumes/WD5TB/reddit/submissions/RS_2022-06.zst
Anzahl

# SEARCH WITHIN DIRECTION (EACH CSV) MULTIPLE KEYWORDS

In [2]:
import os
import zstandard as zstd
import json
import csv

def extract_zst_files(zst_directory, output_directory, keywords):
    # Durch alle .zst-Dateien im Verzeichnis iterieren
    for filename in os.listdir(zst_directory):
        if filename.endswith(".zst") and not filename.startswith("."):
            year = filename.split("_")[1][:4]
            if "2022" <= year <= "2022":
                zst_file_path = os.path.join(zst_directory, filename)
                output_file_path = os.path.join(output_directory, filename.replace('.zst', '.csv'))
                print(zst_file_path)
                extract_zst_file(zst_file_path, output_file_path, keywords)

def extract_zst_file(file_path, output_file_path, keywords):
    # CSV-Datei zum Schreiben öffnen
    with open(output_file_path, 'w', newline='') as output_file:
        csv_writer = csv.writer(output_file)

        with open(file_path, 'rb') as compressed_file:
            dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
            reader = dctx.stream_reader(compressed_file)
            chunk_size = 500 * 1024 * 1024  # Erhöhte Chunk-Größe auf 500 MB
            current_line = b''

            def line_generator():
                for chunk in iter(lambda: reader.read(chunk_size), b''):
                    nonlocal current_line
                    lines = (current_line + chunk).split(b'\n')
                    for line in lines[:-1]:
                        yield line
                    current_line = lines[-1]
                if current_line:
                    yield current_line

            batch_size = 1000  # Anzahl der Zeilen pro Batch
            batch_data = []  # Zwischenspeicher für Zeilen

            counters = {keyword: 0 for keyword in keywords}
            written_titles = set()  # Liste der bereits geschriebenen Titel
            for line in line_generator():
                for keyword in keywords:
                    counters[keyword] += process_line(line, keyword, written_titles, batch_data, csv_writer, batch_size)
            print("Anzahl der Zeilen, die das Schlüsselwort enthalten:", counters)

            # Schreiben Sie die restlichen Daten im Zwischenspeicher in die CSV-Datei
            if batch_data:
                csv_writer.writerows(batch_data)

def process_line(line, keyword, written_titles, batch_data, csv_writer, batch_size):
    decoded_line = line.decode('utf-8')
    decoded_line = json.loads(decoded_line)
    title = decoded_line.get("title", "").lower()
    if keyword.lower() in title or keyword.lower() in decoded_line.get("selftext", "").lower():
        if title not in written_titles:  # Überprüfung auf Duplikate
            batch_data.append([decoded_line["created_utc"], decoded_line["title"], decoded_line.get("selftext", "")])
            written_titles.add(title)  # Titel zur Liste der bereits geschriebenen Titel hinzufügen

            if len(batch_data) >= batch_size:
                csv_writer.writerows(batch_data)
                batch_data.clear()  # Zwischenspeicher leeren
            return 1
    return 0

# Beispielaufruf der Funktion für mehrere .zst-Dateien
zst_directory = '/Volumes/WD5TB/reddit/submissions/'
output_directory = '/Volumes/WD5TB/'
keywords = ["$JPM", "JPM.N", "JPM Aktie", "JPMorgan Chase Stock", "JPM Stock", 
            "$BAC", "BAC.N", "BAC Aktie", "Bank of America Stock", "BAC Stock", 
            "$WFC", "WFC.N", "WFC Aktie", "Wells Fargo Stock", "WFC Stock", 
            "$MS", "MS.N", "MS Aktie", "Morgan Stanley Stock", "MS Stock", 
            "$AXP", "AXP.N", "AXP Aktie", "American Express Stock", "AXP Stock", 
            "$BX", "BX.N", "BX Aktie", "Blackstone Stock", "BX Stock",
            "$GS", "GS.N", "GS Aktie", "Goldman Sachs Stock", "GS Stock",
            "$BLK", "BLK.N", "BLK Aktie", "BlackRock Stock", "BLK Stock"]

extract_zst_files(zst_directory, output_directory, keywords)


/Volumes/WD5TB/reddit/submissions/RS_2022-12.zst
Anzahl der Zeilen, die das Schlüsselwort enthalten: {'$JPM': 44, 'JPM.N': 0, 'JPM Aktie': 0, 'JPMorgan Chase Stock': 0, 'JPM Stock': 2, '$BAC': 58, 'BAC.N': 0, 'BAC Aktie': 0, 'Bank of America Stock': 6, 'BAC Stock': 3, '$WFC': 63, 'WFC.N': 1, 'WFC Aktie': 0, 'Wells Fargo Stock': 0, 'WFC Stock': 1, '$MS': 228, 'MS.N': 218, 'MS Aktie': 0, 'Morgan Stanley Stock': 1, 'MS Stock': 92, '$AXP': 21, 'AXP.N': 0, 'AXP Aktie': 0, 'American Express Stock': 0, 'AXP Stock': 1, '$BX': 71, 'BX.N': 2, 'BX Aktie': 0, 'Blackstone Stock': 0, 'BX Stock': 20, '$GS': 109, 'GS.N': 192, 'GS Aktie': 0, 'Goldman Sachs Stock': 0, 'GS Stock': 102, '$BLK': 31, 'BLK.N': 0, 'BLK Aktie': 0, 'BlackRock Stock': 0, 'BLK Stock': 2}
/Volumes/WD5TB/reddit/submissions/RS_2022-08.zst


KeyboardInterrupt: 

# SEARCH WITHIN DIRECTION (EACH CSV) MULTIPLE KEYWORDS OPTIMIZED

# SUBMISSIONS

## FINANCIALS

In [1]:
# Jupyter Notebook Code
import os
from multiprocessing import Pool
from worker import extract_zst_file

def extract_zst_files(zst_directory, output_directory, keywords):
    zst_files = [os.path.join(zst_directory, filename) 
                 for filename in os.listdir(zst_directory) 
                 if filename.endswith(".zst") and not filename.startswith(".") 
                 and "2022" <= filename.split("_")[1][:4] <= "2022"]

    with Pool(processes=3) as pool:
        pool.starmap(extract_zst_file, [(file_path, output_directory, keywords) for file_path in zst_files])


zst_directory = '/Users/philippsbresny/Documents/RedditData'
output_directory = '/Users/philippsbresny/Library/CloudStorage/OneDrive-Persönlich/VSC/Lazarus_Project'
keywords = ["$JPM", "JPM.N", "JPM Aktie", "JPMorgan Chase Stock", "JPM Stock", 
            "$BAC", "BAC.N", "BAC Aktie", "Bank of America Stock", "BAC Stock", 
            "$WFC", "WFC.N", "WFC Aktie", "Wells Fargo Stock", "WFC Stock", 
            "$MS", "MS.N", "MS Aktie", "Morgan Stanley Stock", "MS Stock", 
            "$AXP", "AXP.N", "AXP Aktie", "American Express Stock", "AXP Stock", 
            "$BX", "BX.N", "BX Aktie", "Blackstone Stock", "BX Stock",
            "$GS", "GS.N", "GS Aktie", "Goldman Sachs Stock", "GS Stock",
            "$BLK", "BLK.N", "BLK Aktie", "BlackRock Stock", "BLK Stock"]

keywords = [keyword.lower() for keyword in keywords]

extract_zst_files(zst_directory, output_directory, keywords)


Processing /Users/philippsbresny/Documents/RedditData/RS_2022-12.zst...
Processing /Users/philippsbresny/Documents/RedditData/RS_2022-10.zst...
Processing /Users/philippsbresny/Documents/RedditData/RS_2022-07.zst...
Anzahl der Zeilen, die das Schlüsselwort enthalten für Datei /Users/philippsbresny/Documents/RedditData/RS_2022-10.zst: {'$jpm': 0, 'jpm.n': 0, 'jpm aktie': 0, 'jpmorgan chase stock': 0, 'jpm stock': 0, '$bac': 0, 'bac.n': 0, 'bac aktie': 0, 'bank of america stock': 2, 'bac stock': 2, '$wfc': 0, 'wfc.n': 0, 'wfc aktie': 0, 'wells fargo stock': 6, 'wfc stock': 3, '$ms': 0, 'ms.n': 0, 'ms aktie': 0, 'morgan stanley stock': 0, 'ms stock': 0, '$axp': 0, 'axp.n': 0, 'axp aktie': 0, 'american express stock': 2, 'axp stock': 0, '$bx': 0, 'bx.n': 0, 'bx aktie': 0, 'blackstone stock': 0, 'bx stock': 0, '$gs': 0, 'gs.n': 0, 'gs aktie': 0, 'goldman sachs stock': 4, 'gs stock': 1, '$blk': 0, 'blk.n': 0, 'blk aktie': 0, 'blackrock stock': 5, 'blk stock': 0}
Processing /Users/philippsbre

## TECHNOLOGY

In [2]:
# Jupyter Notebook Code
import os
from multiprocessing import Pool
from worker import extract_zst_file

def extract_zst_files(zst_directory, output_directory, keywords):
    zst_files = [os.path.join(zst_directory, filename) 
                 for filename in os.listdir(zst_directory) 
                 if filename.endswith(".zst") and not filename.startswith(".") 
                 and "2022" <= filename.split("_")[1][:4] <= "2022"]

    with Pool(processes=3) as pool:
        pool.starmap(extract_zst_file, [(file_path, output_directory, keywords) for file_path in zst_files])


zst_directory = '/Users/philippsbresny/Documents/RedditData'
output_directory = '/Users/philippsbresny/Library/CloudStorage/OneDrive-Persönlich/VSC/Lazarus_Project/temp_tech'
keywords = ["$AAPL", "AAPL.OQ", "AAPL Aktie", "Apple Stock", "AAPL Stock",
            "$MSFT", "MSFT.OQ", "MSFT Aktie", "Microsoft Stock", "MSFT Stock",
            "$GOOGL", "GOOGL.OQ", "GOOGL Aktie", "Alphabet Stock", "GOOGL Stock",
            "$NVDA", "NVDA.OQ", "NVDA Aktie", "NVIDIA Stock", "NVDA Stock",
            "$META", "META.OQ", "META Aktie", "Meta Platforms Stock", "META Stock",
            "$V", "V.N", "V Aktie", "Visa Stock", "V Stock",
            "$MA", "MA.N", "MA Aktie", "Mastercard Stock", "MA Stock",
            "$AVGO", "AVGO.OQ", "AVGO Aktie", "Broadcom Stock", "AVGO Stock"]

keywords = [keyword.lower() for keyword in keywords]

extract_zst_files(zst_directory, output_directory, keywords)


Processing /Users/philippsbresny/Documents/RedditData/RS_2022-10.zst...Processing /Users/philippsbresny/Documents/RedditData/RS_2022-07.zst...

Processing /Users/philippsbresny/Documents/RedditData/RS_2022-12.zst...
Anzahl der Zeilen, die das Schlüsselwort enthalten für Datei /Users/philippsbresny/Documents/RedditData/RS_2022-10.zst: {'$aapl': 0, 'aapl.oq': 0, 'aapl aktie': 1, 'apple stock': 59, 'aapl stock': 5, '$msft': 0, 'msft.oq': 0, 'msft aktie': 0, 'microsoft stock': 28, 'msft stock': 5, '$googl': 0, 'googl.oq': 0, 'googl aktie': 0, 'alphabet stock': 15, 'googl stock': 0, '$nvda': 0, 'nvda.oq': 0, 'nvda aktie': 0, 'nvidia stock': 27, 'nvda stock': 7, '$meta': 0, 'meta.oq': 0, 'meta aktie': 1, 'meta platforms stock': 12, 'meta stock': 87, '$v': 0, 'v.n': 9, 'v aktie': 0, 'visa stock': 4, 'v stock': 6, '$ma': 0, 'ma.n': 1, 'ma aktie': 0, 'mastercard stock': 1, 'ma stock': 3, '$avgo': 0, 'avgo.oq': 0, 'avgo aktie': 0, 'broadcom stock': 0, 'avgo stock': 2}
Processing /Users/philippsb

## ENERGY

In [3]:
# Jupyter Notebook Code
import os
from multiprocessing import Pool
from worker import extract_zst_file

def extract_zst_files(zst_directory, output_directory, keywords):
    zst_files = [os.path.join(zst_directory, filename) 
                 for filename in os.listdir(zst_directory) 
                 if filename.endswith(".zst") and not filename.startswith(".") 
                 and "2022" <= filename.split("_")[1][:4] <= "2022"]

    with Pool(processes=3) as pool:
        pool.starmap(extract_zst_file, [(file_path, output_directory, keywords) for file_path in zst_files])


zst_directory = '/Users/philippsbresny/Documents/RedditData'
output_directory = '/Users/philippsbresny/Library/CloudStorage/OneDrive-Persönlich/VSC/Lazarus_Project/temp_energy'
keywords = ["$XOM", "XOM.N", "XOM Aktie", "Exxon Mobil Stock", "XOM Stock",
            "$CVX", "CVX.N", "CVX Aktie", "Chevron Stock", "CVX Stock",
            "$COP", "COP.N", "COP Aktie", "Conocophillips Stock", "COP Stock",
            "$SLB", "SLB.N", "SLB Aktie", "Schlumberger Stock", "SLB Stock",
            "$EOG", "EOG.N", "EOG Aktie", "EOG Resources Stock", "EOG Stock",
            "$EPD", "EPD.N", "EPD Aktie", "Enterprise Products Stock", "EPD Stock",
            "$OXY", "OXY.N", "OXY Aktie", "Occidental Petroleum Stock", "OXY Stock",
            "$MPC", "MPC.N", "MPC Aktie", "Marathon Petroleum Stock", "MPC Stock",]

keywords = [keyword.lower() for keyword in keywords]

extract_zst_files(zst_directory, output_directory, keywords)


Processing /Users/philippsbresny/Documents/RedditData/RS_2022-07.zst...
Processing /Users/philippsbresny/Documents/RedditData/RS_2022-10.zst...
Processing /Users/philippsbresny/Documents/RedditData/RS_2022-12.zst...
Anzahl der Zeilen, die das Schlüsselwort enthalten für Datei /Users/philippsbresny/Documents/RedditData/RS_2022-10.zst: {'$xom': 0, 'xom.n': 0, 'xom aktie': 0, 'exxon mobil stock': 5, 'xom stock': 3, '$cvx': 0, 'cvx.n': 0, 'cvx aktie': 0, 'chevron stock': 5, 'cvx stock': 1, '$cop': 0, 'cop.n': 0, 'cop aktie': 0, 'conocophillips stock': 0, 'cop stock': 4, '$slb': 0, 'slb.n': 0, 'slb aktie': 0, 'schlumberger stock': 1, 'slb stock': 1, '$eog': 0, 'eog.n': 0, 'eog aktie': 0, 'eog resources stock': 0, 'eog stock': 3, '$epd': 0, 'epd.n': 0, 'epd aktie': 0, 'enterprise products stock': 0, 'epd stock': 2, '$oxy': 0, 'oxy.n': 0, 'oxy aktie': 0, 'occidental petroleum stock': 0, 'oxy stock': 1, '$mpc': 0, 'mpc.n': 0, 'mpc aktie': 0, 'marathon petroleum stock': 0, 'mpc stock': 1}
Proce

# COMMENTS

## TECHNOLOGY

In [7]:
# Jupyter Notebook Code
import os
from multiprocessing import Pool
from worker2 import extract_zst_file

def extract_zst_files(zst_directory, output_directory, keywords):
    zst_files = [os.path.join(zst_directory, filename) 
                 for filename in os.listdir(zst_directory) 
                 if filename.endswith(".zst") and not filename.startswith(".") 
                 and "2022" <= filename.split("_")[1][:4] <= "2022"
                 and filename.startswith("RC_")]

    with Pool(processes=3) as pool:
        pool.starmap(extract_zst_file, [(file_path, output_directory, keywords) for file_path in zst_files])


zst_directory = '/Users/philippsbresny/Documents/RedditComments'
output_directory = '/Users/philippsbresny/Library/CloudStorage/OneDrive-Persönlich/VSC/Lazarus_Project/temp_tech'
# Ihre Schlüsselwörter hier
keywords = ["$AAPL", "AAPL.OQ", "AAPL Aktie", "Apple Stock", "AAPL Stock",
            "$MSFT", "MSFT.OQ", "MSFT Aktie", "Microsoft Stock", "MSFT Stock",
            "$GOOGL", "GOOGL.OQ", "GOOGL Aktie", "Alphabet Stock", "GOOGL Stock",
            "$NVDA", "NVDA.OQ", "NVDA Aktie", "NVIDIA Stock", "NVDA Stock",
            "$META", "META.OQ", "META Aktie", "Meta Platforms Stock", "META Stock",
            "$V", "V.N", "V Aktie", "Visa Stock", "V Stock",
            "$MA", "MA.N", "MA Aktie", "Mastercard Stock", "MA Stock",
            "$AVGO", "AVGO.OQ", "AVGO Aktie", "Broadcom Stock", "AVGO Stock"]

keywords = [keyword.lower() for keyword in keywords]

extract_zst_files(zst_directory, output_directory, keywords)


Processing /Users/philippsbresny/Documents/RedditComments/RC_2022-11.zst...
Processing /Users/philippsbresny/Documents/RedditComments/RC_2022-12.zst...
Processing /Users/philippsbresny/Documents/RedditComments/RC_2022-10.zst...
Anzahl der Zeilen, die das Schlüsselwort enthalten für Datei /Users/philippsbresny/Documents/RedditComments/RC_2022-11.zst: {'$aapl': 332, 'aapl.oq': 0, 'aapl aktie': 0, 'apple stock': 526, 'aapl stock': 44, '$msft': 71, 'msft.oq': 0, 'msft aktie': 0, 'microsoft stock': 79, 'msft stock': 21, '$googl': 61, 'googl.oq': 0, 'googl aktie': 0, 'alphabet stock': 16, 'googl stock': 5, '$nvda': 128, 'nvda.oq': 0, 'nvda aktie': 0, 'nvidia stock': 68, 'nvda stock': 17, '$meta': 261, 'meta.oq': 0, 'meta aktie': 0, 'meta platforms stock': 0, 'meta stock': 213, '$v': 33, 'v.n': 49, 'v aktie': 0, 'visa stock': 2, 'v stock': 15, '$ma': 7, 'ma.n': 4, 'ma aktie': 0, 'mastercard stock': 0, 'ma stock': 3, '$avgo': 2, 'avgo.oq': 0, 'avgo aktie': 0, 'broadcom stock': 0, 'avgo stock':

## FINANCIALS

In [8]:
# Jupyter Notebook Code
import os
from multiprocessing import Pool
from worker2 import extract_zst_file

def extract_zst_files(zst_directory, output_directory, keywords):
    zst_files = [os.path.join(zst_directory, filename) 
                 for filename in os.listdir(zst_directory) 
                 if filename.endswith(".zst") and not filename.startswith(".") 
                 and "2022" <= filename.split("_")[1][:4] <= "2022"
                 and filename.startswith("RC_")]

    with Pool(processes=3) as pool:
        pool.starmap(extract_zst_file, [(file_path, output_directory, keywords) for file_path in zst_files])


zst_directory = '/Users/philippsbresny/Documents/RedditComments'
output_directory = '/Users/philippsbresny/Library/CloudStorage/OneDrive-Persönlich/VSC/Lazarus_Project'
# Ihre Schlüsselwörter hier
keywords = ["$JPM", "JPM.N", "JPM Aktie", "JPMorgan Chase Stock", "JPM Stock", 
            "$BAC", "BAC.N", "BAC Aktie", "Bank of America Stock", "BAC Stock", 
            "$WFC", "WFC.N", "WFC Aktie", "Wells Fargo Stock", "WFC Stock", 
            "$MS", "MS.N", "MS Aktie", "Morgan Stanley Stock", "MS Stock", 
            "$AXP", "AXP.N", "AXP Aktie", "American Express Stock", "AXP Stock", 
            "$BX", "BX.N", "BX Aktie", "Blackstone Stock", "BX Stock",
            "$GS", "GS.N", "GS Aktie", "Goldman Sachs Stock", "GS Stock",
            "$BLK", "BLK.N", "BLK Aktie", "BlackRock Stock", "BLK Stock"]

keywords = [keyword.lower() for keyword in keywords]

extract_zst_files(zst_directory, output_directory, keywords)


Processing /Users/philippsbresny/Documents/RedditComments/RC_2022-11.zst...
Processing /Users/philippsbresny/Documents/RedditComments/RC_2022-12.zst...Processing /Users/philippsbresny/Documents/RedditComments/RC_2022-10.zst...

Anzahl der Zeilen, die das Schlüsselwort enthalten für Datei /Users/philippsbresny/Documents/RedditComments/RC_2022-11.zst: {'$jpm': 41, 'jpm.n': 1, 'jpm aktie': 0, 'jpmorgan chase stock': 0, 'jpm stock': 3, '$bac': 13, 'bac.n': 4, 'bac aktie': 0, 'bank of america stock': 6, 'bac stock': 1, '$wfc': 3, 'wfc.n': 1, 'wfc aktie': 0, 'wells fargo stock': 4, 'wfc stock': 0, '$ms': 9, 'ms.n': 0, 'ms aktie': 0, 'morgan stanley stock': 1, 'ms stock': 6, '$axp': 6, 'axp.n': 0, 'axp aktie': 0, 'american express stock': 0, 'axp stock': 0, '$bx': 8, 'bx.n': 0, 'bx aktie': 0, 'blackstone stock': 0, 'bx stock': 0, '$gs': 16, 'gs.n': 0, 'gs aktie': 0, 'goldman sachs stock': 1, 'gs stock': 5, '$blk': 10, 'blk.n': 1, 'blk aktie': 0, 'blackrock stock': 3, 'blk stock': 1}
Anzahl de

## ENERGY

In [9]:
# Jupyter Notebook Code
import os
from multiprocessing import Pool
from worker2 import extract_zst_file

def extract_zst_files(zst_directory, output_directory, keywords):
    zst_files = [os.path.join(zst_directory, filename) 
                 for filename in os.listdir(zst_directory) 
                 if filename.endswith(".zst") and not filename.startswith(".") 
                 and "2022" <= filename.split("_")[1][:4] <= "2022"
                 and filename.startswith("RC_")]

    with Pool(processes=3) as pool:
        pool.starmap(extract_zst_file, [(file_path, output_directory, keywords) for file_path in zst_files])


zst_directory = '/Users/philippsbresny/Documents/RedditComments'
output_directory = '/Users/philippsbresny/Library/CloudStorage/OneDrive-Persönlich/VSC/Lazarus_Project/temp_energy'
# Ihre Schlüsselwörter hier
keywords = ["$XOM", "XOM.N", "XOM Aktie", "Exxon Mobil Stock", "XOM Stock",
            "$CVX", "CVX.N", "CVX Aktie", "Chevron Stock", "CVX Stock",
            "$COP", "COP.N", "COP Aktie", "Conocophillips Stock", "COP Stock",
            "$SLB", "SLB.N", "SLB Aktie", "Schlumberger Stock", "SLB Stock",
            "$EOG", "EOG.N", "EOG Aktie", "EOG Resources Stock", "EOG Stock",
            "$EPD", "EPD.N", "EPD Aktie", "Enterprise Products Stock", "EPD Stock",
            "$OXY", "OXY.N", "OXY Aktie", "Occidental Petroleum Stock", "OXY Stock",
            "$MPC", "MPC.N", "MPC Aktie", "Marathon Petroleum Stock", "MPC Stock",]

keywords = [keyword.lower() for keyword in keywords]

extract_zst_files(zst_directory, output_directory, keywords)


Processing /Users/philippsbresny/Documents/RedditComments/RC_2022-10.zst...Processing /Users/philippsbresny/Documents/RedditComments/RC_2022-12.zst...

Processing /Users/philippsbresny/Documents/RedditComments/RC_2022-11.zst...
Anzahl der Zeilen, die das Schlüsselwort enthalten für Datei /Users/philippsbresny/Documents/RedditComments/RC_2022-11.zst: {'$xom': 52, 'xom.n': 1, 'xom aktie': 0, 'exxon mobil stock': 0, 'xom stock': 3, '$cvx': 21, 'cvx.n': 2, 'cvx aktie': 0, 'chevron stock': 6, 'cvx stock': 1, '$cop': 16, 'cop.n': 3, 'cop aktie': 0, 'conocophillips stock': 0, 'cop stock': 0, '$slb': 5, 'slb.n': 0, 'slb aktie': 0, 'schlumberger stock': 0, 'slb stock': 1, '$eog': 5, 'eog.n': 0, 'eog aktie': 0, 'eog resources stock': 0, 'eog stock': 0, '$epd': 1, 'epd.n': 0, 'epd aktie': 0, 'enterprise products stock': 0, 'epd stock': 0, '$oxy': 65, 'oxy.n': 0, 'oxy aktie': 0, 'occidental petroleum stock': 1, 'oxy stock': 4, '$mpc': 10, 'mpc.n': 0, 'mpc aktie': 0, 'marathon petroleum stock': 0, 

# PROOF OF CONCEPT TEST

In [66]:
import re

def keyword_search(text, keywords):
    for keyword in keywords:
        pattern = r'(?:^|\W)' + re.escape(keyword.lower()) + r'(?:$|\W)'
        if re.search(pattern, text.lower()):
            return "Treffer"
    return "kein Treffer"

keywords = ["$AAPL", "AAPL.OQ", "AAPL Aktie", "Apple Stock", "AAPL Stock",
            "$MSFT", "MSFT.OQ", "MSFT Aktie", "Microsoft Stock", "MSFT Stock",
            "$GOOGL", "GOOGL.OQ", "GOOGL Aktie", "Alphabet Stock", "GOOGL Stock",
            "$NVDA", "NVDA.OQ", "NVDA Aktie", "NVIDIA Stock", "NVDA Stock",
            "$META", "META.OQ", "META Aktie", "Meta Platforms Stock", "META Stock",
            "$V", "V.N", "V Aktie", "Visa Stock", "V Stock",
            "$MA", "MA.N", "MA Aktie", "Mastercard Stock", "MA Stock",
            "$AVGO", "AVGO.OQ", "AVGO Aktie", "Broadcom Stock", "AVGO Stock"]

texts = ["$aapl is going up", "Today Googl Aktie is booming", "Today I bough some shares of $MSFT",
         "Today I sold all my shares of $META.", "I found V.N.H.D amazing", "We need AAPL.OQ-Earnings so bad", 
         "some gibberish $VNU&%O", "The company made a "]

for text in texts:
    print(f"'{text}' --> {keyword_search(text, keywords)}")


'$aapl is going up' --> Treffer
'Today Googl Aktie is booming' --> Treffer
'Today I bough some shares of $MSFT' --> Treffer
'Today I sold all my shares of $META.' --> Treffer
'I found V.N.H.D amazing' --> Treffer
'We need AAPL.OQ-Earnings so bad' --> Treffer
'some gibberish $VNU&%O' --> kein Treffer
'The company made a ' --> kein Treffer
