In [None]:
import pandas as pd
import os
import csv
import glob

In [None]:
def process_text(text):
    """Flatten newlines and spell out HTML-escaped symbols.

    Parameters
    ----------
    text : object exposing the pandas ``.str`` accessor (e.g. a Series of strings)
        The raw text values to clean.

    Returns
    -------
    The same kind of object with every substitution applied, in order.
    Matching ignores letter case (``case=False``).
    """
    # Order matters: the comma-prefixed entities are handled before the bare
    # ones so that the leading comma is consumed together with the entity.
    substitutions = (
        ("\n", " "),
        (",&amp;", "and"),
        (",&gt;", "greater-than"),
        (",&lt;", "less-than"),
        (",&ge;", "greater-than-or-equal"),
        (",&le;", "less-than-or-equal"),
        ("&amp;", "and"),
        ("&gt;", "greater-than"),
        ("&lt;", "less-than"),
        ("&ge;", "greater-than-or-equal"),
        ("&le;", "less-than-or-equal"),
    )
    cleaned = text
    for needle, substitute in substitutions:
        cleaned = cleaned.str.replace(needle, substitute, case=False)
    return cleaned

def process_combined_csv(year, input_dir):
    """Load, merge, clean and filter every ``*.csv`` file found in ``input_dir``.

    The files are concatenated into one DataFrame sorted by ``created_at``;
    the ``text`` column is passed through ``process_text``; ``created_at`` is
    truncated to its first 13 characters (presumably date plus hour for
    ISO-style timestamps -- confirm against the data); and rows whose ``text``
    or ``user_loc`` contain any of the exclusion keywords (case-insensitive
    substring match) are dropped.

    Parameters
    ----------
    year : str
        Not used by this function; kept so existing callers keep working.
    input_dir : str
        Directory that holds the CSV files. The files must provide the
        ``text``, ``created_at`` and ``user_loc`` columns.

    Returns
    -------
    pandas.DataFrame
        The combined, cleaned and filtered rows.

    Raises
    ------
    ValueError
        If ``input_dir`` contains no ``*.csv`` file.
    """
    # Search inside input_dir directly rather than os.chdir()-ing into it:
    # changing the working directory is a process-wide side effect that would
    # silently alter how every later relative path in the notebook resolves.
    # glob.escape keeps directory names containing [, ] ? or * literal.
    pattern = os.path.join(glob.escape(input_dir), '*.csv')
    # glob returns matches in arbitrary order; sort so runs are reproducible.
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        # pd.concat would raise an opaque ValueError on an empty list; raise the
        # same exception type with a message that names the directory.
        raise ValueError(f"No CSV files found in {input_dir!r}")

    csv_list = [pd.read_csv(filename, index_col=None, header=0, lineterminator='\n') for filename in all_files]
    combined_csv = pd.concat(csv_list, ignore_index=True).sort_values("created_at")

    combined_csv["text"] = process_text(combined_csv["text"])
    combined_csv["created_at"] = combined_csv["created_at"].str[:13]

    exclude_keywords = ['pokemon', 'superman', 'galaxy', 'eclipse', 'solar plexus', 'solar-powered human', 'i will become your sun']
    place_exclude_keywords = ["international", "global", "earth", "galaxy", "milky way", "world", "everywhere", "anywhere"]
    # Missing values never match (na=False), so rows without text/user_loc are kept.
    combined_csv = combined_csv[~combined_csv.text.str.contains('|'.join(exclude_keywords), na=False, case=False)]
    combined_csv = combined_csv[~combined_csv.user_loc.str.contains('|'.join(place_exclude_keywords), na=False, case=False)]

    return combined_csv

def save_processed_csv(data, year, output_dir):
    """Write ``data`` to ``<output_dir>/solarsent_<year>.csv``.

    The file is UTF-8 encoded and the DataFrame index is not written.
    """
    filename = f"solarsent_{year}.csv"
    data.to_csv(os.path.join(output_dir, filename), encoding='utf-8', index=False)

In [None]:
# Anchor every path to an absolute base resolved once, up front, so the loop
# does not depend on the current working directory, which may change while
# the loop is running.
BASE_DIR = os.path.abspath("..")

years = [str(y) for y in range(2013, 2023)]  # "2013" through "2022"

# Each yearly output file is written to the folder that contains the
# per-year input folders.
output_dir = BASE_DIR

for year in years:
    input_dir = os.path.join(BASE_DIR, year)
    processed_data = process_combined_csv(year, input_dir)
    save_processed_csv(processed_data, year, output_dir)
