# Filtering Bitcoin-related posts

In [None]:
import pandas as pd
import os

In [None]:
def remove_irrelevant_rows(df):
    '''Return a filtered copy of ``df`` without Discord spam and removed/deleted posts.

    A row is dropped when its ``title`` contains the substring
    ``'https://discord'`` or when its ``text`` is exactly ``'[removed]'`` or
    ``'[deleted]'``. Rows whose ``title`` is not a string (e.g. NaN) are kept
    rather than raising. The input frame is left unmodified.

    Args:
        df: DataFrame with at least the columns ``title`` and ``text``.

    Returns:
        A new DataFrame containing the remaining rows, re-indexed from 0.
    '''
    # Element-wise check with a type guard so missing/non-string titles count
    # as "no Discord link" instead of raising TypeError.
    is_discord_spam = df['title'].map(
        lambda title: isinstance(title, str) and 'https://discord' in title
    ).astype(bool)
    # Placeholder bodies Reddit leaves behind for removed or deleted posts.
    is_placeholder = df['text'].isin(['[removed]', '[deleted]'])
    # Build the result from a mask instead of dropping rows in place, so the
    # caller's frame is never mutated while being iterated.
    return df.loc[~(is_discord_spam | is_placeholder)].reset_index(drop=True)

def search_keywords(df, keywords):
    '''Flag rows whose title contains any of the given keywords.

    Matching is a case-insensitive substring test against the ``title``
    column. Titles that are not strings (e.g. NaN) are flagged ``False``
    instead of raising.

    Args:
        df: DataFrame with a ``title`` column. A boolean ``contains_keyword``
            column is added to (or overwritten on) this frame.
        keywords: Iterable of keyword strings.

    Returns:
        The same ``df`` object, now carrying the ``contains_keyword`` column.
    '''
    # Normalise once up front: titles are lower-cased before matching, so the
    # keywords must be too. Materialising also makes one-shot iterables safe.
    lowered_keywords = [str(keyword).lower() for keyword in keywords]

    def _title_matches(title):
        # Missing or non-string titles cannot match any keyword.
        if not isinstance(title, str):
            return False
        lowered_title = title.lower()
        return any(keyword in lowered_title for keyword in lowered_keywords)

    # astype(bool) keeps the column boolean even when the frame is empty.
    df['contains_keyword'] = df['title'].apply(_title_matches).astype(bool)
    return df

In [3]:
# Collect only the CSV exports from the submissions directory; any other
# directory entry would make the later pd.read_csv call fail.
files = [
    name for name in os.listdir(r'./submissions')
    if name.lower().endswith('.csv')
]
files

['wallstreetbets2019_2020_submissions.csv',
 'investing_submissions.csv',
 'wallstreetbets2022_2023_submissions.csv',
 'options_submissions.csv',
 'wallstreetbets2021_submissions.csv',
 'wallstreetbets2024_submissions.csv',
 'wallstreetbets2017_2018_submissions.csv',
 'stocks_submissions.csv',
 'stockstobuy_submissions.csv']

## For each DataFrame, remove the irrelevant spam and filter for Bitcoin-related posts

In [None]:
# Keywords used to filter for Bitcoin related posts.
# Defined once here; the loop below reuses this list for every file.
keywords = ["bitcoin", "btc", "binance", "crypto"]

input_dir = './submissions'
output_dir = './filtered_dfs'

if os.path.exists(output_dir):
    print("Filtered directory already exists.")
else:
    os.makedirs(output_dir)
    print("Created filtered_dfs directory.")

for file in files:
    print('Handling file:', file)
    df = pd.read_csv(os.path.join(input_dir, file))
    # Rows without a title cannot be keyword-matched, so drop them up front.
    df = df.dropna(subset=['title'])
    df = remove_irrelevant_rows(df)

    df = search_keywords(df, keywords)
    # Keep only flagged rows; astype(bool) guarantees a boolean mask even
    # when the frame is empty.
    df = df[df['contains_keyword'].astype(bool)].reset_index(drop=True)
    # Each filtered frame is written under the same file name as its source.
    df.to_csv(os.path.join(output_dir, file), index=False)
    print("Saved file:", file)