# 1. Data Processing

## 1.1 Data Collection

- Area: all 50 U.S. states and D.C. (51 review files, one per state/district)
- Data: Google Maps reviews for stores

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
import re
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
# Directory the per-state review CSV files are read from
folder_path = '../google-map-data/store-review'

# State identifiers exactly as they appear in the file names (underscores instead
# of spaces). The order here is the order in which the files are processed.
state_names = """
    Nebraska Pennsylvania Arkansas Tennessee Minnesota Nevada
    Alaska Oregon Wisconsin Iowa Kansas Mississippi
    South_Dakota Vermont Texas Maryland Utah New_Hampshire
    Arizona Michigan Alabama South_Carolina District_of_Columbia
    New_Jersey West_Virginia Kentucky Ohio Delaware Florida
    Idaho Louisiana North_Dakota Washington Georgia Illinois
    Wyoming Missouri California New_Mexico New_York Connecticut
    Oklahoma Colorado North_Carolina Montana Maine Massachusetts
    Indiana Rhode_Island Hawaii Virginia
""".split()

# One file name per state: filtered_review-<State>.csv
file_list = ['filtered_review-' + state + '.csv' for state in state_names]

## 1.2 Data Cleaning

- Define keywords for health-related products and use them to filter the reviews
- Time range: January 2018 to August 2021 (the code below enforces only the January 2018 lower bound)

In [3]:
# Lowercase product keywords used to select health-related review sentences.
# Every entry must be unique: the list is used both to build the search regex and
# to label each review with the keywords it mentions, so a repeated entry would be
# reported twice for the same review.
keywords = [
    "sanitizer",
    "soap",
    "toilet paper",
    "mask",
    "disinfectant",
    "gloves",
    "thermometer",
    "tissues",
    "wipes",
    "vitamins",
    "face shield",
    "lysol spray",
    "n95",
    "hand wash",
    "acetaminophen",
    "tylenol",
    "advil",
    "motrin",
    "dayquil",
    "nyquil",
    "mucinex",
    "robitussin",
    "sudafed",
    "test kit",
    "home test",
    "self test",
    "ibuprofen",
    "pepto-bismol",
    "tums",
    "pedialyte",
    "gatorade",
    # NOTE(review): this entry uses a typographic apostrophe (U+2019); text typed
    # with a plain ASCII apostrophe will not match it.
    "vick’s vaporub",
    "oseltamivir",
    "tamiflu",
    "zinc",
    "hydroxychloroquine",
    "respirators",
    "alcohol"
]

In [4]:
def extract_sentences_with_keywords(text, pattern):
    """Return only the sentences of `text` that match `pattern`.

    The text is cut at every '.', '!' or '?' character. Each resulting fragment
    that the compiled regex `pattern` finds a match in is kept, stripped of
    surrounding whitespace, and the kept fragments are joined with ' | '.

    Parameters
    ----------
    text : str
        Text to scan.
    pattern : re.Pattern
        Compiled regular expression used to test each fragment.

    Returns
    -------
    str
        The matching fragments joined by ' | ', or an empty string when no
        fragment matches.
    """
    matching_fragments = []
    for fragment in re.split(r'[.!?]', text):
        if pattern.search(fragment):
            matching_fragments.append(fragment.strip())
    return ' | '.join(matching_fragments)

# Create a case-insensitive regex that matches any one of the keywords.
# Each keyword goes through re.escape so it is always matched as literal text,
# even if a keyword containing a regex metacharacter (e.g. '+', '(', '.') is added
# later. There are no word boundaries, so a keyword also matches inside a longer
# word (e.g. "mask" matches "masks").
pattern = re.compile('|'.join(re.escape(keyword) for keyword in keywords), re.IGNORECASE)

# Initialize an empty DataFrame to store the filtered comments
all_filtered_comments_df = pd.DataFrame()

In [5]:
# Loop through each file and process the data.
# Per-file results are gathered in a list and concatenated once after the loop, so
# re-running this cell rebuilds `all_filtered_comments_df` from scratch instead of
# appending a second copy of every row to whatever a previous run left behind.


def _keywords_in(text):
    """Return the keywords that occur as substrings of `text`, compared in lowercase."""
    lowered = text.lower()  # lowercase once instead of once per keyword
    return [kw for kw in keywords if kw in lowered]


filtered_frames = []

for file_name in file_list:
    # Construct the full file path
    file_path = os.path.join(folder_path, file_name)

    # Load the data
    reviews_df = pd.read_csv(file_path)

    # Drop duplicates based on 'name' and 'text' columns
    reviews_df = reviews_df.drop_duplicates(subset=['name', 'text'])

    # Keep only rows whose 'time' value is an int or float. The explicit .copy()
    # makes the result an independent frame, so the column assignment below does
    # not write into a slice of the loaded data.
    has_numeric_time = reviews_df['time'].apply(lambda x: isinstance(x, (int, float)))
    reviews_df = reviews_df[has_numeric_time].copy()

    # Convert timestamp (interpreted as milliseconds since the epoch) to datetime
    reviews_df['time'] = pd.to_datetime(reviews_df['time'], unit='ms')

    # Filter reviews from Jan 2018 onwards.
    # NOTE(review): no upper bound is applied here — confirm the raw data already
    # ends at the intended end of the study period.
    filtered_reviews_df = reviews_df[reviews_df['time'] >= '2018-01-01'].copy()

    # Replace missing review text with '' and keep only the sentences that
    # contain at least one keyword
    filtered_reviews_df['text'] = (
        filtered_reviews_df['text']
        .fillna('')
        .apply(lambda x: extract_sentences_with_keywords(x, pattern))
    )

    # Flag reviews that still mention any of the keywords after extraction
    filtered_reviews_df['keywords_mentioned'] = filtered_reviews_df['text'].apply(
        lambda x: bool(pattern.search(x))
    )

    # Extract the filtered comments
    filtered_comments_df = filtered_reviews_df[filtered_reviews_df['keywords_mentioned']].copy()

    # Record which keywords each remaining comment mentions
    filtered_comments_df['mentioned_keywords'] = filtered_comments_df['text'].apply(_keywords_in)

    filtered_frames.append(filtered_comments_df)

# Combine all per-file results in one step; fall back to an empty frame when no
# files were processed (pd.concat raises on an empty list).
all_filtered_comments_df = pd.concat(filtered_frames) if filtered_frames else pd.DataFrame()

In [6]:
# Add a 'class' column filled with NaN to every row (presumably the label slot
# for the annotation step — confirm), then display the frame.
all_filtered_comments_df = all_filtered_comments_df.assign(**{'class': np.nan})

all_filtered_comments_df

In [7]:
# Write the collected rows to CSV, leaving out the DataFrame index.
unlabeled_output_path = '../filtered-labeled-data/unlabeled_review.csv'
all_filtered_comments_df.to_csv(unlabeled_output_path, index=False)

## 1.3 Annotation

**1,500 samples, split 80%/20% (1,200 for training; 300 for testing)**
- Class 1 (example review): "They had lots of toilet paper"
- Class -1 (example review): "Out of toilet paper, again…"
- Class 9 (example review): "Forcing you to where a mask will be Shopping elsewhere"