# Pre-Processing for Ozone Variability Project

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
import numpy as np
import os

## Pre-Processing

In [None]:
# Read the ozone folder first; the press, rhdp, and temp folders are merged
# onto this frame in a later cell.
folders = ['press', 'rhdp', 'temp']  # 'ozone' will be processed separately

ozone_folder = 'ozone'

# Collect the Los Angeles rows of every CSV and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previously loaded rows on
# every iteration. Filenames are sorted because os.listdir() returns entries
# in arbitrary order, which would make the row order differ between runs.
ozone_frames = []
for filename in sorted(os.listdir(ozone_folder)):
    if filename.endswith('.csv'):
        file_path = os.path.join(ozone_folder, filename)
        df = pd.read_csv(file_path)
        df_la = df[df['County Name'] == 'Los Angeles']
        ozone_frames.append(df_la)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame when the folder contains no CSV files.
data_ozone = (
    pd.concat(ozone_frames, ignore_index=True) if ozone_frames else pd.DataFrame()
)

# 'data' is the running frame that the later cells merge into.
data = data_ozone

In [None]:
# Quick sanity check: show the (rows, columns) shape of `data` at this point.
data.shape

In [None]:
folders = ['press', 'temp', 'rhdp']  # List of folders to process

for folder in folders:
    suffix = '_' + folder  # Suffix for the columns from this folder

    # List the CSV files up front so the progress message can report the real
    # total instead of a hard-coded count. Sorted because os.listdir() returns
    # entries in arbitrary order.
    csv_files = sorted(name for name in os.listdir(folder) if name.endswith('.csv'))
    total_files = len(csv_files)

    # Gather the Los Angeles rows of each file and concatenate once after the
    # loop; concatenating inside the loop re-copies all earlier rows each time.
    folder_frames = []
    file_counter = 0  # Stays 0 when the folder has no CSV files

    for file_counter, filename in enumerate(csv_files, start=1):
        file_path = os.path.join(folder, filename)
        temp_data = pd.read_csv(file_path)
        temp_data_la = temp_data[temp_data['County Name'] == 'Los Angeles']
        folder_frames.append(temp_data_la)

        # Print the file progress within the current folder
        print(f"Processing file '{filename}' in '{folder}' folder: File {file_counter}/{total_files}")

    # pd.concat raises ValueError on an empty list; fall back to an empty
    # DataFrame when no CSV files were found.
    folder_data = (
        pd.concat(folder_frames, ignore_index=True) if folder_frames else pd.DataFrame()
    )

    # Merge the combined folder data into the main 'data' DataFrame. Columns
    # that exist on both sides keep their name on the left and get this
    # folder's suffix on the right.
    # NOTE(review): if a site has more than one row per date on either side,
    # this outer merge pairs every matching row with every other and
    # multiplies rows -- confirm ('Site Num', 'Date Local') is unique per frame.
    data = pd.merge(data, folder_data, on=['Site Num', 'Date Local'], how='outer', suffixes=('', suffix))


In [None]:
# Quick sanity check: show the (rows, columns) shape of `data` at this point.
data.shape

In [None]:
wind_folder = 'wind'
suffix_speed = '_windspeed'
suffix_direction = '_winddirection'

# Per-file selections are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies all earlier rows on every file.
speed_frames = []
direction_frames = []

# Sorted because os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(wind_folder)):
    if filename.endswith('.csv'):
        file_path = os.path.join(wind_folder, filename)
        wind_df = pd.read_csv(file_path)

        # Build the shared masks once per file instead of once per filter.
        in_la = wind_df['County Name'] == 'Los Angeles'
        parameter = wind_df['Parameter Name']

        # Los Angeles rows reporting 'Wind Speed - Resultant'
        la_wind_speed_data = wind_df[in_la & (parameter == "Wind Speed - Resultant")]

        # Los Angeles rows reporting 'Wind Direction - Resultant'
        la_wind_direction_data = wind_df[in_la & (parameter == "Wind Direction - Resultant")]

        speed_frames.append(la_wind_speed_data)
        direction_frames.append(la_wind_direction_data)

# pd.concat raises ValueError on an empty list; fall back to an empty
# DataFrame when no CSV files were found.
wind_speed_data = (
    pd.concat(speed_frames, ignore_index=True) if speed_frames else pd.DataFrame()
)
wind_direction_data = (
    pd.concat(direction_frames, ignore_index=True) if direction_frames else pd.DataFrame()
)

# Merge Wind Speed and Wind Direction dataframes into the main dataframe.
# Columns present on both sides keep their name on the left and receive the
# wind suffix on the right.
# NOTE(review): if a site has more than one row per date on either side, these
# outer merges multiply rows -- confirm ('Site Num', 'Date Local') is unique.
data = pd.merge(data, wind_speed_data, on=['Site Num', 'Date Local'], how='outer', suffixes=('', suffix_speed))
data = pd.merge(data, wind_direction_data, on=['Site Num', 'Date Local'], how='outer', suffixes=('', suffix_direction))


In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing the combined dataset.
os.makedirs('./final_data', exist_ok=True)
data.to_csv('./final_data/alldata.csv', index=False)