# Analysis of Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
import numpy as np
import os

In [None]:
# Source CSV for the main dataset; later cells read its 'Date Local' and
# 'Arithmetic Mean' columns.
ALL_DATA_CSV = './final_data/alldata.csv'
data = pd.read_csv(ALL_DATA_CSV)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
import numpy as np


# Parse the 'Date Local' column once; the resulting datetime column is reused
# by the cells below.
data['date'] = pd.to_datetime(data['Date Local'])

# Aggregating the data by date: average 'Arithmetic Mean' over all rows that
# share a date, giving one value per observed date.
daily_mean = data.groupby('date')['Arithmetic Mean'].mean()

# Histogram of daily mean values (observed dates only, nothing interpolated)
plt.figure(figsize=(10, 6))
sns.histplot(daily_mean, bins=30, kde=True)
plt.title('Histogram of Daily Arithmetic Mean Ozone Concentration')
plt.xlabel('Arithmetic Mean Ozone Concentration')
plt.ylabel('Frequency')
plt.show()

# Seasonal Trend Decomposition
# seasonal_decompose treats its input as evenly spaced and raises on missing
# values, so period=365 only means "one year" when there is exactly one row
# per calendar day. Put the series on a gap-free daily grid and interpolate
# any missing days first; when the data are already complete this leaves the
# values unchanged.
daily_series = daily_mean.resample('D').mean().interpolate(limit_direction='both')
decomposed = seasonal_decompose(daily_series, model='additive', period=365)

# Plotting the decomposed components, one stacked panel per component.
# Note: 'Observed' is the regularized daily series passed to the decomposition.
plt.figure(figsize=(12, 8))
components = [
    (decomposed.observed, 'Observed'),
    (decomposed.trend, 'Trend'),
    (decomposed.seasonal, 'Seasonal'),
    (decomposed.resid, 'Residual'),
]
for position, (series, label) in enumerate(components, start=1):
    plt.subplot(4, 1, position)
    plt.plot(series, label=label)
    plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Split out the rows whose date falls in 2000 and in 2022.
data['date'] = pd.to_datetime(data['date'])
row_years = data['date'].dt.year
data_2000 = data[row_years == 2000]
data_2022 = data[row_years == 2022]

# Side-by-side histograms of 'Arithmetic Mean', one panel per year.
year_panels = [
    (data_2000, 'blue', 'Histogram of Ozone Concentration for 2000'),
    (data_2022, 'green', 'Histogram of Ozone Concentration for 2022'),
]

plt.figure(figsize=(14, 6))
for panel_index, (year_rows, bar_color, panel_title) in enumerate(year_panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.hist(year_rows['Arithmetic Mean'], bins=30, alpha=0.7,
             color=bar_color, edgecolor='black')
    plt.title(panel_title)
    plt.xlabel('Arithmetic Mean Ozone Concentration')
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Source CSV for the table whose pairwise correlations are plotted below.
MIN_DATA_CSV = './final_data/new_min_data.csv'
min_data = pd.read_csv(MIN_DATA_CSV)

In [None]:
# Display names for the heatmap axes. They are applied positionally, so they
# must stay in the same order as the columns of min_data.
# NOTE(review): assumes min_data has exactly these five columns in this order - confirm.
indicator_labels = ['Ozone', 'Temperature', 'Relative Humidity', 'Wind Direction', 'Wind Speed']

# Pairwise correlation between the columns of min_data
correlation_matrix = min_data.corr()

# Annotated heatmap of the correlation matrix (values shown to two decimals)
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    xticklabels=indicator_labels,
    yticklabels=indicator_labels,
)
plt.title('Correlation Analysis of Air Quality Indicators')
plt.show()