### Data Visualization

In [None]:
# Materialise the Spark DataFrame as a pandas DataFrame
data_pd = data.toPandas()

# Parse 'Current_Date' into datetime values (later cells rely on the .dt accessor)
data_pd = data_pd.assign(Current_Date=pd.to_datetime(data_pd['Current_Date']))

In [None]:
import matplotlib.pyplot as plt

# Filter data for Colombo Proper. The .copy() makes colombo_data an independent
# frame, so adding the 'Year' column below does not assign into a slice of
# data_pd (which is what triggers pandas' SettingWithCopyWarning).
colombo_data = data_pd[data_pd['Location'] == 'Colombo Proper'].copy()

# Extract year from the 'Current_Date' column
colombo_data['Year'] = colombo_data['Current_Date'].dt.year

# Group data by year and calculate the mean HCHO reading for each year
colombo_yearly_mean = colombo_data.groupby('Year')['HCHO_reading'].mean()

# Plot the yearly means as a line with one marker per year
plt.figure(figsize=(10, 6))
colombo_yearly_mean.plot(marker='o', linestyle='-')
plt.title('HCHO Level Changes in Colombo Proper Over the Years')
plt.xlabel('Year')
plt.ylabel('Mean HCHO Reading')
plt.grid(True)
plt.xticks(colombo_yearly_mean.index)  # Set x ticks to be the years
plt.tight_layout()
plt.show()


In [None]:
# Filter data for Colombo Proper. The .copy() makes colombo_data an independent
# frame, so the column assignments below do not write into a slice of data_pd
# (which is what triggers pandas' SettingWithCopyWarning).
colombo_data = data_pd[data_pd['Location'] == 'Colombo Proper'].copy()

# Extract year and month from the 'Current_Date' column
colombo_data['Year'] = colombo_data['Current_Date'].dt.year
colombo_data['Month'] = colombo_data['Current_Date'].dt.month

# Create a new column holding the calendar month as a monthly Period
colombo_data['Year_Month'] = colombo_data['Current_Date'].dt.to_period('M')

# Group data by year-month and calculate the mean HCHO reading for each month
colombo_monthly_mean = colombo_data.groupby('Year_Month')['HCHO_reading'].mean()

# Plot the monthly means as a line with one marker per month
plt.figure(figsize=(14, 6))
colombo_monthly_mean.plot(marker='o', linestyle='-')
plt.title('HCHO Level Changes in Colombo Proper Over Time')
plt.xlabel('Year-Month')
plt.ylabel('Mean HCHO Reading')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming 'data_pd' is your pandas DataFrame
unique_locations = data_pd['Location'].unique()

# Create a single plot
plt.figure(figsize=(14, 7))

# Loop through each unique location
for location in unique_locations:
    location_data = data_pd[data_pd['Location'] == location]
    location_monthly_mean = location_data.groupby(location_data['Current_Date'].dt.to_period('M'))['HCHO_reading'].mean()
    
    # Convert index to string for plotting
    location_monthly_mean.index = location_monthly_mean.index.strftime('%Y-%m')
    
    plt.plot(location_monthly_mean.index, location_monthly_mean, marker='o', linestyle='-', label=location)

# Add labels and legend
plt.title('Mean HCHO Level Changes Over Time')
plt.xlabel('Year-Month')
plt.ylabel('Mean HCHO Reading')
plt.grid(True)
plt.xticks(rotation=45)
plt.legend()

# Show plot
plt.tight_layout()
plt.show()

In [None]:
def plot_hcho_levels(data_pd):
    """Draw one monthly-mean HCHO line chart per location.

    Args:
        data_pd: pandas DataFrame with a 'Location' column, a datetime-typed
            'Current_Date' column (the ``.dt`` accessor is used on it) and an
            'HCHO_reading' column.

    Returns:
        list: one matplotlib Figure per unique location, in the order
        ``data_pd['Location'].unique()`` yields them. Earlier versions
        appended the ``matplotlib.pyplot`` module itself on every iteration,
        so every list entry was the same module object rather than a plot.
    """
    all_plots = []

    for location in data_pd['Location'].unique():
        # Filter data for the current location
        location_data = data_pd[data_pd['Location'] == location]

        # Group on a derived month key rather than adding helper columns to
        # the filtered slice; assigning into that slice raises pandas'
        # SettingWithCopyWarning. The rename keeps the index name 'Year_Month'.
        year_month = location_data['Current_Date'].dt.to_period('M').rename('Year_Month')
        location_monthly_mean = location_data.groupby(year_month)['HCHO_reading'].mean()

        # Keep a handle on the figure so it can be returned to the caller
        fig = plt.figure(figsize=(14, 6))
        location_monthly_mean.plot(marker='o', linestyle='-')
        plt.title(f'HCHO Level Changes in {location} Over Time')
        plt.xlabel('Year-Month')
        plt.ylabel('Mean HCHO Reading')
        plt.grid(True)
        plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
        plt.tight_layout()

        all_plots.append(fig)

    return all_plots

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Open a 14x8 figure for the histogram to draw into
plt.figure(figsize=(14, 8))

# Stacked histogram of HCHO readings, coloured by location, with KDE curves;
# histplot hands back the axes it drew on, which is used to set the title
histogram_axes = sns.histplot(
    data=data_pd,
    x='HCHO_reading',
    hue='Location',
    kde=True,
    multiple='stack',
)
histogram_axes.set_title('Distribution of HCHO Readings by Location')

# Render the figure
plt.show()

In [None]:
def show_plot(
    data,
    figsize=(15, 10),
    color="blue",
    linestyle="-",
    xlabel="Year",
    ylabel="HCHO Levels",
    label=None,
):
    """Plot 'HCHO_reading' against 'Current_Date' on a new figure.

    Args:
        data: object indexable by 'Current_Date' and 'HCHO_reading'
            (the callers in this notebook pass a pandas DataFrame).
        figsize: figure size passed to ``plt.figure``.
        color: line colour.
        linestyle: matplotlib line style.
        xlabel: x-axis label.
        ylabel: y-axis label.
        label: legend entry for the line; when None no legend is drawn.

    Returns:
        None. The figure is created but ``plt.show()`` is not called here.
    """
    plt.figure(figsize=figsize)
    plt.plot(data['Current_Date'], data['HCHO_reading'], color=color, linestyle=linestyle, label=label)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    # With no labelled line the legend would be empty and matplotlib warns,
    # so only draw it when the caller supplied a label.
    if label is not None:
        plt.legend(loc=2)  # loc=2 is matplotlib's code for 'upper left'

In [None]:
# Rows for Colombo Proper only
colombo_proper_data = data_pd.loc[data_pd['Location'] == 'Colombo Proper']

# Keep just the two columns show_plot reads
colombo_proper_data_selected = colombo_proper_data.loc[:, ['Current_Date', 'HCHO_reading']]
colombo_proper_data_selected.head()

# Raw HCHO readings over time, labelled 'HCHO' in the legend
show_plot(colombo_proper_data_selected, label='HCHO')

In [None]:
# Import necessary libraries for time series analysis
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose

# Perform time series analysis for each city
def analyze_trends(location, period=365):
    """Decompose one location's HCHO series and plot its components.

    Reads the notebook-level ``data_pd`` frame, keeps the rows for
    ``location``, runs an additive ``seasonal_decompose`` on 'HCHO_reading'
    and draws the original series, trend, seasonal and residual components
    as four stacked subplots.

    Args:
        location: value matched against ``data_pd['Location']``.
        period: number of observations per seasonal cycle passed to
            ``seasonal_decompose``. The default 365 presumably means one
            year of daily readings -- TODO confirm the sampling frequency.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Select rows and columns in one step and copy the result, so the column
    # assignment below does not write into a slice of data_pd
    # (SettingWithCopyWarning).
    analyze_data = data_pd.loc[
        data_pd['Location'] == location, ['Current_Date', 'HCHO_reading']
    ].copy()

    # Convert 'Current_Date' column to datetime
    analyze_data['Current_Date'] = pd.to_datetime(analyze_data['Current_Date'])

    # Index by date and order chronologically: the decomposition works on the
    # positional order of the observations, so unsorted rows would distort it.
    # mergesort is stable, keeping the relative order of equal dates.
    analyze_data = analyze_data.set_index('Current_Date').sort_index(kind='mergesort')

    # Perform seasonal decomposition
    decomposition = seasonal_decompose(analyze_data['HCHO_reading'], model='additive', period=period)

    # Series to draw, top to bottom, with their legend labels
    components = [
        (analyze_data['HCHO_reading'], 'Original'),
        (decomposition.trend, 'Trend'),
        (decomposition.seasonal, 'Seasonal'),
        (decomposition.resid, 'Residual'),
    ]

    # Plot components as a 4-row, 1-column grid
    plt.figure(figsize=(12, 8))
    for row, (series, label) in enumerate(components, start=1):
        plt.subplot(4, 1, row)
        plt.plot(series, label=label)
        plt.legend(loc='best')
    plt.tight_layout()
    plt.show()

    

#### Seasonal Trends Old

In [None]:
# Keep the Colombo Proper rows of final_data (presumably a Spark DataFrame),
# project the date and HCHO columns, and convert the result to pandas
colombo_data_new = (
    final_data
    .filter(final_data['Location'] == 'Colombo Proper')
    .select('Current_Date', 'HCHO_reading')
    .toPandas()
)

In [None]:
# Pass the HCHO readings of colombo_data_new to analyze_time_series and unpack
# the three series it returns. analyze_time_series is defined in a later cell
# of this notebook, so that cell has to be run before this one.
colombo_trend, colombo_seasonal, colombo_long_term_trend = analyze_time_series(colombo_data_new['HCHO_reading'])

In [None]:
# Draw the trend, seasonal and long-term-trend components, one figure each.
# Each entry is (series to plot, legend label, figure title).
for _component, _legend_label, _figure_title in (
    (colombo_trend, 'Trend', 'Trend Component'),
    (colombo_seasonal, 'Seasonal', 'Seasonal Component'),
    (colombo_long_term_trend, 'Long-term Trend', 'Long-term Trend Component'),
):
    plt.figure(figsize=(10, 6))
    plt.plot(_component, label=_legend_label)
    plt.title(_figure_title)
    plt.xlabel('Time')
    plt.ylabel('HCHO Reading')
    plt.legend()
    plt.show()

In [None]:
# Import necessary libraries for time series analysis
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose

# Perform time series analysis for each city
def analyze_time_series(city_data, period=12, order=(1, 1, 1)):
    """Decompose a series and fit an ARIMA model to its residual.

    Args:
        city_data: the series handed to ``seasonal_decompose`` (the caller in
            this notebook passes a pandas 'HCHO_reading' column).
        period: number of observations per seasonal cycle. The default 12
            matches a yearly cycle only if ``city_data`` holds one value per
            month -- TODO confirm against the data's sampling frequency.
        order: ``(p, d, q)`` order of the ARIMA model fitted to the residual
            component. The default is an example order; adjust as needed.

    Returns:
        tuple: ``(trend, seasonal, long_term_trend)`` where ``trend`` and
        ``seasonal`` come from the decomposition and ``long_term_trend`` is
        the ``fittedvalues`` of the ARIMA model fitted to the residual.
    """
    # Decompose time series into trend, seasonal, and residual components
    decomposition = seasonal_decompose(city_data, period=period)

    # Fit ARIMA to the residual component; its fitted values are what this
    # function reports as the "long-term trend"
    arima_result = ARIMA(decomposition.resid, order=order).fit()

    return decomposition.trend, decomposition.seasonal, arima_result.fittedvalues

### COVID-19 Lockdown Impact Analysis

In [2]:
from datetime import datetime

def covid_impact(location, lockdown_start="2020-03-15", lockdown_end="2020-05-15"):
    """Plot a location's monthly mean HCHO with the lockdown window marked.

    Reads the notebook-level ``data_pd`` frame, averages 'HCHO_reading' per
    calendar month for ``location`` and draws the result with dashed vertical
    lines at the lockdown start (red) and end (green).

    Args:
        location: value matched against ``data_pd['Location']``.
        lockdown_start: lockdown start date as a 'YYYY-MM-DD' string.
        lockdown_end: lockdown end date as a 'YYYY-MM-DD' string.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Raises:
        ValueError: if either date string does not match '%Y-%m-%d'.
    """
    # COVID-19 lockdown period
    lockdown_start_date = datetime.strptime(lockdown_start, "%Y-%m-%d")
    lockdown_end_date = datetime.strptime(lockdown_end, "%Y-%m-%d")

    # Filter data for location
    location_data = data_pd[data_pd['Location'] == location]

    # Group on a derived month key rather than adding helper columns to the
    # filtered slice; assigning into that slice raises pandas'
    # SettingWithCopyWarning. The rename keeps the index name 'Year_Month'.
    year_month = location_data['Current_Date'].dt.to_period('M').rename('Year_Month')
    location_monthly_mean = location_data.groupby(year_month)['HCHO_reading'].mean()

    # Plotting
    plt.figure(figsize=(10, 4))
    location_monthly_mean.plot(marker='o', linestyle='-')

    # Add vertical markers for lockdown start and end dates
    plt.axvline(lockdown_start_date, color='red', linestyle='--', label='Lockdown Start')
    plt.axvline(lockdown_end_date, color='green', linestyle='--', label='Lockdown End')

    # Add labels and title
    plt.title(f'COVID-19 Lockdown Impact Analysis - {location}')
    plt.xlabel('Year')
    plt.ylabel('Mean HCHO Reading')
    plt.grid(True)
    plt.tight_layout()

    # Show legend
    plt.legend()
    plt.show()

In [1]:
from statsmodels.tsa.arima.model import ARIMA

def fit_arima(location, order=(3, 0, 3)):
    """Fit ARIMA on 80% of a location's HCHO series and plot the forecast.

    Reads the notebook-level ``model_pd`` frame, splits the location's
    'HCHO_reading' values 80/20 by position into train and test, fits an
    ARIMA model on the train part, forecasts ``len(test)`` steps and plots
    the forecast against the test values.

    Args:
        location: value matched against ``model_pd['Location']``.
        order: ``(p, d, q)`` order of the ARIMA model.

    Returns:
        None. The model summary and predictions are printed and the figure
        is shown with ``plt.show()``.
    """
    fit_data = model_pd[model_pd['Location'] == location]['HCHO_reading']

    # Split data into train and test sets (first 80% / last 20% by position)
    train_size = int(len(fit_data) * 0.8)
    train, test = fit_data[:train_size], fit_data[train_size:]

    # NOTE(review): auto_arima is not imported in this cell, so it must come
    # from elsewhere in the notebook (presumably pmdarima -- confirm). Its
    # result is not used to choose the order below; the call is kept only for
    # the search trace it prints.
    auto_arima(train, trace=True, suppress_warnings=True)

    model_fit = ARIMA(train, order=order).fit()

    print(model_fit.summary())

    pred = model_fit.forecast(steps=len(test))
    # Attach test's index positionally. forecast() can return a Series that
    # carries its own index labels; pd.Series(pred, index=test.index) would
    # then reindex by label and produce NaN wherever the labels differ.
    pred = pd.Series(pred)
    pred.index = test.index
    print(pred)

    plt.figure(figsize=(10, 6))
    plt.plot(test, color='blue', label='Actual Test Values')
    plt.plot(pred, color='red', label='ARIMA Predictions')
    plt.legend()
    plt.title('ARIMA Model Predictions vs Actual Test Values')
    plt.xlabel('Date')
    plt.ylabel('HCHO Reading')
    plt.show()

In [None]:
# Run the forecaster once per distinct location and print what it returns.
# train_arima_and_predict is not defined in this part of the notebook.
locations = model_pd['Location'].unique()
for location in locations:
    predicted_values = train_arima_and_predict(model_pd, location)
    print(
        f"Predicted HCHO values for {location} in December 2023:",
        predicted_values,
        sep="\n",
    )