# Importing Libraries

In [None]:
import os

import lightningchart as lc
import numpy as np
import pandas as pd

# Take the LightningChart license key from the LIGHTNINGCHART_LICENSE_KEY environment variable so
# the real key never has to be saved inside the notebook. The original "LICENSE_KEY" placeholder
# is kept as the fallback value when the variable is not set.
lc.set_license(os.environ.get("LIGHTNINGCHART_LICENSE_KEY", "LICENSE_KEY"))

# Loading and Processing Dataset

In [None]:
# Location of the India AQI dataset (a path relative to the current working directory).
file_path_india = "datasets/India_AQI.csv"

# Load the whole CSV into a DataFrame.
india_AQI = pd.read_csv(filepath_or_buffer=file_path_india)

# Preview the first five rows to check the columns and a sample of the values.
india_AQI.head(n=5)

In [None]:
# Replace each null with the most recent non-null value above it in the same column.
# NOTE(review): the fill follows the row order of the file, so a gap may inherit a value recorded
# for a different pollutant or station, and nulls in the very first rows stay null - confirm this
# is acceptable for the analysis.
india_AQI = india_AQI.ffill()

# Pollutants' Frequencies

In [None]:
# AQI bands for each pollutant's frequency chart.
# Every entry is (lower bound, upper bound, category label); the label is the key that
# assign_aqi_colors looks up in color_map to pick the bar color.
aqi_thresholds = {
    'PM2.5': [
        (0, 30, 'Good'), (31, 60, 'Satisfactory'), (61, 90, 'Moderate'),
        (91, 120, 'Poor'), (121, 250, 'Very Poor'), (251, float('inf'), 'Severe'),
    ],
    'PM10': [
        (0, 50, 'Good'), (51, 100, 'Satisfactory'), (101, 250, 'Moderate'),
        (251, 350, 'Poor'), (351, 430, 'Very Poor'), (431, float('inf'), 'Severe'),
    ],
    'NO2': [
        (0, 40, 'Good'), (41, 80, 'Satisfactory'), (81, 180, 'Moderate'),
        (181, 280, 'Poor'), (281, 400, 'Very Poor'), (401, float('inf'), 'Severe'),
    ],
    'OZONE': [
        (0, 50, 'Good'), (51, 100, 'Satisfactory'), (101, 168, 'Moderate'),
        (169, 208, 'Poor'), (209, 748, 'Very Poor'), (749, float('inf'), 'Severe'),
    ],
    'CO': [
        (0, 1.0, 'Good'), (1.1, 2.0, 'Satisfactory'), (2.1, 10.0, 'Moderate'),
        (10.1, 17.0, 'Poor'), (17.1, 34.0, 'Very Poor'), (34.1, float('inf'), 'Severe'),
    ],
    'SO2': [
        (0, 40, 'Good'), (41, 80, 'Satisfactory'), (81, 380, 'Moderate'),
        (381, 800, 'Poor'), (801, 1600, 'Very Poor'), (1601, float('inf'), 'Severe'),
    ],
    'NH3': [
        (0, 200, 'Good'), (201, 400, 'Satisfactory'), (401, 800, 'Moderate'),
        (801, 1200, 'Poor'), (1201, 1800, 'Very Poor'), (1801, float('inf'), 'Severe'),
    ],
}

In [None]:
# Bar color for each AQI category label, built by passing a color name to lc.Color.
color_map = {
    category: lc.Color(color_name)
    for category, color_name in {
        'Good': 'green',
        'Satisfactory': 'yellowgreen',
        'Moderate': 'yellow',
        'Poor': 'orange',
        'Very Poor': 'red',
        'Severe': 'darkred',
    }.items()
}

In [None]:
def assign_aqi_colors(chart, bin_edges, pollutant):
    """Color every histogram bar according to the AQI band of its bin mid-point.

    Reads two module-level names: ``aqi_thresholds`` (which must be in its
    ``{pollutant: [(lower, upper, category), ...]}`` form) and ``color_map``
    (category label -> color).

    Args:
        chart: Bar chart exposing ``set_bar_color(category=..., color=...)``.
        bin_edges: Histogram bin edges in ascending order; bar ``i`` spans
            ``bin_edges[i]`` to ``bin_edges[i + 1]``.
        pollutant: Key into ``aqi_thresholds``.

    Raises:
        KeyError: If ``pollutant`` is not a key of ``aqi_thresholds``.
    """
    thresholds = aqi_thresholds[pollutant]
    if not thresholds:
        return
    lowest_bound = thresholds[0][0]

    for left, right in zip(bin_edges[:-1], bin_edges[1:]):
        # Must reproduce the category label used when the chart data was built.
        category_label = f'{left:.1f} - {right:.1f}'
        level = (left + right) / 2  # Mid-point of the bin range

        # Mid-points below the first band keep the chart's default color.
        if level < lowest_bound:
            continue

        # The bands are listed in ascending order, so the first band whose upper bound is not
        # exceeded is the right one. Testing only the upper bound closes the gaps between
        # consecutive bands (e.g. 30 -> 31): a mid-point such as 30.5 used to match no band
        # and its bar was left uncolored.
        for _lower, upper, color_name in thresholds:
            if level <= upper:
                chart.set_bar_color(category=category_label, color=color_map[color_name])
                break

In [None]:
# Rows that report PM2.5. The other pollutant cells repeat this recipe with a different id.
pollutant_data = india_AQI[india_AQI['pollutant_id'] == 'PM2.5']

# Average level of each row. Missing values are dropped because np.histogram cannot derive a
# bin range from NaN, and forward filling cannot fill gaps in the very first rows.
y_values = pollutant_data['pollutant_avg'].dropna().tolist()

# Number of equal-width bins; adjust as needed.
num_bins = 20

# counts[i] is the number of readings between bin_edges[i] and bin_edges[i + 1].
counts, bin_edges = np.histogram(y_values, bins=num_bins)

# One bar per bin. The '<left> - <right>' label must match the label that assign_aqi_colors
# rebuilds from bin_edges, otherwise the colors are applied to the wrong (or no) bar.
histogram_data = [
    {'category': f'{left:.1f} - {right:.1f}', 'value': int(count)}
    for left, right, count in zip(bin_edges[:-1], bin_edges[1:], counts)
]

# Vertical bar chart with a dark theme; only the title differs between pollutants.
chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="Histogram of PM2.5 Levels' Frequency"
)

# Disable sorting so the bars stay in bin order.
chart.set_sorting('disabled')

chart.set_data(histogram_data)

# Color each bar by the AQI band of its bin mid-point.
assign_aqi_colors(chart, bin_edges, 'PM2.5')

# Titles for the category (X) and value (Y) axes.
chart.category_axis.set_title("Levels in intervals")
chart.value_axis.set_title("Frequency")

chart.open()

In [None]:
# PM10 readings. Missing values are dropped because np.histogram cannot derive a bin range
# from NaN, and forward filling cannot fill gaps in the very first rows.
pollutant_data = india_AQI[india_AQI['pollutant_id'] == 'PM10']
y_values = pollutant_data['pollutant_avg'].dropna().tolist()

num_bins = 20
counts, bin_edges = np.histogram(y_values, bins=num_bins)

# The '<left> - <right>' label must match the one assign_aqi_colors rebuilds from bin_edges.
histogram_data = [
    {'category': f'{left:.1f} - {right:.1f}', 'value': int(count)}
    for left, right, count in zip(bin_edges[:-1], bin_edges[1:], counts)
]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="Histogram of PM10 Levels' Frequency"
)

# Disable sorting so the bars stay in bin order.
chart.set_sorting('disabled')
chart.set_data(histogram_data)

assign_aqi_colors(chart, bin_edges, 'PM10')

chart.category_axis.set_title("Levels in intervals")
chart.value_axis.set_title("Frequency")

chart.open()

In [None]:
# NO2 readings. Missing values are dropped because np.histogram cannot derive a bin range
# from NaN, and forward filling cannot fill gaps in the very first rows.
pollutant_data = india_AQI[india_AQI['pollutant_id'] == 'NO2']
y_values = pollutant_data['pollutant_avg'].dropna().tolist()

num_bins = 20
counts, bin_edges = np.histogram(y_values, bins=num_bins)

# The '<left> - <right>' label must match the one assign_aqi_colors rebuilds from bin_edges.
histogram_data = [
    {'category': f'{left:.1f} - {right:.1f}', 'value': int(count)}
    for left, right, count in zip(bin_edges[:-1], bin_edges[1:], counts)
]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="Histogram of NO2 Levels' Frequency"
)

# Disable sorting so the bars stay in bin order.
chart.set_sorting('disabled')
chart.set_data(histogram_data)

assign_aqi_colors(chart, bin_edges, 'NO2')

chart.category_axis.set_title("Levels in intervals")
chart.value_axis.set_title("Frequency")

chart.open()


In [None]:
# NH3 readings. Missing values are dropped because np.histogram cannot derive a bin range
# from NaN, and forward filling cannot fill gaps in the very first rows.
pollutant_data = india_AQI[india_AQI['pollutant_id'] == 'NH3']
y_values = pollutant_data['pollutant_avg'].dropna().tolist()

num_bins = 20
counts, bin_edges = np.histogram(y_values, bins=num_bins)

# The '<left> - <right>' label must match the one assign_aqi_colors rebuilds from bin_edges.
histogram_data = [
    {'category': f'{left:.1f} - {right:.1f}', 'value': int(count)}
    for left, right, count in zip(bin_edges[:-1], bin_edges[1:], counts)
]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="Histogram of NH3 Levels' Frequency"
)

# Disable sorting so the bars stay in bin order.
chart.set_sorting('disabled')
chart.set_data(histogram_data)

assign_aqi_colors(chart, bin_edges, 'NH3')

chart.category_axis.set_title("Levels in intervals")
chart.value_axis.set_title("Frequency")

chart.open()

In [None]:
# SO2 readings. Missing values are dropped because np.histogram cannot derive a bin range
# from NaN, and forward filling cannot fill gaps in the very first rows.
pollutant_data = india_AQI[india_AQI['pollutant_id'] == 'SO2']
y_values = pollutant_data['pollutant_avg'].dropna().tolist()

num_bins = 20
counts, bin_edges = np.histogram(y_values, bins=num_bins)

# The '<left> - <right>' label must match the one assign_aqi_colors rebuilds from bin_edges.
histogram_data = [
    {'category': f'{left:.1f} - {right:.1f}', 'value': int(count)}
    for left, right, count in zip(bin_edges[:-1], bin_edges[1:], counts)
]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="Histogram of SO2 Levels' Frequency"
)

# Disable sorting so the bars stay in bin order.
chart.set_sorting('disabled')
chart.set_data(histogram_data)

assign_aqi_colors(chart, bin_edges, 'SO2')

chart.category_axis.set_title("Levels in intervals")
chart.value_axis.set_title("Frequency")

chart.open()

In [None]:
# CO readings. Missing values are dropped because np.histogram cannot derive a bin range
# from NaN, and forward filling cannot fill gaps in the very first rows.
pollutant_data = india_AQI[india_AQI['pollutant_id'] == 'CO']
y_values = pollutant_data['pollutant_avg'].dropna().tolist()

num_bins = 20
counts, bin_edges = np.histogram(y_values, bins=num_bins)

# The '<left> - <right>' label must match the one assign_aqi_colors rebuilds from bin_edges.
histogram_data = [
    {'category': f'{left:.1f} - {right:.1f}', 'value': int(count)}
    for left, right, count in zip(bin_edges[:-1], bin_edges[1:], counts)
]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="Histogram of CO Levels' Frequency"
)

# Disable sorting so the bars stay in bin order.
chart.set_sorting('disabled')
chart.set_data(histogram_data)

assign_aqi_colors(chart, bin_edges, 'CO')

chart.category_axis.set_title("Levels in intervals")
chart.value_axis.set_title("Frequency")

chart.open()

In [None]:
# OZONE readings. Missing values are dropped because np.histogram cannot derive a bin range
# from NaN, and forward filling cannot fill gaps in the very first rows.
pollutant_data = india_AQI[india_AQI['pollutant_id'] == 'OZONE']
y_values = pollutant_data['pollutant_avg'].dropna().tolist()

num_bins = 20
counts, bin_edges = np.histogram(y_values, bins=num_bins)

# The '<left> - <right>' label must match the one assign_aqi_colors rebuilds from bin_edges.
histogram_data = [
    {'category': f'{left:.1f} - {right:.1f}', 'value': int(count)}
    for left, right, count in zip(bin_edges[:-1], bin_edges[1:], counts)
]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="Histogram of OZONE Levels' Frequency"
)

# Disable sorting so the bars stay in bin order.
chart.set_sorting('disabled')
chart.set_data(histogram_data)

assign_aqi_colors(chart, bin_edges, 'OZONE')

chart.category_axis.set_title("Levels in intervals")
chart.value_axis.set_title("Frequency")

chart.open()

# Pollutants' Severity

In [None]:
# Inclusive upper limit of every AQI category, per pollutant. Readings above the
# 'Very Poor' limit are classified as 'Severe' by map_to_aqi_category.
# NOTE: this rebinds the name `aqi_thresholds` that assign_aqi_colors reads in its
# list-of-tuples form, so the histogram cells above fail if re-run after this cell.
aqi_thresholds = {
    'PM2.5': {
        'Good': 30, 'Satisfactory': 60, 'Moderate': 90, 'Poor': 120, 'Very Poor': 250,
    },
    'PM10': {
        'Good': 50, 'Satisfactory': 100, 'Moderate': 250, 'Poor': 350, 'Very Poor': 430,
    },
    'NO2': {
        'Good': 40, 'Satisfactory': 80, 'Moderate': 180, 'Poor': 280, 'Very Poor': 400,
    },
    'OZONE': {
        'Good': 50, 'Satisfactory': 100, 'Moderate': 168, 'Poor': 208, 'Very Poor': 748,
    },
    'CO': {
        'Good': 1.0, 'Satisfactory': 2.0, 'Moderate': 10.0, 'Poor': 17.0, 'Very Poor': 34.0,
    },
    'SO2': {
        'Good': 40, 'Satisfactory': 80, 'Moderate': 380, 'Poor': 800, 'Very Poor': 1600,
    },
    'NH3': {
        'Good': 200, 'Satisfactory': 400, 'Moderate': 800, 'Poor': 1200, 'Very Poor': 1800,
    },
}

# Function to map pollutant values to AQI categories based on thresholds
def map_to_aqi_category(value, thresholds):
    """Return the AQI category label for a single pollutant reading.

    Args:
        value: The pollutant level to classify.
        thresholds: Mapping with the keys 'Good', 'Satisfactory', 'Moderate',
            'Poor' and 'Very Poor', each holding the inclusive upper limit of
            that category.

    Returns:
        The first category whose upper limit is not exceeded, 'Severe' when the
        value is above every limit, or None when the value is missing.
    """
    # A missing reading fails every `<=` comparison and used to be reported as 'Severe';
    # returning None lets value_counts() skip it instead of inflating the worst category.
    if pd.isna(value):
        return None

    # Categories from best to worst, so the first limit that holds wins.
    for category in ('Good', 'Satisfactory', 'Moderate', 'Poor', 'Very Poor'):
        if value <= thresholds[category]:
            return category
    return 'Severe'
    
# Bar color for each AQI category, built by passing a color name to lc.Color.
colors = dict(zip(
    ('Good', 'Satisfactory', 'Moderate', 'Poor', 'Very Poor', 'Severe'),
    map(lc.Color, ('green', 'yellowgreen', 'yellow', 'orange', 'red', 'darkred')),
))

In [None]:
# Classify every PM2.5 reading into an AQI category. Rows that belong to other pollutants
# are not part of the selection and therefore stay NaN in the new column.
# The other pollutant cells repeat this recipe with a different pollutant id.
india_AQI['PM2.5_AQI_Category'] = india_AQI.loc[
    india_AQI['pollutant_id'] == 'PM2.5', 'pollutant_avg'
].apply(lambda x: map_to_aqi_category(x, aqi_thresholds['PM2.5']))

# Count the rows per category and list the categories from best to worst,
# showing 0 for categories that never occur.
ordered_categories = ['Good', 'Satisfactory', 'Moderate', 'Poor', 'Very Poor', 'Severe']
aqi_counts = india_AQI['PM2.5_AQI_Category'].value_counts().reindex(ordered_categories, fill_value=0)

# One bar per category; counts are converted to plain ints as in the histogram cells.
chart_data = [{'category': category, 'value': int(count)} for category, count in aqi_counts.items()]

# Vertical bar chart with a dark theme; only the title differs between pollutants.
chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="PM2.5 Severity Distribution"
)

# Disable sorting so the bars stay in AQI category order.
chart.set_sorting('disabled')

chart.set_data(chart_data)

# Give each bar the color of its AQI category.
for item in chart_data:
    chart.set_bar_color(category=item['category'], color=colors[item['category']])

chart.value_axis.set_title("Occurrence")
chart.open()

In [None]:
# Classify every PM10 reading; rows of other pollutants stay NaN in the new column.
india_AQI['PM10_AQI_Category'] = india_AQI.loc[
    india_AQI['pollutant_id'] == 'PM10', 'pollutant_avg'
].apply(lambda x: map_to_aqi_category(x, aqi_thresholds['PM10']))
# Rows per category, ordered from best to worst (0 for categories that never occur).
aqi_counts = india_AQI['PM10_AQI_Category'].value_counts().reindex(ordered_categories, fill_value=0)

chart_data = [{'category': category, 'value': int(count)} for category, count in aqi_counts.items()]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="PM10 Severity Distribution"
)

# Disable sorting so the bars stay in AQI category order.
chart.set_sorting('disabled')
chart.set_data(chart_data)

# Give each bar the color of its AQI category.
for item in chart_data:
    chart.set_bar_color(category=item['category'], color=colors[item['category']])

chart.value_axis.set_title("Occurrence")
chart.open()

In [None]:
# Classify every NO2 reading; rows of other pollutants stay NaN in the new column.
india_AQI['NO2_AQI_Category'] = india_AQI.loc[
    india_AQI['pollutant_id'] == 'NO2', 'pollutant_avg'
].apply(lambda x: map_to_aqi_category(x, aqi_thresholds['NO2']))
# Rows per category, ordered from best to worst (0 for categories that never occur).
aqi_counts = india_AQI['NO2_AQI_Category'].value_counts().reindex(ordered_categories, fill_value=0)

chart_data = [{'category': category, 'value': int(count)} for category, count in aqi_counts.items()]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="NO2 Severity Distribution"
)

# Disable sorting so the bars stay in AQI category order.
chart.set_sorting('disabled')
chart.set_data(chart_data)

# Give each bar the color of its AQI category.
for item in chart_data:
    chart.set_bar_color(category=item['category'], color=colors[item['category']])

chart.value_axis.set_title("Occurrence")
chart.open()

In [None]:
# Classify every NH3 reading; rows of other pollutants stay NaN in the new column.
india_AQI['NH3_AQI_Category'] = india_AQI.loc[
    india_AQI['pollutant_id'] == 'NH3', 'pollutant_avg'
].apply(lambda x: map_to_aqi_category(x, aqi_thresholds['NH3']))
# Rows per category, ordered from best to worst (0 for categories that never occur).
aqi_counts = india_AQI['NH3_AQI_Category'].value_counts().reindex(ordered_categories, fill_value=0)

chart_data = [{'category': category, 'value': int(count)} for category, count in aqi_counts.items()]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="NH3 Severity Distribution"
)

# Disable sorting so the bars stay in AQI category order.
chart.set_sorting('disabled')
chart.set_data(chart_data)

# Give each bar the color of its AQI category.
for item in chart_data:
    chart.set_bar_color(category=item['category'], color=colors[item['category']])

chart.value_axis.set_title("Occurrence")
chart.open()

In [None]:
# Classify every SO2 reading; rows of other pollutants stay NaN in the new column.
india_AQI['SO2_AQI_Category'] = india_AQI.loc[
    india_AQI['pollutant_id'] == 'SO2', 'pollutant_avg'
].apply(lambda x: map_to_aqi_category(x, aqi_thresholds['SO2']))
# Rows per category, ordered from best to worst (0 for categories that never occur).
aqi_counts = india_AQI['SO2_AQI_Category'].value_counts().reindex(ordered_categories, fill_value=0)

chart_data = [{'category': category, 'value': int(count)} for category, count in aqi_counts.items()]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="SO2 Severity Distribution"
)

# Disable sorting so the bars stay in AQI category order.
chart.set_sorting('disabled')
chart.set_data(chart_data)

# Give each bar the color of its AQI category.
for item in chart_data:
    chart.set_bar_color(category=item['category'], color=colors[item['category']])

chart.value_axis.set_title("Occurrence")
chart.open()

In [None]:
# Classify every CO reading; rows of other pollutants stay NaN in the new column.
india_AQI['CO_AQI_Category'] = india_AQI.loc[
    india_AQI['pollutant_id'] == 'CO', 'pollutant_avg'
].apply(lambda x: map_to_aqi_category(x, aqi_thresholds['CO']))
# Rows per category, ordered from best to worst (0 for categories that never occur).
aqi_counts = india_AQI['CO_AQI_Category'].value_counts().reindex(ordered_categories, fill_value=0)

chart_data = [{'category': category, 'value': int(count)} for category, count in aqi_counts.items()]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="CO Severity Distribution"
)

# Disable sorting so the bars stay in AQI category order.
chart.set_sorting('disabled')
chart.set_data(chart_data)

# Give each bar the color of its AQI category.
for item in chart_data:
    chart.set_bar_color(category=item['category'], color=colors[item['category']])

chart.value_axis.set_title("Occurrence")
chart.open()

In [None]:
# Classify every OZONE reading; rows of other pollutants stay NaN in the new column.
india_AQI['OZONE_AQI_Category'] = india_AQI.loc[
    india_AQI['pollutant_id'] == 'OZONE', 'pollutant_avg'
].apply(lambda x: map_to_aqi_category(x, aqi_thresholds['OZONE']))
# Rows per category, ordered from best to worst (0 for categories that never occur).
aqi_counts = india_AQI['OZONE_AQI_Category'].value_counts().reindex(ordered_categories, fill_value=0)

chart_data = [{'category': category, 'value': int(count)} for category, count in aqi_counts.items()]

chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Dark,
    title="OZONE Severity Distribution"
)

# Disable sorting so the bars stay in AQI category order.
chart.set_sorting('disabled')
chart.set_data(chart_data)

# Give each bar the color of its AQI category.
for item in chart_data:
    chart.set_bar_color(category=item['category'], color=colors[item['category']])

chart.value_axis.set_title("Occurrence")
chart.open()

# State-by-State Comparison

In [None]:
# Mean level of every pollutant within every state: states become the rows, pollutants the
# columns, and any cell without a mean is set to 0.
state_comparison = (
    india_AQI
    .groupby(['state', 'pollutant_id'])['pollutant_avg']
    .mean()
    .unstack()
    .fillna(0)
)

# Axis categories (states) and stacked segments (pollutants).
categories = state_comparison.index.tolist()
sub_categories = state_comparison.columns.tolist()

# set_data_stacked takes one entry per pollutant holding that pollutant's value for each state,
# in the same order as `categories`.
data_for_chart = [
    {'subCategory': pollutant, 'values': levels.tolist()}
    for pollutant, levels in state_comparison.items()
]

# Vertical bar chart using the black theme.
chart = lc.BarChart(
    vertical=True,
    theme=lc.Themes.Black,
    title='Average Pollutant Levels by State'
)

chart.set_data_stacked(categories=categories, data=data_for_chart)

# Hide the per-bar value labels and rotate the labels by -90.
chart.set_value_label_display_mode('hidden')
chart.set_label_rotation(-90)

# Attach a legend for the chart, then display it.
chart.add_legend().add(chart)
chart.open()

# Box Plot

In [None]:
# Per-pollutant containers: readings inside the outlier bounds, and the outliers beyond them.
clean_data = {}
outliers_data = {}

# Every distinct pollutant id, in order of first appearance in the data.
pollutant_types = india_AQI['pollutant_id'].unique()

for pollutant in pollutant_types:
    pollutant_data = india_AQI[india_AQI['pollutant_id'] == pollutant]['pollutant_avg'].dropna().tolist()

    # Outlier bounds: 1.5 interquartile ranges below the first / above the third quartile.
    readings = pd.Series(pollutant_data)
    q1 = readings.quantile(0.25)
    q3 = readings.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Sort each reading into exactly one of the two lists.
    outliers = []
    non_outliers = []
    for x in pollutant_data:
        if x < lower_bound or x > upper_bound:
            outliers.append(x)
        elif lower_bound <= x <= upper_bound:
            non_outliers.append(x)

    outliers_data[pollutant] = outliers
    clean_data[pollutant] = non_outliers

# Box plot of the readings that remain after removing the outliers.
chart = lc.BoxPlot(
    data=clean_data,
    theme=lc.Themes.Dark,
    title='Pollutant Distribution Across Different Pollutants (Average Values)',
    xlabel='Pollutant Type',
    ylabel='Average Pollutant Frequency'
)

# X position at which each pollutant's outlier markers are drawn.
x_coordinates = {
    'NO2': 4.5,
    'NH3': 6.5,
    'SO2': 8.5,
    'CO': 10.5,
    'OZONE': 12.5,
}

# NOTE(review): pollutants missing from x_coordinates fall back to their index in
# pollutant_types, which does not follow the 4.5 / 6.5 / ... spacing above - confirm that
# their markers line up with their boxes.
fallback_positions = {name: position for position, name in enumerate(pollutant_types.tolist())}

# Draw the outliers of every pollutant as a separate series of red triangles.
for pollutant, y_values in outliers_data.items():
    if pollutant in x_coordinates:
        x_value = x_coordinates[pollutant]
    else:
        x_value = fallback_positions[pollutant]
    point_count = len(y_values)

    series = chart.add_point_series(sizes=True, rotations=True, lookup_values=True)
    series.append_samples(
        x_values=[x_value] * point_count,  # Same X coordinate for every outlier of this pollutant
        y_values=y_values,
        sizes=[10] * point_count,  # Fixed marker size
        lookup_values=[1] * point_count  # Same lookup value for every point
    )
    series.set_individual_point_color_enabled()
    series.set_point_color(lc.Color('red'))
    series.set_point_shape("triangle")

# Make the cursor show the nearest data point.
chart.set_cursor_mode("show-nearest")

chart.open()