In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load all 5 datasets and tag each one with its city name.
from pathlib import Path

# Folder that holds the five city CSV files; edit this one line to relocate the data.
DATA_DIR = Path(r"C:\Users\payal\Downloads")
CITY_NAMES = ["Beijing", "Chengdu", "Guangzhou", "Shanghai", "Shenyang"]


def load_city(name, data_dir=DATA_DIR):
    """Read ``<name>.csv`` from ``data_dir`` and return it with a 'city' column set to ``name``."""
    return pd.read_csv(Path(data_dir) / f"{name}.csv").assign(city=name)


# Keep the city1..city5 names because the merge cell refers to them.
city1, city2, city3, city4, city5 = (load_city(name) for name in CITY_NAMES)

### Que 1) Perform data cleaning, impute missing values, do feature engineering for “Season” feature map it with seasons.

### TASK 1: Merge all 5 datasets

In [None]:
# Stack the five city frames into one table with a fresh 0..n-1 index
chinese_cities = pd.concat(
    objs=(city1, city2, city3, city4, city5),
    ignore_index=True,
)
chinese_cities.sample(n=5)

In [None]:
# Assemble a datetime 'date' column from the year / month / day columns
chinese_cities["date"] = pd.to_datetime(
    chinese_cities.loc[:, ["year", "month", "day"]]
)

In [None]:
# Peek at three random rows to confirm the new 'date' column
chinese_cities.sample(n=3)

In [None]:
# Per-column dtypes and non-null counts are printed by info();
# the (rows, columns) shape is shown as the cell's output
chinese_cities.info()
chinese_cities.shape

### TASK 2: Perform Data Cleaning

In [None]:
# Number of missing entries in each column of the merged frame
chinese_cities.isna().sum()

### TASK 3: Impute missing values

In [None]:
# Impute missing values by back-filling: each missing entry takes the next valid
# value further down the same column. `bfill()` returns a new frame, so the merged
# `chinese_cities` frame is left untouched. It is used here instead of
# `interpolate(method="bfill")`, which is deprecated in recent pandas releases.
# NOTE(review): the fill runs over the concatenated frame, so a gap at the end of one
# city's rows is filled from the first rows of the next city - confirm this is acceptable.
imputed_df = chinese_cities.bfill()
imputed_df.head()

In [None]:
# Missing entries that remain in each column after imputation
imputed_df.isna().sum()

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged)
duplicate = imputed_df.loc[imputed_df.duplicated(keep="first")]
print("Duplicate Rows :")
duplicate

### TASK 4: Do feature engineering for "Season" and feature map it with seasons

In [None]:
# Distinct values currently stored in the "season" column
unique_values = pd.unique(imputed_df["season"])

# Header line followed by the array of values
print("Unique Seasons", unique_values, sep="\n")

In [None]:
# Map the values of the "season" feature to season names.
# The guard keeps the cell safe to re-run: once the column already holds names,
# mapping it again would match none of the numeric keys and turn every value into NaN.
seasons = {1: "Spring", 2: "Summer", 3: "Fall", 4: "Winter"}
already_named = imputed_df["season"].isin(list(seasons.values())).any()
if not already_named:
    imputed_df["season"] = imputed_df["season"].map(seasons)

In [None]:
# Peek at five random rows to confirm the season names
imputed_df.sample(n=5)

### Que 2) Generate a line chart showing the temperature (y-axis) and dates (x-axis) for one of the five cities. Is there a noticeable seasonal pattern?

In [None]:
# Keep only the rows recorded for Beijing
filtered_df = imputed_df.loc[imputed_df["city"].eq("Beijing")]

# Temperature against date as a line chart
fig = px.line(data_frame=filtered_df, x="date", y="TEMP")

# Title and axis labels, then render
fig.update_layout(title_text="Temperature Distribution over time")
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Temperature (°C)")
fig.show()

Insights: The line plot above shows the temperature (°C) over the years for the city of Beijing. The same pattern repeats every year, which indicates a seasonal cycle, with temperatures ranging from roughly -19°C to 42°C.

### Que 3) Create a boxplot showing the temperature values aggregated by month for one of the five cities. (Use plotly)

In [None]:
# Filter the DataFrame to include only records with the name 'Shenyang'
filtered_df = imputed_df[imputed_df["city"] == "Shenyang"]

# One box per month; boxmean="sd" overlays the mean and standard deviation.
# A single figure is built and shown explicitly: a figure created mid-cell without
# being assigned or shown is never rendered by the notebook.
fig = go.Figure()
fig.add_trace(
    go.Box(
        x=filtered_df["month"],
        y=filtered_df["TEMP"],
        marker_color="purple",
        boxmean="sd",
    )
)

# Add labels and title
fig.update_xaxes(title_text="Month")
fig.update_yaxes(title_text="Temperature (°C)")
fig.update_layout(title_text="Temperature Distribution by Month for Shenyang")

# Display the plot
fig.show()

Insights: The box plot shows the temperature distribution by month for the city of Shenyang. Temperatures vary within each month; they are highest in July and August and lowest in December and January.

### Que 4) Create a heatmap to generate correlation between numeric features.

In [None]:
# Numerical features of the imputed dataset to correlate
num_features = ["year", "PM", "DEWP", "HUMI", "PRES", "TEMP", "Iws", "precipitation", "Iprec"]

# Pairwise correlation matrix of those features
corr_matrix = imputed_df[num_features].corr()

# Heatmap of the correlation matrix. vmin/vmax pin the diverging palette to the full
# correlation range so the neutral colour corresponds to zero correlation; without
# them the colours are scaled to the smallest and largest values in the matrix.
plt.figure(figsize=(8, 5))
sns.heatmap(corr_matrix, annot=True, cmap="RdBu", fmt=".1f", vmin=-1, vmax=1)
plt.title("Correlation of Numerical Features using Heatmap")
plt.show()

Insights: The heatmap shows the correlation between pairs of numerical features, with positive correlations shown in blue and negative correlations shown in red. The color scale ranges from -1 to 1, where -1 indicates a perfectly negative correlation, 0 indicates no correlation, and 1 indicates a perfectly positive correlation. There is a strong positive correlation between humidity (HUMI) and dew point (DEWP), and also between "precipitation" and "Iprec". There is a strong negative correlation between temperature (TEMP) and pressure (PRES).

### Que 5) Create a scatter plot using two features of your choice. Choose a pair of features that you believe have some correlation between them. Based on your visualization, do they seem to be correlated? (Use plotly)

In [None]:
# Temperature against season, one colour per season
fig = px.scatter(data_frame=imputed_df, x="season", y="TEMP", color="season")

# Title and axis labels, then render
fig.update_layout(title_text="Scatter Plot of Season vs Temperature")
fig.update_xaxes(title_text="Season")
fig.update_yaxes(title_text="Temperature (°C)")
fig.show()

Insights: The scatter plot of Temperature(y-axis) vs Season(x-axis) shows that the temperatures are highest in the summer, and lowest in the winter. This is as expected in general. So we can see that there is a strong correlation between the two features "Temperature" and "Seasons".

### Que 6) Create a single plot that illustrates the value of the PM column over time for each of the four cities. Color and label each city differently so that they can be distinguished easily.

In [None]:
# PM against date, one coloured line per city
fig = px.line(data_frame=imputed_df, x="date", y="PM", color="city")

# Title and axis labels, then render
fig.update_layout(title_text="PM Values by City")
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="PM")
fig.show()

Insights: The line chart shows the PM values over time for each of the five cities, with date on the x-axis and PM on the y-axis. PM levels vary over the period in every city, and some cities have consistently higher or lower PM levels than others.

### Que 7) How do meteorological factors (DEWP, HUMI, PRES, TEMP) correlate with PM levels? Create scatter plots to explore relationships. (Use plotly)

In [None]:
# Reference: https://plotly.com/python/splom/
# Scatter-plot matrix over the four meteorological columns plus PM, coloured by PM
splom_columns = ["DEWP", "HUMI", "PRES", "TEMP", "PM"]
fig = px.scatter_matrix(data_frame=imputed_df, dimensions=splom_columns, color="PM")

# Title, then render
fig.update_layout(title_text="Scatter Plots of Meteorological Factors vs PM Levels")
fig.show()

Insights: The scatter plot matrix shows the pairwise relationships between the meteorological factors (DEWP, HUMI, PRES, TEMP) and the PM levels. We can notice that: 1) DEWP vs PM has a strong positive correlation. 2) HUMI vs PM has a strong positive correlation. 3) PRES vs PM shows no correlation. 4) TEMP vs PM shows no correlation.

### Que 8) How do PM levels compare between the five cities? Create bar charts or boxplots for a city-to-city comparison. (Use plotly )

In [None]:
# One PM box per city, each in its own colour
fig = px.box(data_frame=imputed_df, x="city", y="PM", color="city")

# Title and axis labels, then render
fig.update_layout(title_text="PM Levels by City")
fig.update_xaxes(title_text="City")
fig.update_yaxes(title_text="PM")
fig.show()

Insights: The box plot shows the comparison of PM levels by city. We can interpret that the PM levels are higher for cities Beijing and Shenyang compared to the other cities.

### Que 9) Create a line plot to show the seasonal distribution of precipitation levels and examine how it relates to PM levels. (Use plotly)

In [None]:
# Line plot of precipitation and PM per season, one facet per city.
# Passing a list to `y` makes plotly express melt those columns into a "variable"
# column. Colour is already taken by "city", so "variable" is mapped to the dash
# style; otherwise precipitation and PM would be drawn as one merged trace per city.
fig = px.line(
    imputed_df,
    x="season",
    y=["precipitation", "PM"],
    color="city",
    line_dash="variable",
    facet_col="city",
)

# Change the scale of the y-axis
fig.update_yaxes(range=[0, 1000])

# Display the plot
fig.update_xaxes(title_text="Season")
fig.update_layout(title_text="Seasonal Distribution of Precipitation vs PM Levels")
fig.show()

In [None]:
# Precipitation against season, one facet and colour per city
fig = px.line(
    data_frame=imputed_df,
    x="season",
    y="precipitation",
    color="city",
    facet_col="city",
)

# Title and a fixed 0-100 y-axis range, then render
fig.update_layout(title_text="Seasonal Distribution of Precipitation Levels by City")
fig.update_yaxes(range=[0, 100])
fig.show()

In [None]:
# PM against season, one facet and colour per city
fig = px.line(
    data_frame=imputed_df,
    x="season",
    y="PM",
    color="city",
    facet_col="city",
)

# Title, then render
fig.update_layout(title_text="Seasonal Distribution of PM Levels by City")
fig.show()

Insights: The line plots show the seasonal distribution of precipitation and PM levels for each city. Each line represents a different city, identified by its color. With season on the x-axis and precipitation and PM on the y-axis, precipitation is generally highest in summer and lowest in winter. In contrast, PM levels are generally highest in winter and lowest in summer, with variations for some cities.

### Que 10) Create a simple dashboard using pydash tutorial 1.

In [None]:
# `dcc` and `html` are taken from the top-level dash package. The stand-alone
# dash_core_components / dash_html_components packages are deprecated, and importing
# them under the same aliases would shadow the names imported here.
from dash import Dash, html, dcc, callback, Output, Input

app = Dash(__name__)

# Page layout: heading, city dropdown (Shanghai pre-selected) and the graph it drives
app.layout = html.Div([
    html.H1(children="Dashboard of Precipitation over time by City", style={"textAlign": "center"}),
    dcc.Dropdown(imputed_df.city.unique(), "Shanghai", id="dropdown-selection"),
    dcc.Graph(id="graph-content"),
])


@callback(
    Output("graph-content", "figure"),
    Input("dropdown-selection", "value"),
)
def update_graph(value):
    """Return the Iprec-over-date line figure for the city selected in the dropdown."""
    dff = imputed_df[imputed_df["city"] == value]
    return px.line(dff, x="date", y="Iprec")


if __name__ == "__main__":
    # NOTE(review): newer Dash releases favour app.run(...); confirm the installed
    # version before switching away from run_server.
    app.run_server(debug=True, port=8051)

Insights: This simple pydash dashboard shows precipitation over time for each of the 5 cities. The dashboard consists of a dropdown menu to select the city and a line plot showing the precipitation over time for the selected city, with date on the x-axis and cumulative precipitation (Iprec) on the y-axis. Precipitation varies over time in each city.