# Climate Data Exploratory Data Analysis

## Introduction
This notebook contains an exploratory data analysis (EDA) of climate data from 1900 to 2023. The dataset includes global average temperature (°C), CO2 concentration (ppm), sea level rise (mm), and Arctic ice area (million km²).

Your task is to perform a comprehensive EDA following the requirements in the README.md file.

In [27]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



# Set plot styling
# Apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet to all figures.
# NOTE(review): this style name presumably needs matplotlib >= 3.6 (older
# releases call it 'seaborn-whitegrid') — confirm against the environment.
plt.style.use('seaborn-v0_8-whitegrid')
# Make 'viridis' the default seaborn color palette.
sns.set_palette('viridis')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Data Preparation

Load the climate data and perform necessary cleaning and aggregation.

In [None]:
# Read the climate indicators CSV from its relative path into a DataFrame
csv_path = 'data/Climate_Change_Indicators.csv'
df = pd.read_csv(csv_path)

# Preview the first rows as the cell output
df.head()

In [None]:
# Summarize the dataset structure and count missing values per column
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

In [None]:
# Aggregate to one row per year by averaging each column within the year.
# numeric_only=True restricts the mean to numeric columns, so a stray text
# column in the CSV cannot make the aggregation raise (pandas 2.x raises a
# TypeError when asked to average object columns).
df_aggregated = df.groupby('Year').mean(numeric_only=True).reset_index()

# Display the first ten rows of the aggregated dataset
print(df_aggregated.head(10))

## 2. Univariate Analysis

Analyze each climate variable independently.

In [None]:

# Map the verbose source column headers to the short names used by the
# analysis and plotting cells
short_names = {
    "Global Average Temperature (°C)": "Temperature",
    "CO2 Concentration (ppm)": "CO2",
    "Sea Level Rise (mm)": "Sea_Level",
    "Arctic Ice Area (million km²)": "Ice_Area",
}
df_aggregated = df_aggregated.rename(columns=short_names)

# Variables under analysis, in the same order as the mapping above
variables = list(short_names.values())

# Summary statistics rounded to two decimals
stats_df = df_aggregated[variables].describe().round(2)

# Print the statistics one variable at a time
for name in variables:
    print(f"\nDescriptive Statistics for {name}:")
    print(stats_df[name].to_string())


In [None]:

# Histogram with a KDE overlay for each variable, one panel each in a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
axes = axes.flatten()  # 1-D view so panels can be paired with variables

for ax, column in zip(axes, variables):
    sns.histplot(df_aggregated[column], bins=20, kde=True, color="blue", ax=ax)
    ax.set_title(f"Histogram of {column}", fontsize=14)
    ax.set_xlabel(column, fontsize=12)
    ax.set_ylabel("Frequency", fontsize=12)

plt.tight_layout()
plt.show()


In [None]:

# Horizontal box plot for each variable, one panel each in a 2x2 grid.
# whis=1.5 sets the whisker length as a multiple of the IQR.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
axes = axes.flatten()

for ax, column in zip(axes, variables):
    sns.boxplot(x=df_aggregated[column], ax=ax, whis=1.5)
    ax.set_title(f'Box Plot for {column}', fontsize=14)
    # The data is passed as x, so the values run along the x-axis and that is
    # the axis that gets the variable label. The y-axis of a single horizontal
    # box carries no values, so it is left unlabeled rather than called 'Value'.
    ax.set_xlabel(column, fontsize=12)
    ax.set_ylabel('')

plt.tight_layout()
plt.show()


In [None]:

# Kernel density estimate for each variable, one panel each in a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
axes = axes.flatten()

for ax, column in zip(axes, variables):
    sns.kdeplot(df_aggregated[column], color='red', ax=ax)
    ax.set_xlabel(column, fontsize=12)
    ax.set_ylabel('Density', fontsize=12)
    ax.set_title(f'KDE Plot of {column}', fontsize=14)

plt.tight_layout()
plt.show()


In [None]:

# Distribution plot for each variable, one panel each in a 2x2 grid.
# The panels are drawn with sns.histplot (20 bins plus a KDE overlay), which
# accepts an `ax` argument and so can target each subplot of the shared figure.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
axes = axes.flatten()

for ax, column in zip(axes, variables):
    sns.histplot(df_aggregated[column], bins=20, kde=True, color="blue", ax=ax)
    ax.set_xlabel(column, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_title(f'Distribution Plot of {column}', fontsize=14)

plt.tight_layout()
plt.show()


In [None]:

# Yearly time series of each variable, one panel each in a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
axes = axes.flatten()

years = df_aggregated["Year"]  # shared x-axis values for every panel

for ax, column in zip(axes, variables):
    ax.plot(years, df_aggregated[column], linestyle='-', marker='o', label=f"{column} Trend")
    ax.set_title(f"Time Series (1900-2023) of {column}", fontsize=14)
    ax.set_xlabel("Year", fontsize=12)
    ax.set_ylabel(column, fontsize=12)
    ax.legend()

plt.tight_layout()
plt.show()


## 3. Bivariate Analysis

Explore relationships between pairs of climate variables.

In [None]:

# Scatter plot for every pair of variables, with a KDE curve on the diagonal
pair_data = df_aggregated[variables]
sns.pairplot(pair_data, markers='o', diag_kind='kde')

# Place the overall title slightly above the grid (y > 1)
plt.suptitle('Pair Plot of Climate Variables', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()


In [None]:

# Pairwise correlation coefficients between the climate variables
# ('Year' is not in `variables`, so it is excluded).
correlation_matrix = df_aggregated[variables].corr()

# Print the heading and the matrix separately: a single two-argument print()
# would insert a separator space before the matrix and shift its header row
# one column out of line with the data rows.
print("Correlation Coefficients:")
print(correlation_matrix)


In [None]:


# Annotated heatmap of the correlation matrix (values shown to two decimals)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap', fontsize=16)
fig.tight_layout()  # keep labels and colorbar inside the figure
plt.show()


In [None]:

# Trend of each variable over time, one colored line plot per panel (2x2 grid)
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
variables = ['Temperature', 'CO2', 'Sea_Level', 'Ice_Area']
colors = ['r', 'g', 'b', 'purple']  # one color per variable, same order
panels = axes.flatten()

for idx, var in enumerate(variables):
    ax = panels[idx]
    sns.lineplot(data=df_aggregated, x='Year', y=var, color=colors[idx], ax=ax)
    ax.set_xlabel('Year', fontsize=12)
    ax.set_ylabel(var, fontsize=12)
    ax.set_title(f'Trend of {var} Over Time', fontsize=14)

plt.tight_layout()
plt.show()


## 4. Multivariate Analysis

Investigate relationships among three or more variables.

In [None]:


# 3D scatter: Year (x), CO2 (y), Temperature (z), with points colored by Sea_Level
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection="3d")

year_values = df_aggregated['Year']
co2_values = df_aggregated['CO2']
temperature_values = df_aggregated['Temperature']
sea_level_values = df_aggregated['Sea_Level']

sc = ax.scatter(
    year_values,
    co2_values,
    temperature_values,
    c=sea_level_values,
    cmap='coolwarm',
    s=50,
    alpha=0.7,
)

# Title and axis labels
ax.set_title('3D Scatter Plot of Temperature vs CO2 vs Year', fontsize=14)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('CO2 Levels', fontsize=12)
ax.set_zlabel('Temperature', fontsize=12)

# Color bar explaining the point colors
cbar = fig.colorbar(sc)
cbar.set_label('Sea Level', fontsize=12)

plt.show()


In [None]:



# Reshape to long format: one row per (Year, variable) pair, with the
# measurement in the 'value' column, as FacetGrid expects.
df_melted = df_aggregated.melt(id_vars='Year', value_vars=['Temperature', 'CO2', 'Sea_Level', 'Ice_Area'])

# One line plot per variable in a two-column grid. The x-axis (Year) is
# shared, but sharey=False gives every panel its own y-scale: the variables
# are measured in different units and magnitudes, so a shared y-axis would
# flatten the small-valued series into near-horizontal lines.
g = sns.FacetGrid(df_melted, col='variable', col_wrap=2, height=4, sharex=True, sharey=False)
g.map(sns.lineplot, 'Year', 'value')

# Title each panel with its variable name and label the axes
g.set_titles("{col_name}", fontsize=14)
g.set_axis_labels("Year", "Value", fontsize=12)

plt.tight_layout()
plt.show()


## 5. Conclusions and Insights

Summarize your findings and discuss their implications.