In [None]:
# Essential libraries for data manipulation and visualization
import pandas as pd
import numpy as np  # Often useful for numerical operations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  # For interactive plots (optional)

# Cosmetic defaults applied to every figure drawn later in the notebook.
sns.set_theme(style="whitegrid")                  # seaborn theme with a light grid
plt.rcParams.update({'figure.figsize': (10, 6)})  # default matplotlib canvas size

# Confirmation that the setup cell ran to the end.
print("Libraries imported successfully!")

## 2. Load Dataset
Here, we load the chosen dataset into a pandas DataFrame and perform an initial inspection.


In [None]:
# Read the dataset into `df`. Point the path below at your own file or URL.
# When the file cannot be found, a small hand-made sample is substituted so
# that the remaining cells can still be executed.
try:
    df = pd.read_csv('your_dataset.csv')  # TODO: Replace with your dataset path/URL
except FileNotFoundError:
    print("Error: Dataset file not found. Please check the path.")
    # Demonstration rows: four countries, each listed for 2019 and 2020.
    demo_columns = ['country', 'year', 'co2_emissions', 'population', 'gdp_per_capita']
    demo_rows = [
        ('USA', 2019, 5000, 328, 65000),
        ('China', 2019, 10000, 1433, 10000),
        ('India', 2019, 2500, 1366, 2100),
        ('Germany', 2019, 800, 83, 46000),
        ('USA', 2020, 4800, 331, 63000),
        ('China', 2020, 10200, 1439, 10500),
        ('India', 2020, 2600, 1380, 2000),
        ('Germany', 2020, 750, 83, 45000),
    ]
    df = pd.DataFrame(demo_rows, columns=demo_columns)
    print("Loaded a dummy dataset for demonstration.")
else:
    print("Dataset loaded successfully!")

# First look at the data: a preview, the column dtypes, and summary statistics.
print("\nFirst 5 rows of the dataset:", df.head(), sep="\n")

print("\nDataset Info:")
df.info()  # writes its report directly to stdout

print("\nDescriptive Statistics:", df.describe(), sep="\n")

## 3. Data Cleaning (Brief)
This section covers essential data cleaning steps, such as handling missing values, correcting data types, or removing duplicates if necessary.


In [None]:
# Report how many entries are missing in each column before deciding what to clean.
print("\nMissing values per column:", df.isnull().sum(), sep="\n")

# Optional cleaning recipes -- uncomment and adapt the ones that apply.
#
# Fill gaps in numeric columns with the column mean:
# for col in df.select_dtypes(include=np.number).columns:
#     if df[col].isnull().any():
#         df[col] = df[col].fillna(df[col].mean())
#
# Fill gaps in text columns with the most frequent value:
# for col in df.select_dtypes(include='object').columns:
#     if df[col].isnull().any():
#         df[col] = df[col].fillna(df[col].mode()[0])
#
# Drop rows that lack a value in a column the analysis cannot do without:
# df = df.dropna(subset=['critical_column_name'])
#
# Remove exact duplicate rows:
# df = df.drop_duplicates()

# Show the frame again so the effect of any enabled step is visible.
print("\nData after initial cleaning (if any steps were applied):", df.head(), sep="\n")
df.info()

## 4. Formulate Questions or Hypotheses

1. **Question 1:** How have CO2 emissions changed over time for the top 5 emitting countries in the most recent year available?
2. **Question 2:** Is there a correlation between GDP per capita and CO2 emissions per capita for countries in a specific year?
3. **Question 3:** What is the distribution of CO2 emissions among different countries for the latest available year?


### 5.1 Answering Question 1


In [None]:
# Question 1: how have CO2 emissions changed over time for the countries that
# emit the most in the most recent year of the data?
TOP_N_Q1 = 5  # number of top emitters to follow, as stated in Question 1

required_q1 = ['country', 'year', 'co2_emissions']
missing_q1 = [col for col in required_q1 if col not in df.columns]

if missing_q1:
    # Say why nothing is drawn instead of silently producing no output.
    print(f"Skipping Question 1: missing column(s) {missing_q1}.")
else:
    # Rank the countries by their emissions in the latest year present in the
    # data rather than hard-coding a country list and a start year.
    latest_year_q1 = df['year'].max()
    countries_q1 = (
        df.loc[df['year'] == latest_year_q1]
        .nlargest(TOP_N_Q1, 'co2_emissions')['country']
        .dropna()
        .drop_duplicates()
        .tolist()
    )
    # Keep every available year for those countries: the question is about
    # the change over time, so no year cut-off is applied.
    df_q1 = df[df['country'].isin(countries_q1)].sort_values('year')

    if df_q1.empty:
        print("Skipping Question 1: no rows available to plot.")
    else:
        fig, ax = plt.subplots(figsize=(12, 7))
        sns.lineplot(data=df_q1, x='year', y='co2_emissions', hue='country',
                     hue_order=countries_q1, marker='o', ax=ax)

        # Years are whole numbers; without this the axis can show ticks such
        # as 2019.5. Only applied when the year column is numeric.
        if pd.api.types.is_numeric_dtype(df_q1['year']):
            ax.xaxis.get_major_locator().set_params(integer=True)

        ax.set_title(
            f'CO2 Emissions Over Time for the Top {len(countries_q1)} '
            f'Emitters of {latest_year_q1} (Example)'
        )
        ax.set_xlabel('Year')
        ax.set_ylabel('CO2 Emissions (Example Units)')

        # Retitle the legend seaborn created instead of rebuilding it.
        legend_q1 = ax.get_legend()
        if legend_q1 is not None:
            legend_q1.set_title('Country')
        plt.show()


### 5.2 Answering Question 2


In [None]:
# Question 2: is GDP per capita related to CO2 emissions per capita in a
# given year?
YEAR_Q2 = 2020  # the year examined; the filter and the plot title both use it

# 'country' (hue) and 'population' (divisor and marker size) are needed as
# well, so every required column is checked up front.
required_q2 = ['country', 'year', 'gdp_per_capita', 'co2_emissions', 'population']
missing_q2 = [col for col in required_q2 if col not in df.columns]

if missing_q2:
    print(f"Skipping Question 2: missing column(s) {missing_q2}.")
else:
    df_q2 = df[df['year'] == YEAR_Q2].copy()
    df_q2['co2_emissions_per_capita'] = df_q2['co2_emissions'] / df_q2['population']

    # Both axes are logarithmic, so only finite, strictly positive values can
    # be drawn. Missing values, infinities (division by a zero population) and
    # non-positive values are filtered out explicitly, without mutating in place.
    axis_cols_q2 = ['gdp_per_capita', 'co2_emissions_per_capita']
    is_finite_q2 = np.isfinite(df_q2[axis_cols_q2]).all(axis=1)
    is_positive_q2 = df_q2[axis_cols_q2].gt(0).all(axis=1)
    df_q2 = df_q2[is_finite_q2 & is_positive_q2]

    if df_q2.empty:
        print(f"Skipping Question 2: no plottable rows for {YEAR_Q2}.")
    else:
        fig, ax = plt.subplots(figsize=(12, 7))
        sns.scatterplot(data=df_q2, x='gdp_per_capita', y='co2_emissions_per_capita',
                        hue='country', size='population', sizes=(50, 500), alpha=0.7,
                        ax=ax)
        ax.set_title(f'GDP per Capita vs. CO2 Emissions per Capita in {YEAR_Q2} (Example)')
        ax.set_xlabel('GDP per Capita')
        ax.set_ylabel('CO2 Emissions per Capita')
        ax.set_xscale('log')
        ax.set_yscale('log')

        # The legend holds both the country colours and the population sizes,
        # so it is only moved outside the axes; a single 'Country' title
        # would mislabel the size entries.
        sns.move_legend(ax, 'upper left', bbox_to_anchor=(1.05, 1))
        fig.tight_layout()
        plt.show()


### 5.3 Answering Question 3


In [None]:
# Question 3: how are CO2 emissions distributed among the largest emitters in
# the latest available year?
TOP_N_Q3 = 10  # maximum number of countries shown in the bar chart

required_q3 = ['year', 'country', 'co2_emissions']
missing_q3 = [col for col in required_q3 if col not in df.columns]

if missing_q3:
    print(f"Skipping Question 3: missing column(s) {missing_q3}.")
else:
    # Compute the latest year once so the filter and the title cannot disagree.
    latest_year_q3 = df['year'].max()
    df_q3_latest_year = df[df['year'] == latest_year_q3]
    # Rows without an emissions value cannot be ranked, so they are dropped
    # before the largest emitters are selected.
    df_q3_top_emitters = (
        df_q3_latest_year
        .dropna(subset=['co2_emissions'])
        .nlargest(TOP_N_Q3, 'co2_emissions')
    )

    if df_q3_top_emitters.empty:
        print("Skipping Question 3: no emission values found for the latest year.")
    else:
        fig, ax = plt.subplots(figsize=(14, 8))
        # NOTE(review): recent seaborn releases may warn when `palette` is
        # passed without `hue` -- confirm against the installed version.
        sns.barplot(data=df_q3_top_emitters, x='co2_emissions', y='country',
                    palette='viridis', ax=ax)
        # Report the number of bars actually drawn: the data may hold fewer
        # than TOP_N_Q3 countries (the dummy dataset has only four).
        ax.set_title(
            f'Top {len(df_q3_top_emitters)} CO2 Emitting Countries '
            f'in {latest_year_q3} (Example)'
        )
        ax.set_xlabel('CO2 Emissions')
        ax.set_ylabel('Country')
        fig.tight_layout()
        plt.show()


## 6. Inference or Conclusion

- **Q1 Summary:** CO2 emissions trends vary by country.
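- **Q2 Summary:** The scatter plot suggests a positive association between GDP per capita and CO2 emissions per capita; no correlation coefficient was computed, so this is a visual impression rather than a tested result.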
- **Q3 Summary:** Emissions concentrated in a few top countries.

**Insights:** Economic development often increases emissions, though patterns vary.

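**Limitations:** If the dataset file is not found, the notebook falls back to a small dummy dataset (four countries, two years), so the results above are illustrative only. The analysis also relies on simplified assumptions and should be repeated on a complete, real dataset before drawing conclusions.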
