# Basic EDA - CO2 Emissions Dataset (1996)

This notebook covers basic data exploration and cleaning steps.

## 1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np

# Read the raw CSV into a DataFrame.
# low_memory=False asks pandas to infer each column's dtype from the whole
# file rather than chunk by chunk, which avoids mixed-type columns.
csv_path = '/home/z/my-project/upload/1996.csv'
df = pd.read_csv(csv_path, low_memory=False)

print("Data loaded successfully!")

## 2. Basic DataFrame Information

In [None]:
# Report the frame's dimensions: df.shape is (rows, columns).
row_count, column_count = df.shape
print("Shape of DataFrame:")
print(f"Rows: {row_count}")
print(f"Columns: {column_count}")

In [None]:
# Show the dtype pandas assigned to each column.
print("Data Types:", df.dtypes, sep="\n")

In [None]:
# Concise structural summary of the frame.
# df.info() writes its report directly to stdout, so it is called for its
# side effect and not wrapped in print().
print("DataFrame Info:")
df.info()

In [None]:
# List every column label as a plain Python list.
column_names = df.columns.tolist()
print("Column Names:")
print(column_names)

## 3. Preview Data

In [None]:
# Preview the top of the raw frame.
# The call stays as the cell's final bare expression so the notebook
# renders the resulting table.
print("First 5 Rows:")
df.head(n=5)

In [None]:
# Preview the bottom of the raw frame.
# The call stays as the cell's final bare expression so the notebook
# renders the resulting table.
print("Last 5 Rows:")
df.tail(n=5)

## 4. Descriptive Statistics

In [None]:
# Summary statistics with describe()'s default column selection
# (numeric columns, when the frame has any).
print("Descriptive Statistics:")
default_summary = df.describe()
default_summary

In [None]:
# Summary statistics across every column; include='all' adds the
# non-numeric columns to the report as well.
print("Descriptive Statistics (All Columns):")
full_summary = df.describe(include='all')
full_summary

## 5. Null Values Analysis

In [None]:
# Count missing cells across the whole frame: first per column, then the
# grand total of those per-column counts.
nulls_per_column = df.isnull().sum()
total_nulls = nulls_per_column.sum()
print(f"Total Null Values: {total_nulls}")

In [None]:
# Null values by column: absolute count and percentage of rows, restricted
# to columns that contain at least one null, most-affected first.
print("Null Values by Column:")
null_counts = df.isnull().sum()
# Derive the percentage from the counts already computed instead of
# scanning the whole frame for nulls a second time.
null_pct = (null_counts / len(df)) * 100
null_df = pd.DataFrame({
    'Null Count': null_counts,
    'Null Percentage': null_pct.round(2),
})
null_df = null_df[null_df['Null Count'] > 0].sort_values('Null Count', ascending=False)
print(null_df)

In [None]:
# Count missing cells per row (axis=1) and list the ten rows with the most.
print("Null Values by Row (Top 10 rows with most nulls):")
null_by_row = df.isnull().sum(axis=1)
ranked_rows = null_by_row.sort_values(ascending=False)
print(ranked_rows.head(10))

In [None]:
# A row counts as fully empty when every one of its cells is null.
fully_null_mask = df.isnull().all(axis=1)
all_null_rows = fully_null_mask.sum()
print(f"Rows with all null values: {all_null_rows}")

## 6. Duplicate Analysis

In [None]:
# Count rows that exactly repeat an earlier row. keep='first' is the pandas
# default, spelled out here: the first occurrence is not counted.
duplicate_rows = df.duplicated(keep='first').sum()
print(f"Total Duplicate Rows: {duplicate_rows}")

In [None]:
# Look for rows sharing the same key-column values. Only key columns that
# are actually present in the frame are used.
key_columns = [
    'Country_code_A3',
    'ipcc_code_1996_for_standard_report',
    'Substance',
]
key_cols_exist = [name for name in key_columns if name in df.columns]

if not key_cols_exist:
    print("Key columns not found in dataset")
else:
    # keep=False flags every member of a duplicated group, not just repeats.
    duplicates_on_keys = df.duplicated(subset=key_cols_exist, keep=False).sum()
    print(f"Duplicate rows based on {key_cols_exist}: {duplicates_on_keys}")

In [None]:
# Print every row that belongs to a duplicated group. Relies on
# `duplicate_rows` from the duplicate-count cell.
if duplicate_rows > 0:
    # keep=False marks all copies, so the originals are shown as well.
    print("Duplicate Rows:", df.loc[df.duplicated(keep=False)], sep="\n")
else:
    print("No duplicate rows found.")

## 7. Column Name Cleaning

In [None]:
# Collect column labels containing 'Unnamed'. Labels are passed through
# str() so non-string labels are handled too.
print("Columns with 'Unnamed' in name:")
unnamed_cols = [label for label in df.columns if 'Unnamed' in str(label)]
unnamed_total = len(unnamed_cols)
print(f"Count: {unnamed_total}")
# Only the first ten are shown to keep the output short.
print(unnamed_cols[:10])

In [None]:
# Split the columns into those holding at least one value and those that
# are entirely null.
print("\nAnalyzing column content...")

# df.count() gives the number of non-null cells per column.
non_null_counts = df.count()
is_empty = non_null_counts == 0
has_values = non_null_counts > 0
empty_cols = non_null_counts[is_empty].index.tolist()
data_cols = non_null_counts[has_values].index.tolist()

print(f"Empty columns (all null): {len(empty_cols)}")
print(f"Columns with data: {len(data_cols)}")

In [None]:
# List the columns that hold at least one non-null value.
print("Columns with data:", data_cols, sep="\n")

In [None]:
# Build the cleaned frame from the non-empty columns only. .copy() gives
# an independent frame, so later edits do not touch `df`.
df_cleaned = df[data_cols].copy()

original_shape = df.shape
cleaned_shape = df_cleaned.shape
print(f"Original shape: {original_shape}")
print(f"Cleaned shape: {cleaned_shape}")
print(f"Columns removed: {original_shape[1] - cleaned_shape[1]}")

In [None]:
# Tidy the column labels by trimming leading/trailing whitespace.
# Case is left untouched: later cells look columns up by their
# original-case names (e.g. 'IPCC_annex', 'Substance', 'Name').
print("\nCleaning column names...")
stripped_labels = df_cleaned.columns.str.strip()
df_cleaned.columns = stripped_labels
print("Column names cleaned (whitespace stripped)")

In [None]:
# List the column labels of the cleaned frame.
print("Cleaned DataFrame Columns:", df_cleaned.columns.tolist(), sep="\n")

## 8. Value Counts for Categorical Columns

In [None]:
# Treat every object-dtype column of the cleaned frame as categorical.
object_columns = df_cleaned.select_dtypes(include=['object']).columns
categorical_cols = object_columns.tolist()
print(f"Categorical Columns: {len(categorical_cols)}")
print(categorical_cols)

In [None]:
# Print a frequency table per categorical column. Columns with more than
# 20 distinct values are cut to their 15 most frequent entries.
banner = '=' * 50
for name in categorical_cols:
    print(f"\n{banner}")
    print(f"Value Counts for: {name}")
    print(banner)
    unique_count = df_cleaned[name].nunique()
    print(f"Unique values: {unique_count}")

    frequencies = df_cleaned[name].value_counts()
    if unique_count > 20:
        print("Top 15 values:")
        frequencies = frequencies.head(15)
    print(frequencies)

## 9. Value Counts for Key Columns

In [None]:
# Frequency table for the 'IPCC_annex' column; the cell does nothing when
# that column is missing.
if 'IPCC_annex' in df_cleaned.columns:
    print("IPCC Annex Distribution:", df_cleaned['IPCC_annex'].value_counts(), sep="\n")

In [None]:
# Frequency table for the 'C_group_IM24_sh' column; the cell does nothing
# when that column is missing.
if 'C_group_IM24_sh' in df_cleaned.columns:
    print("\nCountry Groups Distribution:", df_cleaned['C_group_IM24_sh'].value_counts(), sep="\n")

In [None]:
# Frequency table for the 'Substance' column; the cell does nothing when
# that column is missing.
if 'Substance' in df_cleaned.columns:
    print("\nSubstance Distribution:", df_cleaned['Substance'].value_counts(), sep="\n")

In [None]:
# Frequency table for the 'fossil_bio' column; the cell does nothing when
# that column is missing.
if 'fossil_bio' in df_cleaned.columns:
    print("\nFossil/Bio Classification:", df_cleaned['fossil_bio'].value_counts(), sep="\n")

In [None]:
# Frequency table for the 'ipcc_code_1996_for_standard_report_name'
# column; the cell does nothing when that column is missing.
category_column = 'ipcc_code_1996_for_standard_report_name'
if category_column in df_cleaned.columns:
    print("\nIPCC Emission Categories:")
    print(df_cleaned[category_column].value_counts())

In [None]:
# Record count per value of the 'Name' column, limited to the 20 most
# frequent; the cell does nothing when that column is missing.
if 'Name' in df_cleaned.columns:
    records_per_name = df_cleaned['Name'].value_counts()
    print("\nCountries (Count of records per country - Top 20):")
    print(records_per_name.head(20))

## 10. Summary Statistics for Numerical Columns

In [None]:
# Identify the year columns: those whose label starts with 'Y_'.
# The isinstance check skips non-string labels, which have no .startswith.
year_cols = [
    col for col in df_cleaned.columns
    if isinstance(col, str) and col.startswith('Y_')
]
print(f"Year columns found: {len(year_cols)}")
# Guard the whole range line: indexing year_cols[0] on an empty list raises
# IndexError, so report 'N/A' when no year columns were found.
if year_cols:
    print(f"Year range: {year_cols[0]} to {year_cols[-1]}")
else:
    print("Year range: N/A")

In [None]:
# Force each year column to a numeric dtype. errors='coerce' turns any
# value that cannot be parsed into NaN instead of raising.
print("Converting year columns to numeric...")
for year_col in year_cols:
    converted = pd.to_numeric(df_cleaned[year_col], errors='coerce')
    df_cleaned[year_col] = converted

print("Conversion complete.")

In [None]:
# Summary statistics restricted to the year columns; skipped when none
# were found.
if year_cols:
    print("Descriptive Statistics for Year Columns:", df_cleaned[year_cols].describe(), sep="\n")

In [None]:
# Missing-value count per year column; skipped when none were found.
if year_cols:
    year_nulls = df_cleaned[year_cols].isnull().sum()
    print("Null Values in Year Columns:")
    print(year_nulls)

## 11. Final Cleaned DataFrame Summary

In [None]:
# Headline figures for the cleaned frame: shape, total nulls, duplicate
# rows and the final column list.
rule = "=" * 60
cleaned_rows, cleaned_columns = df_cleaned.shape
print(rule)
print("FINAL CLEANED DATAFRAME SUMMARY")
print(rule)
print(f"\nShape: {df_cleaned.shape}")
print(f"Rows: {cleaned_rows}")
print(f"Columns: {cleaned_columns}")
print(f"\nTotal Null Values: {df_cleaned.isnull().sum().sum()}")
print(f"Total Duplicate Rows: {df_cleaned.duplicated().sum()}")
print("\nColumn Names:")
print(df_cleaned.columns.tolist())

In [None]:
# Structural summary of the cleaned frame.
# df_cleaned.info() writes its report directly to stdout, so it is called
# for its side effect and not wrapped in print().
print("Final DataFrame Info:")
df_cleaned.info()

In [None]:
# Preview the first rows of the cleaned frame.
# The call stays as the cell's final bare expression so the notebook
# renders the resulting table.
print("Cleaned Data - First 5 Rows:")
df_cleaned.head(n=5)

In [None]:
# Preview the last rows of the cleaned frame.
# The call stays as the cell's final bare expression so the notebook
# renders the resulting table.
print("Cleaned Data - Last 5 Rows:")
df_cleaned.tail(n=5)

In [None]:
# Save the cleaned dataframe as CSV, without the index column.
from pathlib import Path

# One definition of the path, so the file written and the message printed
# cannot drift apart.
output_path = '/home/z/my-project/download/cleaned_1996_data.csv'
# to_csv does not create missing directories; make sure the target folder
# exists so the export does not fail when it is absent.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_cleaned.to_csv(output_path, index=False)
print(f"Cleaned data saved to: {output_path}")

## 12. Quick Reference Summary

In [None]:
# Assemble a two-column (Metric, Value) overview of the whole EDA run.
# Relies on `df`, `df_cleaned`, `categorical_cols` and `year_cols` from
# earlier cells.

# Year range with the 'Y_' marker removed from the labels, or 'N/A' when
# no year columns were found.
if year_cols:
    year_range = f"{year_cols[0].replace('Y_', '')} - {year_cols[-1].replace('Y_', '')}"
else:
    year_range = 'N/A'

# Each metric sits next to its value so the two cannot fall out of step.
summary_rows = [
    ('Original Rows', df.shape[0]),
    ('Original Columns', df.shape[1]),
    ('Cleaned Rows', df_cleaned.shape[0]),
    ('Cleaned Columns', df_cleaned.shape[1]),
    ('Empty Columns Removed', df.shape[1] - df_cleaned.shape[1]),
    ('Total Null Values (Original)', df.isnull().sum().sum()),
    ('Total Null Values (Cleaned)', df_cleaned.isnull().sum().sum()),
    ('Duplicate Rows', df_cleaned.duplicated().sum()),
    ('Categorical Columns', len(categorical_cols)),
    ('Numerical (Year) Columns', len(year_cols)),
    ('Year Range', year_range),
]

summary = {
    'Metric': [metric for metric, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}

summary_df = pd.DataFrame(summary)
rule = "=" * 60
print(rule)
print("EDA SUMMARY")
print(rule)
# to_string(index=False) prints the table without the row-index column.
print(summary_df.to_string(index=False))