# CO2 Emissions Data - Basic EDA

## Operations Covered:
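1. Import Libraries & Load Data
2. Shape
3. Info
4. Dtypes
5. Column Names
6. Head (First 5 Rows)
7. Tail (Last 5 Rows)
8. Describe (Statistical Summary)
9. Null Values Analysis
10. Duplicate Analysis
11. Value Counts (Categorical Columns)
12. Column Names Cleaning
13. Data Cleaning
14. Cleaned Data Overview
15. Export Cleaned Data to Excel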

## 1. Import Libraries & Load Data

In [None]:
import pandas as pd
import numpy as np

# Read the raw CSV into a DataFrame. read_csv raises if the file cannot be
# read, so the confirmation message is only reached on success.
csv_path = '2006.csv'
df = pd.read_csv(csv_path)
print("Data loaded successfully!")

## 2. Shape

In [None]:
# Report the dataset dimensions; the trailing expression echoes the raw
# (rows, columns) tuple as the cell output.
n_rows, n_cols = df.shape
print("Shape of the dataset:")
print(f"Rows: {n_rows}")
print(f"Columns: {n_cols}")
df.shape

## 3. Info

In [None]:
# DataFrame info: prints pandas' summary of the frame (index range, column
# names, non-null counts, dtypes and memory usage). Output goes to stdout;
# the call itself returns None.
df.info()

## 4. Dtypes

In [None]:
# Dtype pandas inferred for every column; the final bare expression makes
# the notebook render the resulting Series.
print("Data Types:")
column_dtypes = df.dtypes
column_dtypes

## 5. Column Names

In [None]:
# List every column label as a plain Python list.
column_names = df.columns.tolist()
print("Column Names:")
print(column_names)

## 6. Head (First 5 Rows)

In [None]:
# Preview the first 5 rows (5 is also the pandas default, spelled out here).
df.head(5)

## 7. Tail (Last 5 Rows)

In [None]:
# Preview the last 5 rows (5 is also the pandas default, spelled out here).
df.tail(5)

## 8. Describe (Statistical Summary)

In [None]:
# Statistical summary for numerical columns: with no arguments, describe()
# reports count, mean, std, min, quartiles and max for numeric columns only.
df.describe()

In [None]:
# Statistical summary for categorical columns: restricting describe() to
# object-dtype columns reports count, number of unique values, the most
# frequent value (top) and its frequency (freq).
df.describe(include='object')

## 9. Null Values Analysis

In [None]:
# Number of missing entries in each column (isna is the canonical name of
# isnull; both produce the same mask).
print("Null Values Count:")
df.isna().sum()

In [None]:
# Share of missing entries per column, as a percentage of the row count,
# rounded to two decimals.
print("Null Values Percentage:")
missing_per_column = df.isna().sum()
null_pct = (missing_per_column / len(df) * 100).round(2)
null_pct

In [None]:
# Dataset-wide missing-value totals: the count of null cells, the number of
# cells overall (rows x columns), and the resulting percentage.
total_nulls = df.isna().sum().sum()
total_cells = df.size
null_share = total_nulls / total_cells * 100
print(f"Total Null Values: {total_nulls}")
print(f"Total Cells: {total_cells}")
print(f"Null Percentage: {null_share:.2f}%")

In [None]:
# Names of the columns that contain at least one missing value.
has_null = df.isna().any()
null_columns = has_null[has_null].index.tolist()
print("Columns with Null Values:")
print(null_columns)
print(f"\nTotal columns with nulls: {len(null_columns)}")

## 10. Duplicate Analysis

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence of
# each group is not flagged).
duplicate_mask = df.duplicated(keep='first')
duplicate_count = duplicate_mask.sum()
print(f"Total Duplicate Rows: {duplicate_count}")

In [None]:
# Show duplicate rows (if any).
# A notebook only auto-renders the last *top-level* expression of a cell. A
# bare expression nested inside an if-body is evaluated and discarded, so the
# duplicate rows must be printed explicitly to actually appear.
if duplicate_count > 0:
    print("Duplicate Rows:")
    print(df[df.duplicated()])
else:
    print("No duplicate rows found.")

## 11. Value Counts (Categorical Columns)

In [None]:
# Treat every object-dtype column as categorical and collect their names.
cat_cols = list(df.select_dtypes(include=['object']).columns)
print(f"Categorical Columns: {cat_cols}")

In [None]:
# Frequency of each IPCC_annex label, most common first; missing values are
# excluded by value_counts' default.
print("Value Counts - IPCC_annex:")
df.loc[:, 'IPCC_annex'].value_counts()

In [None]:
# Frequency of each Substance label, most common first; missing values are
# excluded by value_counts' default.
print("Value Counts - Substance:")
df.loc[:, 'Substance'].value_counts()

In [None]:
# Frequency of each fossil_bio label, most common first; missing values are
# excluded by value_counts' default.
print("Value Counts - fossil_bio:")
df.loc[:, 'fossil_bio'].value_counts()

In [None]:
# Frequency of each IPCC emission category name, most common first; missing
# values are excluded by value_counts' default.
print("Value Counts - ipcc_code_2006_for_standard_report_name:")
df.loc[:, 'ipcc_code_2006_for_standard_report_name'].value_counts()

In [None]:
# Distinct country names versus distinct three-letter codes; differing
# counts would point to inconsistent naming for the same code.
country_names = df['Name']
country_codes = df['Country_code_A3']
print(f"Unique Countries: {country_names.nunique()}")
print(f"Unique Country Codes: {country_codes.nunique()}")

## 12. Column Names Cleaning

In [None]:
# Show the column labels as they currently stand, before any clean-up.
current_names = df.columns.tolist()
print("Current Column Names:")
print(current_names)

In [None]:
# Audit the column names for common problems. Nothing is renamed in this
# cell; it only reports what it finds and prints a verdict based on that.
#
# The regex patterns are bound to names first: before Python 3.12 an f-string
# replacement field may not contain a backslash, so writing r'^\d' directly
# inside the f-string is a SyntaxError on older interpreters.
non_identifier_pattern = r'[^a-zA-Z0-9_]'
leading_digit_pattern = r'^\d'

padded_count = int(sum(df.columns != df.columns.str.strip()))
special_char_count = int(sum(df.columns.str.contains(non_identifier_pattern)))
leading_digit_count = int(sum(df.columns.str.match(leading_digit_pattern)))

print("Column Names Analysis:")
print(f"- Columns with leading/trailing spaces: {padded_count}")
print(f"- Columns with special characters: {special_char_count}")
print(f"- Columns starting with number: {leading_digit_count}")

# Derive the verdict from the counts instead of asserting it unconditionally.
if padded_count == 0 and special_char_count == 0 and leading_digit_count == 0:
    print("\nColumn names are already clean!")
else:
    print("\nSome column names need cleaning - see the counts above.")

## 13. Data Cleaning

In [None]:
# Work on an independent deep copy so the raw frame stays untouched for the
# before/after comparison later on.
df_cleaned = df.copy(deep=True)
print(f"Original shape: {df_cleaned.shape}")

In [None]:
# 1. Drop exact duplicate rows, keeping the first occurrence of each, and
#    report how many rows that removed.
initial_rows = df_cleaned.shape[0]
df_cleaned = df_cleaned.drop_duplicates(keep='first')
removed_duplicates = initial_rows - df_cleaned.shape[0]
print(f"Removed {removed_duplicates} duplicate rows")
print(f"Shape after removing duplicates: {df_cleaned.shape}")

In [None]:
# 2. Count negative entries in the per-year columns (those whose name starts
#    with 'Y_'). Columns with at least one negative are listed individually;
#    the grand total is kept in negative_count for the following cells.
year_cols = [name for name in df_cleaned.columns if name.startswith('Y_')]
print(f"Year columns found: {len(year_cols)}")

negatives_by_col = {name: (df_cleaned[name] < 0).sum() for name in year_cols}
for name, n_negative in negatives_by_col.items():
    if n_negative > 0:
        print(f"{name}: {n_negative} negative values")

negative_count = sum(negatives_by_col.values())

print(f"\nTotal negative values found: {negative_count}")

In [None]:
# 3. Blank out negative readings in the year columns by overwriting them
#    with NaN (this notebook treats negative emissions as invalid).
for year_col in year_cols:
    is_negative = df_cleaned[year_col] < 0
    df_cleaned.loc[is_negative, year_col] = np.nan

print(f"Replaced {negative_count} negative values with NaN")

In [None]:
# 4. Before/after summary of the cleaning steps.
banner = "=" * 50
rows_removed = len(df) - len(df_cleaned)
print(banner)
print("CLEANING SUMMARY")
print(banner)
print(f"Original shape: {df.shape}")
print(f"Cleaned shape: {df_cleaned.shape}")
print(f"Rows removed: {rows_removed}")
print(f"Negative values replaced with NaN: {negative_count}")

## 14. Cleaned Data Overview

In [None]:
# Cleaned data info: prints pandas' summary (columns, non-null counts,
# dtypes, memory usage) for the cleaned frame, for comparison with the
# same summary of the raw frame.
df_cleaned.info()

In [None]:
# Preview the first 5 rows of the cleaned frame (5 is the pandas default).
df_cleaned.head(5)

In [None]:
# Cleaned data describe: numeric summary statistics (count, mean, std, min,
# quartiles, max) of the cleaned frame. NaN entries are excluded from these
# statistics.
df_cleaned.describe()

## 15. Export Cleaned Data to Excel

In [None]:
# Write the cleaned frame to an Excel workbook in the working directory,
# leaving out the DataFrame index column.
output_file = 'CO2_Emissions_Cleaned_Data.xlsx'
df_cleaned.to_excel(excel_writer=output_file, index=False)
print(f"Cleaned data exported to: {output_file}")