# CO2 Emissions Data - Basic EDA

This notebook contains basic Exploratory Data Analysis including:
- Data loading and inspection
- Null values check
- Duplicate rows check
- Column name cleaning
- Data types and basic statistics

## 1. Import Libraries & Load Data

In [None]:
import pandas as pd
import numpy as np

# Read the CSV into the notebook-wide DataFrame `df`.
csv_path = '/home/z/my-project/upload/Totals By Country.csv'
df = pd.read_csv(csv_path)

# Reaching this line means read_csv returned without raising.
print("Data loaded successfully!")

## 2. Basic Data Inspection

### 2.1 Shape of DataFrame

In [None]:
# Report the frame's dimensions; the trailing expression displays the raw tuple.
row_count, column_count = df.shape
print("Shape of DataFrame:")
print(f"Rows: {row_count}")
print(f"Columns: {column_count}")
df.shape

### 2.2 Column Names

In [None]:
# List every column label; the trailing expression is rendered as the cell output.
print("Column Names:")
df.columns.to_list()

### 2.3 Data Types

In [None]:
# Show the dtype pandas assigned to each column.
print("Data Types:")
column_dtypes = df.dtypes
column_dtypes

### 2.4 DataFrame Info

In [None]:
# DataFrame information
# info() writes its summary to the output itself, so it is called without print().
df.info()

### 2.5 First 5 Rows (Head)

In [None]:
# Preview the top of the frame; n=5 is spelled out to match the heading.
df.head(n=5)

### 2.6 Last 5 Rows (Tail)

In [None]:
# Preview the bottom of the frame; n=5 is spelled out to match the heading.
df.tail(n=5)

### 2.7 Descriptive Statistics

In [None]:
# Descriptive statistics via describe() with default arguments.
# NOTE: on a mixed-dtype frame the default call summarises only the numeric
# columns, not every column; object columns are summarised in the next cell.
df.describe()

In [None]:
# Summarise the object-dtype columns and display the result as the cell output.
object_summary = df.describe(include='object')
object_summary

## 3. Null Values Check

In [None]:
# Count missing entries across the whole frame: per-column sums, then summed again.
total_nulls = df.isna().sum().sum()
print(f"Total null values: {total_nulls}")

In [None]:
# Per-column missing-value counts, keeping only the columns that have any.
missing_per_column = df.isna().sum()
null_counts = missing_per_column[missing_per_column > 0]

if null_counts.empty:
    print("No null values found!")
else:
    print("Columns with null values:")
    print(null_counts)
    print(f"\nTotal columns with nulls: {len(null_counts)}")

In [None]:
# Share of missing values per column, expressed as a percentage of the row count.
null_percentage = (df.isnull().sum() / len(df)) * 100
# Keep only the columns that actually contain nulls.
null_percentage = null_percentage[null_percentage > 0]

if not null_percentage.empty:
    print("Null percentage per column:")
    # Largest percentage first.
    print(null_percentage.sort_values(ascending=False))
else:
    # Report a clean dataset explicitly, matching the per-column count cell,
    # instead of leaving this cell silently without output.
    print("No null values found!")

## 4. Duplicate Rows Check

In [None]:
# Flag rows that fully repeat an earlier row, then count the flags.
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Total duplicate rows: {duplicate_count}")

In [None]:
# Count 'Name' entries that repeat a value seen earlier in the column.
name_is_repeat = df['Name'].duplicated()
duplicate_names = name_is_repeat.sum()
print(f"Duplicate country names: {duplicate_names}")

In [None]:
# Count 'Country_code_A3' entries that repeat a value seen earlier in the column.
code_is_repeat = df['Country_code_A3'].duplicated()
duplicate_codes = code_is_repeat.sum()
print(f"Duplicate country codes: {duplicate_codes}")

## 5. Column Name Cleaning

In [None]:
# Print the column labels exactly as loaded, before any cleaning.
print("Original column names:")
original_columns = df.columns.tolist()
print(original_columns)

In [None]:
# Collect labels that change when stripped, i.e. carry surrounding whitespace.
columns_with_spaces = [name for name in df.columns if name.strip() != name]
if not columns_with_spaces:
    print("No leading/trailing spaces in column names.")
else:
    print(f"Columns with leading/trailing spaces: {columns_with_spaces}")

In [None]:
# Work on a copy so the originally loaded frame `df` keeps its labels.
df_cleaned = df.copy()

# Remove leading/trailing whitespace from each label; letter case is not changed.
stripped_labels = df_cleaned.columns.str.strip()
df_cleaned.columns = stripped_labels

print("Column names after cleaning:")
print(df_cleaned.columns.tolist())

## 6. Value Counts for Categorical Columns

In [None]:
# Frequency of each distinct value in the 'IPCC_annex' column.
print("Value counts for 'IPCC_annex':")
ipcc_annex_counts = df['IPCC_annex'].value_counts()
ipcc_annex_counts

In [None]:
# Frequency of each distinct value in 'C_group_IM24_sh' (regional groups).
print("Value counts for 'C_group_IM24_sh':")
region_group_counts = df['C_group_IM24_sh'].value_counts()
region_group_counts

In [None]:
# Frequency of each distinct value in the 'Substance' column.
print("Value counts for 'Substance':")
substance_counts = df['Substance'].value_counts()
substance_counts

In [None]:
# Count the distinct values in 'Name', then preview the start of the column.
print(f"Number of unique countries: {df['Name'].nunique()}")
# Plain string literal: nothing is interpolated here, so no f-prefix is needed.
print("\nFirst 20 country names:")
# Takes the first 20 rows as-is, so any repeated names are not removed.
df['Name'].head(20).tolist()

In [None]:
# Count the distinct values in the 'Country_code_A3' column.
unique_code_count = df['Country_code_A3'].nunique()
print(f"Number of unique country codes: {unique_code_count}")

## 7. Data Cleaning Summary

In [None]:
# Recap of the checks above: both frame shapes, null and duplicate totals,
# and the dimensions of the originally loaded frame.
banner = "=" * 60
null_total = df.isnull().sum().sum()
duplicate_total = df.duplicated().sum()

print(banner)
print("DATA CLEANING SUMMARY")
print(banner)
print(f"\nOriginal Shape: {df.shape}")
print(f"Cleaned Shape: {df_cleaned.shape}")
print(f"\nTotal Null Values: {null_total}")
print(f"Total Duplicate Rows: {duplicate_total}")
print(f"\nColumns: {len(df.columns)}")
print(f"Rows: {len(df)}")
print(banner)