In [None]:
import pandas as pd # type: ignore
import matplotlib.pyplot as plt # type: ignore
import seaborn as sns # type: ignore

In [None]:
# Read the climate indicators CSV into a DataFrame and preview its first five rows.
df = pd.read_csv('data/Climate_Change_Indicators.csv')
df.head()

In [None]:
# Count the missing (NaN/None) entries in every column.
# The result is a Series indexed by column name.
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)

###### The line of code `missing_values = df.isnull().sum()` is used to identify and count the number of missing (null) values in each column of a pandas DataFrame. 
###### The `df.isnull()` method returns a DataFrame of booleans with the same shape as `df`: `True` marks a missing value (NaN) and `False` marks a non-missing value. The `.sum()` method then adds up the `True` values along each column, giving the total count of missing values per column of the original DataFrame, which is finally printed.
###### `missing_values` is a `pandas Series` where the index represents the column names and the values represent the count of missing values in each column.

###### Since there are no missing values in our original data, there is no need to handle or clean missing values.

In [None]:
# Check for inconsistent values

# Expected (min, max) range for each validated column; currently only 'Year'.
criteria = {'Year': (1900, 2023)}

# Function to check for inconsistent values
def check_inconsistent_values(df, criteria):
    """Collect the rows whose values fall outside the allowed range of each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to validate.
    criteria : dict
        Maps a column name to a ``(min_val, max_val)`` pair; both bounds are
        treated as valid values.

    Returns
    -------
    dict
        Maps every column name in ``criteria`` to the sub-frame of ``df`` whose
        value in that column is below ``min_val`` or above ``max_val``. The
        sub-frame is empty when every value is within range.
    """
    return {
        column: df.loc[df[column].lt(min_val) | df[column].gt(max_val)]
        for column, (min_val, max_val) in criteria.items()
    }

In [None]:
# Run the range check for every column listed in `criteria`
inconsistent_values = check_inconsistent_values(df, criteria)

# Report, column by column, either a clean result or the offending rows
for column, values in inconsistent_values.items():
    if values.empty:
        print(f"No inconsistent values in '{column}' column.")
    else:
        print(f"Inconsistent values in '{column}' column:\n", values)

###### In this code, we compute summary statistics (count, mean, standard deviation, minimum, quartiles and maximum) for every numeric column of the dataset using `df.describe()`.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
# The cell ends with the bare expression so Jupyter renders the frame as a
# formatted table rather than the plain-text dump that print() produces.
summary_statistics = df.describe()
summary_statistics

In [None]:
# Aggregate the data by year, computing the average for each climate variable.
# numeric_only=True restricts the mean to numeric columns, so a non-numeric
# column cannot make the aggregation raise (pandas >= 2.0 no longer drops
# such columns silently).

df_grouped_by_year = df.groupby("Year").mean(numeric_only=True)
df_grouped_by_year

Documentation - Data Preparation

1. Load the Dataset:
- The dataset Climate_Change_Indicators.csv is loaded into a Pandas DataFrame using the following command:
df = pd.read_csv('data/Climate_Change_Indicators.csv')
df.head(5)

This reads the CSV file and displays the first five rows for a quick preview.



2. Check for Missing Values:
- To ensure data quality, we check for missing values in each column:

missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

The .isnull() method identifies missing values (NaN).
.sum() counts the number of missing values in each column.
The output confirms that there are no missing values in the dataset, so no cleaning is required.

3. Check for Inconsistencies
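- Before proceeding with analysis, the dataset is checked for inconsistencies. In this notebook the check verifies that every value in the `Year` column lies within the expected range (1900 to 2023); any out-of-range rows are printed for review so that corrections can be applied if necessary.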


4. Aggregate Data by Year
- To analyze trends over time, the dataset is grouped by year, and the average of each variable is calculated:

df_grouped_by_year = df.groupby("Year").mean()
df_grouped_by_year.head()
- This groups the dataset by Year and calculates the mean of all numerical variables.

Conclusion:
- The data is successfully loaded.
- No missing values were found.
- The Year column was verified for inconsistencies, ensuring only values between 1900 and 2023 exist.
- The data was aggregated by year to prepare for further analysis.



### Calculate descriptive statistics (mean, median, range, standard deviation, etc.) for Global Temperature and CO2 Concentration.

In [None]:
# Descriptive statistics for Global Temperature

globalTemperature_col = df['Global Average Temperature (°C)']

# Keys are inserted in the order Mean, Median, Minimum, Maximum, Range, Std_dev.
globalTemperature_stats = dict(
    Mean=globalTemperature_col.mean(),
    Median=globalTemperature_col.median(),
    Minimum=globalTemperature_col.min(),
    Maximum=globalTemperature_col.max(),
    Range=globalTemperature_col.max() - globalTemperature_col.min(),
    Std_dev=globalTemperature_col.std(),
)
globalTemperature_stats

In [None]:
# Descriptive statistics for CO2 Concentration

co2_concentration_col = df['CO2 Concentration (ppm)']

# Keys are inserted in the order Mean, Median, Minimum, Maximum, Range, Std_dev.
co2_concentration_stats = dict(
    Mean=co2_concentration_col.mean(),
    Median=co2_concentration_col.median(),
    Minimum=co2_concentration_col.min(),
    Maximum=co2_concentration_col.max(),
    Range=co2_concentration_col.max() - co2_concentration_col.min(),
    Std_dev=co2_concentration_col.std(),
)
co2_concentration_stats

In [None]:
# Calculate descriptive statistics for 'Sea Level Rise (mm)'
sea_level_rise_col = df['Sea Level Rise (mm)']

# Each raw statistic is rounded to two decimals; the tuple order matches the
# labels in 'Statistics'.
sea_level_rise_stats = {
    'Statistics': ['Mean', 'Median', 'Range', 'Standard Deviation', 'Minimum', 'Maximum'],
    'Value': [
        round(stat, 2)
        for stat in (
            sea_level_rise_col.mean(),
            sea_level_rise_col.median(),
            sea_level_rise_col.max() - sea_level_rise_col.min(),
            sea_level_rise_col.std(),
            sea_level_rise_col.min(),
            sea_level_rise_col.max(),
        )
    ],
}

In [None]:
# Turn the label/value dictionary into a two-column table for display.
sea_level_rise_stats_df = pd.DataFrame(data=sea_level_rise_stats)
sea_level_rise_stats_df

###### In this code, we have calculated descriptive statistics (Mean, Median, Range, Standard Deviation, Minimum and Maximum) for Sea Level Rise (mm) and rounded them to two decimal places.

In [None]:
# Calculate descriptive statistics for 'Arctic Ice Area (million km²)'
arctic_ice_area_col = df['Arctic Ice Area (million km²)']

# Each raw statistic is rounded to two decimals; the tuple order matches the
# labels in 'Statistics'.
arctic_ice_area_stats = {
    'Statistics': ['Mean', 'Median', 'Range', 'Standard Deviation', 'Minimum', 'Maximum'],
    'Value': [
        round(stat, 2)
        for stat in (
            arctic_ice_area_col.mean(),
            arctic_ice_area_col.median(),
            arctic_ice_area_col.max() - arctic_ice_area_col.min(),
            arctic_ice_area_col.std(),
            arctic_ice_area_col.min(),
            arctic_ice_area_col.max(),
        )
    ],
}

In [None]:
# Turn the label/value dictionary into a two-column table for display.
arctic_ice_area_stats_df = pd.DataFrame(data=arctic_ice_area_stats)
arctic_ice_area_stats_df

###### In this code, we have calculated descriptive statistics (Mean, Median, Range, Standard Deviation, Minimum and Maximum) for Arctic Ice Area (million km²) and rounded them to two decimal places.

In [None]:
# Creating histograms and boxplots (Global Average Temperature (°C) and CO2 Concentration (ppm))

# Histogram for Global Average Temperature (°C)
# plt.subplots (rather than instantiating the plt.Figure class) registers the
# figure with pyplot, so the requested figsize applies to the plot shown below.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_grouped_by_year['Global Average Temperature (°C)'], bins=20, color='blue', alpha=0.7)
ax.set_title('Distribution of Global Average Temperature')
ax.set_xlabel('Temperature (°C)')
ax.set_ylabel('Frequency')  # number of yearly means that fall inside each temperature bin
plt.show()


In [None]:
# Boxplot for Global Average Temperature (°C)
# plt.subplots (rather than instantiating the plt.Figure class) registers the
# figure with pyplot, so the requested figsize applies to the plot shown below.
fig, ax = plt.subplots(figsize=(10, 6))
# Pass the series as `x` explicitly: a positional argument is read as `data`
# by seaborn >= 0.12 and drawn vertically, which would leave the temperature
# label on the wrong axis.
sns.boxplot(x=df_grouped_by_year['Global Average Temperature (°C)'], ax=ax)
ax.set_title('Boxplot of Global Average Temperature')
ax.set_xlabel('Temperature (°C)')
plt.show()

In [None]:
# Histogram for CO2 Concentration (ppm)
# plt.subplots (rather than instantiating the plt.Figure class) registers the
# figure with pyplot, so the requested figsize applies to the plot shown below.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_grouped_by_year['CO2 Concentration (ppm)'], bins=20, color='red', alpha=0.7)
ax.set_title('Distribution of CO2 Concentration (ppm)')
ax.set_xlabel('CO2 Concentration (ppm)')
ax.set_ylabel('Frequency')  # number of yearly means that fall inside each concentration bin
plt.show()

In [None]:
# Boxplot for CO2 Concentration (ppm)
# plt.subplots (rather than instantiating the plt.Figure class) registers the
# figure with pyplot, so the requested figsize applies to the plot shown below.
fig, ax = plt.subplots(figsize=(10, 6))
# Pass the series as `x` explicitly: a positional argument is read as `data`
# by seaborn >= 0.12 and drawn vertically, which would leave the
# concentration label on the wrong axis.
sns.boxplot(x=df_grouped_by_year['CO2 Concentration (ppm)'], color='red', ax=ax)
ax.set_title('Boxplot of CO2 Concentration (ppm)')
ax.set_xlabel('CO2 Concentration (ppm)')
plt.show()