# Import libraries and load datasets

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Read the CO2 data and its codebook; both files use ';' as the field separator.
dataset = pd.read_csv('owid-co2-data.csv', sep=';')
dataset_codebook = pd.read_csv('owid-co2-codebook.csv', sep=';')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
# Preview the first rows of the loaded dataset.
dataset.head()

# Variables and units

In [None]:
# Unpack the DataFrame dimensions and report them
row, col = dataset.shape
print(f"Number of rows: {row}")
print(f"Number of columns: {col}")

In [None]:
# Print pandas' summary of the DataFrame (columns, non-null counts, dtypes).
dataset.info()

In [None]:
# Show the first 55 rows of the codebook.
dataset_codebook.head(55)

# Temporal coverage

In [None]:
# Temporal coverage: first and last year present in the dataset
years = dataset['year']
start_year, end_year = years.min(), years.max()

print(f'The dataset covers the period [{start_year} - {end_year}].')

In [None]:
# For every year, count the non-null entries available in each column
yearly_groups = dataset.groupby('year')
by_year = yearly_groups.count()

In [None]:
# Spatial coverage: number of distinct values in the "country" column.
# NOTE(review): presumably this column also holds aggregate entities
# (regions, "World", ...) — verify before reporting it as a country count.
n_countries = dataset["country"].nunique()
n_countries

In [None]:
# Line plot of the per-year non-null counts for the other greenhouse-gas columns.
other_ghg_cols = ['methane', 'nitrous_oxide', 'total_ghg']
by_year[other_ghg_cols].plot(
    figsize=(12, 6),
    title='Count of available values per year for other GHG',
);

# Missing data

Dealing with missing values is one of the trickiest, yet most common, parts of data cleaning. 
We split the heatmap into two parts so that the missing data can be visualized more easily.

In [None]:
# Missing-value heatmap, part 1: the first 20 columns (positions 0-19).
# Colour code: blue = value present, yellow = value missing.
colours = ['#000099', '#ffff00']
cols = dataset.columns[:20]
missing_mask = dataset[cols].isnull()
sns.heatmap(missing_mask, cmap=sns.color_palette(colours))

In [None]:
# Missing-value heatmap, part 2: the columns at positions 20-37.
# Colour code: blue = value present, yellow = value missing.
# NOTE(review): any column beyond position 37 is not plotted — confirm the column count.
colours = ['#000099', '#ffff00']
cols = dataset.columns[20:38]
missing_mask = dataset[cols].isnull()
sns.heatmap(missing_mask, cmap=sns.color_palette(colours))

In [None]:
# Count the missing (null) values in each column.
dataset.isnull().sum()

We create a list below showing the percentage of missing values for each of the features. This list is a useful summary that can complement the heatmap visualization.

In [None]:
# Share of missing rows per column, printed as a whole-number percentage
missing_share = dataset.isnull().mean()
for col, pct_missing in missing_share.items():
    print('{} - {}%'.format(col, round(pct_missing * 100)))

# Extract the outliers - boxplot

A boxplot is a standardized way of displaying the distribution of data based on a five number summary (“minimum”, first quartile (Q1), median, third quartile (Q3), and “maximum”). It can tell you about your outliers and what their values are. It can also tell you if your data is symmetrical, how tightly your data is grouped, and if and how your data is skewed.

In [None]:
# First version: box plots of the other greenhouse-gas columns.
# Only the floating-point columns are kept in the working copy.
draft = dataset.select_dtypes(include=['float'])
ghg_cols = ['methane', 'total_ghg', 'nitrous_oxide']
draft[ghg_cols].boxplot(figsize=(15, 5))

In [None]:
# Preview the first rows of the float-only subset.
draft.head()

We set an upper limit at the 95th percentile of each variable. Values above this limit are considered outliers.

In [None]:
# Rows whose methane value lies above the 95th percentile are treated as outliers
methane_series = dataset['methane']
upper_lim = methane_series.quantile(0.95)
df_outliers = dataset.loc[methane_series > upper_lim]
df_outliers['methane']

In [None]:
# Rows whose nitrous_oxide value lies above the 95th percentile are treated as outliers
nitrous_oxide_series = dataset['nitrous_oxide']
upper_lim = nitrous_oxide_series.quantile(0.95)
df_outliers = dataset.loc[nitrous_oxide_series > upper_lim]
df_outliers['nitrous_oxide']

In [None]:
# Rows whose total_ghg value lies above the 95th percentile are treated as outliers
total_ghg_series = dataset['total_ghg']
upper_lim = total_ghg_series.quantile(0.95)
df_outliers = dataset.loc[total_ghg_series > upper_lim]
df_outliers['total_ghg']

# Extract the outliers - z-score

The Z-score, also called the standard score, is an important concept in statistics. It tells us whether a data value is greater or smaller than the mean and how far away from the mean it is. More specifically, the Z-score is the number of standard deviations that separate a data point from the mean. Formula: $z = (x - \text{mean}) / \text{standard deviation}$. Here we assume that if the absolute value of the Z-score of a data point is greater than 3, the data point is quite different from the other data points. Such a data point can be an outlier.

In [None]:
# Second version: detect outliers using the Z-score
def detect_outliers(data, threshold=3):
    """Return the values of *data* whose absolute z-score exceeds *threshold*.

    The z-score of a value ``x`` is ``(x - mean) / std``, where ``std`` is the
    population standard deviation (``ddof=0``). Missing values (NaN) are
    ignored both when computing the statistics and when flagging outliers.

    A new list is built on every call, so results obtained for different
    columns never accumulate into one another.

    Args:
        data: One-dimensional numeric sequence (list, NumPy array, pandas
            Series, ...).
        threshold: Absolute z-score above which a value counts as an
            outlier. Defaults to 3.

    Returns:
        A list of floats holding the outlying values, in their original
        order. The list is empty when *data* has no usable values or when
        all values are identical (zero standard deviation).
    """
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]  # NaN can never be an outlier
    if values.size == 0:
        return []

    std = values.std()
    if std == 0:
        # Every value equals the mean; avoid a division by zero.
        return []

    z_scores = (values - values.mean()) / std
    return values[np.abs(z_scores) > threshold].tolist()

In [None]:
# Collect the methane values flagged as outliers by the z-score rule,
# then report how many were found.
methane_out= detect_outliers(dataset['methane'])
print("They are \t", len(methane_out),"outliers in methane using z-score method.")

In [None]:
# Collect the nitrous_oxide values flagged as outliers by the z-score rule,
# then report how many were found.
nitrous_oxide_out= detect_outliers(dataset['nitrous_oxide'])
print("They are \t", len(nitrous_oxide_out),"outliers in nitrous_oxide using z-score method.")

In [None]:
# Collect the total_ghg values flagged as outliers by the z-score rule,
# then report how many were found.
total_ghg_out= detect_outliers(dataset['total_ghg'])
print("They are \t", len(total_ghg_out),"outliers in total_ghg using z-score method.")

# Unnecessary : Uninformative / Repetitive 

Sometimes a feature is uninformative because too many of its rows hold the same value.
To find such features, we build a list of the features with a high percentage of identical values. For example, below we flag the features in which over 95% of the rows share the same value. We would look at the features after the variable "year", which are more meaningful, as we examine the values of each column.

In [None]:
# For each column, find the share of rows taken by its most frequent value
# (NaN counted as a value) and remember the columns where that share is above 95%.
num_rows = len(dataset)
low_information_cols = []

for col in dataset.columns:
    # value_counts sorts by frequency, so the first entry is the most common value
    cnts = dataset[col].value_counts(dropna=False)
    top_pct = cnts.iloc[0] / num_rows

    if top_pct > 0.95:
        low_information_cols.append(col)

    # The share and the full value counts are printed for every column
    print('{0}: {1:.5f}%'.format(col, top_pct*100))
    print(cnts, end='\n\n')

We can look into these variables one by one to see whether they are informative or not. We won’t show the details here.