In [1]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from path import Path
import numpy as np

In [2]:
# Load the historical methane emissions table from CSV.
# index_col=0 turns the first CSV column into the row index.
file = "methane_hist_emissions.csv"
methane_emissions = pd.read_csv(file, index_col=0)

In [3]:
# Preview the first rows of the raw emissions table
methane_emissions.head()

Unnamed: 0_level_0,Sector,Gas,Unit,2018,2017,2016,2015,2014,2013,2012,...,1999,1998,1997,1996,1995,1994,1993,1992,1991,1990
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
China,Total including LUCF,CH4,MTCO2e,1238.95,1239.28,1242.43,1237.79,1206.51,1178.21,1151.28,...,826.57,814.8,792.55,824.63,798.9,774.1,758.01,752.1,743.51,730.78
China,Total excluding LUCF,CH4,MTCO2e,1238.63,1239.13,1242.15,1237.52,1206.21,1178.02,1151.1,...,825.69,813.83,791.73,823.86,798.45,773.65,757.56,751.66,743.07,730.34
Russia,Total including LUCF,CH4,MTCO2e,853.0,852.12,856.0,837.01,833.59,827.06,827.98,...,604.57,606.5,614.99,634.53,643.52,670.71,728.51,784.68,867.35,933.79
Russia,Total excluding LUCF,CH4,MTCO2e,849.57,850.17,852.55,835.56,830.22,825.64,824.27,...,598.7,597.72,610.38,622.22,639.73,666.92,724.72,780.89,863.52,929.97
China,Energy,CH4,MTCO2e,739.58,741.73,743.88,746.03,723.02,700.01,677.0,...,303.13,289.36,275.59,261.82,248.05,234.28,229.86,225.44,221.02,216.6


In [4]:
# Remove the Gas and Unit columns, which are not used in the rest of the analysis
unused_columns = ["Gas", "Unit"]
methane_emissions_df = methane_emissions.drop(columns=unused_columns)
methane_emissions_df.head()

Unnamed: 0_level_0,Sector,2018,2017,2016,2015,2014,2013,2012,2011,2010,...,1999,1998,1997,1996,1995,1994,1993,1992,1991,1990
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
China,Total including LUCF,1238.95,1239.28,1242.43,1237.79,1206.51,1178.21,1151.28,1106.41,1064.2,...,826.57,814.8,792.55,824.63,798.9,774.1,758.01,752.1,743.51,730.78
China,Total excluding LUCF,1238.63,1239.13,1242.15,1237.52,1206.21,1178.02,1151.1,1106.19,1063.83,...,825.69,813.83,791.73,823.86,798.45,773.65,757.56,751.66,743.07,730.34
Russia,Total including LUCF,853.0,852.12,856.0,837.01,833.59,827.06,827.98,817.7,804.22,...,604.57,606.5,614.99,634.53,643.52,670.71,728.51,784.68,867.35,933.79
Russia,Total excluding LUCF,849.57,850.17,852.55,835.56,830.22,825.64,824.27,814.67,801.36,...,598.7,597.72,610.38,622.22,639.73,666.92,724.72,780.89,863.52,929.97
China,Energy,739.58,741.73,743.88,746.03,723.02,700.01,677.0,635.32,593.65,...,303.13,289.36,275.59,261.82,248.05,234.28,229.86,225.44,221.02,216.6


In [5]:
# Inspect the data type of every remaining column
methane_emissions_df.dtypes

Sector     object
2018      float64
2017      float64
2016      float64
2015      float64
2014      float64
2013      float64
2012      float64
2011      float64
2010      float64
2009      float64
2008      float64
2007      float64
2006      float64
2005      float64
2004      float64
2003      float64
2002      float64
2001      float64
2000      float64
1999      float64
1998      float64
1997      float64
1996      float64
1995      float64
1994      float64
1993      float64
1992      float64
1991      float64
1990      float64
dtype: object

In [6]:
# Count the missing values in each column of the emissions table
methane_emissions_df.isna().sum()

Sector    0
2018      0
2017      0
2016      0
2015      0
2014      0
2013      0
2012      0
2011      0
2010      0
2009      0
2008      0
2007      0
2006      0
2005      0
2004      0
2003      0
2002      0
2001      0
2000      0
1999      0
1998      0
1997      0
1996      0
1995      0
1994      0
1993      0
1992      0
1991      0
1990      9
dtype: int64

In [7]:
# Collect the distinct country names found in the index of methane_emissions_df
countries = list(methane_emissions_df.index.unique())
sorted(countries)

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 "Côte d'Ivoire",
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'European Union (27)',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 

In [8]:
# Number of distinct country names in the emissions table
len(countries)

195

In [9]:
pip install xlrd

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\Oxana\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [10]:
# Load the world GDP workbook, using its first column as the row index.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so
# "World GDP Data.xls" must be placed in the notebook's working directory
# before the cells below can run.
file = "World GDP Data.xls"
GDP = pd.read_excel(file, index_col=0)
GDP.head()

FileNotFoundError: [Errno 2] No such file or directory: 'World GDP Data.xls'

In [None]:
# Remove the descriptive columns and the 2019/2020 columns from the GDP table
gdp_unused_columns = ["Country Code", "Indicator Name", "Indicator Code", "2019", "2020"]
GDP_df = GDP.drop(columns=gdp_unused_columns)
GDP_df

In [None]:
# Inspect the data type of every remaining GDP column
GDP_df.dtypes

In [None]:
# Count the missing values in each column of the GDP table
GDP_df.isna().sum()

In [None]:
# Keep only the GDP rows whose index label also appears in the emissions country list
has_emission_data = GDP_df.index.isin(countries)
GDP_df_same = GDP_df.loc[has_emission_data]
GDP_df_same

In [None]:
# List the GDP index labels that found no match in the emissions country list;
# some of them are probably the same countries under a different spelling
unmatched = ~GDP_df.index.isin(countries)
GDP_rest = GDP_df.loc[unmatched]
sorted(GDP_rest.index.unique())

In [None]:
# Rename GDP index labels so their spelling lines up with the emissions data
gdp_name_fixes = {
    "Bahamas, The": "Bahamas",
    "Brunei Darussalam": "Brunei",
    "Egypt, Arab Rep.": "Egypt",
    "Gambia, The": "Gambia",
    "Iran, Islamic Rep.": "Iran",
    "Micronesia, Fed. Sts.": "Micronesia",
    "Russian Federation": "Russia",
    "Syrian Arab Republic": "Syria",
    "Venezuela, RB": "Venezuela",
    "Yemen, Rep.": "Yemen",
}
GDP_clean = GDP_df.rename(index=gdp_name_fixes)

In [None]:
# Rename index labels in methane_emissions_df; the new spellings are presumably
# the ones used by the GDP table (verify against the GDP index)
methane_name_fixes = {
    "Democratic Republic of the Congo": "Congo, Dem. Rep.",
    "Kyrgyzstan": "Kyrgyz Republic",
    "Laos": "Lao PDR",
    "Republic of Congo": "Congo, Rep.",
    "Saint Kitts and Nevis": "St. Kitts and Nevis",
    "Saint Lucia": "St. Lucia",
    "Saint Vincent and the Grenadines": "St. Vincent and the Grenadines",
    "Slovakia": "Slovak Republic",
}
methane_clean = methane_emissions_df.rename(index=methane_name_fixes)

In [None]:
# Collect the distinct country names from methane_clean (after the renaming)
all_countries = list(methane_clean.index.unique())
sorted(all_countries)

In [None]:
# Keep only the GDP rows whose corrected country name also appears in the emissions data
in_emissions = GDP_clean.index.isin(all_countries)
GDP_same = GDP_clean.loc[in_emissions]
GDP_same

In [None]:
# Count the missing values in each column of GDP_same
GDP_same.isna().sum()

In [None]:
# Remove the 1990-1999 columns, where GDP values are missing for multiple countries
gdp_sparse_years = [str(year) for year in range(1990, 2000)]
GDP_2000 = GDP_same.drop(columns=gdp_sparse_years)
GDP_2000

In [None]:
# Export the GDP_2000 data to CSV to inspect its null values
# GDP_2000.to_csv("GDP_2000.csv", index = True)

In [None]:
# Read the normalized GDP data back in. Per the original note, this CSV was
# prepared outside the notebook: NaNs filled in, South Sudan deleted, and
# values divided by 1000000000.
file = "GDP_2000_normalized.csv"
GDP_2000_normalized = pd.read_csv(file, index_col=0)

In [None]:
# Collect the distinct country names from the index of GDP_2000_normalized
same_countries = list(GDP_2000_normalized.index.unique())
sorted(same_countries)

In [None]:
# Keep only the emissions rows whose country also appears in GDP_2000_normalized,
# then report how many distinct countries remain
in_gdp = methane_clean.index.isin(same_countries)
methane_same = methane_clean.loc[in_gdp]
len(methane_same.index.unique())

In [None]:
# Remove the 1990-1999 columns from the emissions data, i.e. the same year
# columns that were removed from the GDP data
methane_dropped_years = [str(year) for year in range(1990, 2000)]
methane_2000 = methane_same.drop(columns=methane_dropped_years)
methane_2000

In [None]:
# Average the emissions per (country, sector) pair, then transpose so the year
# columns become rows and each (country, sector) pair becomes a column
methane_snapshot = methane_2000.copy(deep=True)
sector_means = methane_snapshot.groupby([methane_snapshot.index, 'Sector']).mean()
df_transposed = sector_means.transpose()
df_transposed.head()

In [None]:
# Transpose the GDP table so its columns become rows and each country becomes a column
gdp_transposed = GDP_2000_normalized.transpose()
# Label every column as (country, 'GDP') with a real two-level MultiIndex.
# rename() with tuple values only yields a flat Index of tuples, which leaves
# this frame with one column level while df_transposed has two; pandas
# deprecated merging frames with different column-level counts, and recent
# versions are expected to reject it (NOTE(review): confirm for the installed version).
gdp_transposed.columns = pd.MultiIndex.from_tuples(
    [(country, 'GDP') for country in gdp_transposed.columns]
)
gdp_transposed.head()

In [None]:
# Plot
#df_transposed.plot(kind='line', legend=False, figsize=(20,20))


In [None]:
# Merge the transposed methane and GDP dataframes.
# Left join on the index of both frames; validate="one_to_one" makes pandas
# check that the index values used as merge keys are unique on both sides.
merged = df_transposed.merge(gdp_transposed,how="left", right_index=True, left_index=True, validate="one_to_one")
# Rebuild the column labels as a two-level MultiIndex from their (x, y) tuples
cols = pd.MultiIndex.from_tuples([(x,y) for x,y in merged.columns])

# Flip the table so the former columns become rows, conformed to `cols`;
# from here on `merged` has one row per label pair
merged = merged.transpose().reindex(cols)
merged.transpose().head()
#merged = merged.dropna(axis=0, how='any')


In [None]:
# Exporting merged_transposed data in csv to get null values
# merged.transpose().to_csv("merged_transposed.csv", index = True)

In [None]:
# Count the missing values in each column of merged (one column per year)
merged.isna().sum()

In [None]:
# Count the missing values for each row label of merged (country and sector)
merged.transpose().isna().sum()

In [None]:
# Show where the NaN values are in the transposed table.
# Indexing a frame with a boolean frame (df[df.isna()]) does not filter: it
# keeps the full shape and blanks every cell where the mask is False, so the
# result is entirely NaN. Instead, keep only the rows and the columns that
# contain at least one missing value.
merged_by_year = merged.transpose()
nan_mask = merged_by_year.isna()
selected_rows = merged_by_year.loc[nan_mask.any(axis=1), nan_mask.any(axis=0)]
selected_rows

In [None]:
# Plot every series recorded for China as a line chart with a log-scaled y axis
china_rows = merged.xs('China', level=0, drop_level=False)
# Reverse the column order before transposing (presumably the columns run from
# newest to oldest year, so this puts the x axis in chronological order)
china_by_year = china_rows.iloc[:, ::-1].transpose()
china_by_year.plot(kind="line", figsize=(20,20), legend=True, logy=True)

In [None]:
# merged_data_scaled.xs('1', level=0, drop_level=False).iloc[:,::-1].transpose().plot(kind="line", figsize=(20,20), legend=True, logy=True)

In [None]:
# See the correlation between our values.
# Compute the correlation matrix once (the original evaluated it twice in one expression).
correlations = merged.transpose().corr()
# Keep coefficients above 0.7, then discard rows and columns with nothing left
strong_correlations = correlations[correlations > 0.7]
strong_correlations.dropna(how='all').dropna(how='all', axis=1)

In [None]:
# Encode the Sector column as indicator (dummy) columns, one per sector value
methane_emissions_encoded = pd.get_dummies(methane_same, columns=["Sector"])
methane_emissions_encoded.head()

In [None]:
# Target: the 2018 emissions column.
# Features: every other encoded column except 1990 (presumably left out
# because that year has missing values -- confirm).
target_year = "2018"
y = methane_emissions_encoded[target_year]
X = methane_emissions_encoded.drop(columns=[target_year, "1990"])

In [None]:
# Create a linear regression model and fit it on the features X and target y
regr = LinearRegression()
regr.fit(X, y)

In [None]:
# Predict on X, the same data the model was fitted on, and print the shape
# of the prediction array
y_pred = regr.predict(X)
print(y_pred.shape)

In [None]:
# Print the fitted model's coefficients and intercept
print(regr.coef_)
print(regr.intercept_)

In [None]:
# Join the "Total including LUCF" emissions rows with the GDP rows for the
# same country (inner join on the index of both frames).
total_with_lucf = methane_same.loc[methane_same['Sector'] == "Total including LUCF"]
# merge() applies suffixes in (left, right) order. The left frame holds the
# methane data and the right frame the GDP data, so overlapping columns must be
# tagged "_meth" first and "_gdp" second; the original had the two swapped,
# which mislabelled every shared column.
merged_df = total_with_lucf.merge(
    GDP_same,
    how='inner',
    left_index=True,
    right_index=True,
    validate="one_to_one",
    suffixes=("_meth", "_gdp"),
)
#merged_df.dropna(how='any', axis=0, inplace=True)

In [None]:
# Preview the first rows of the joined emissions/GDP table
merged_df.head()

In [None]:
#merged_df.corr().style.bar(color="#d65f5f")

In [None]:
# methane_emissions_df.transpose().groupby('Sector').plot(kind='line',y="1990",use_index=False, figsize=(35,10), subplots=True, sharey=True)
#methane_emissions_df.plot(kind='line', figsize=(15,10), sharex=True, sharey=True, loglog=True)