In [1]:
import pandas as pd
df=pd.read_csv("owid-co2-data.csv")
print(df.head())

       country  year iso_code  population  gdp  cement_co2  \
0  Afghanistan  1850      AFG   3752993.0  NaN         NaN   
1  Afghanistan  1851      AFG   3767956.0  NaN         NaN   
2  Afghanistan  1852      AFG   3783940.0  NaN         NaN   
3  Afghanistan  1853      AFG   3800954.0  NaN         NaN   
4  Afghanistan  1854      AFG   3818038.0  NaN         NaN   

   cement_co2_per_capita  co2  co2_growth_abs  co2_growth_prct  ...  \
0                    NaN  NaN             NaN              NaN  ...   
1                    NaN  NaN             NaN              NaN  ...   
2                    NaN  NaN             NaN              NaN  ...   
3                    NaN  NaN             NaN              NaN  ...   
4                    NaN  NaN             NaN              NaN  ...   

   share_global_other_co2  share_of_temperature_change_from_ghg  \
0                     NaN                                   NaN   
1                     NaN                                 0.165   

In [2]:
data = df[["country", "year", "co2", "co2_growth_prct", "population", 
           "ghg_per_capita", "land_use_change_co2", "temperature_change_from_co2", 
           "gdp", "iso_code"]].copy()
print(data.info())
print(data.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48058 entries, 0 to 48057
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   country                      48058 non-null  object 
 1   year                         48058 non-null  int64  
 2   co2                          30308 non-null  float64
 3   co2_growth_prct              25136 non-null  float64
 4   population                   39495 non-null  float64
 5   ghg_per_capita               6354 non-null   float64
 6   land_use_change_co2          37022 non-null  float64
 7   temperature_change_from_co2  41724 non-null  float64
 8   gdp                          14564 non-null  float64
 9   iso_code                     39717 non-null  object 
dtypes: float64(7), int64(1), object(2)
memory usage: 3.7+ MB
None
       country  year  co2  co2_growth_prct  population  ghg_per_capita  \
0  Afghanistan  1850  NaN              NaN   3752993.0

In [9]:
data['ghg_per_capita']=data.groupby("country")["ghg_per_capita"].transform('median')
global_median=data["ghg_per_capita"].median()
data["ghg_per_capita"].fillna(global_median,inplace=True)

data["gdp"]=data.groupby(["country","year"])["gdp"].transform('median')
gdp_median=data["gdp"].median()
data["gdp"].fillna(gdp_median,inplace=True)

data["population"]=data.groupby(["country","year"])["population"].transform('median')
population_median=data["population"].median()
data["population"].fillna(population_median,inplace=True)

data['iso_code'].fillna(data['iso_code'].mode()[0], inplace=True)

data["co2"]=data.groupby(["country","year"])["co2"].transform('mean')
data["co2"].fillna(data["co2"].mean(),inplace=True)

data["co2_growth_prct"]=data.groupby(["country","year"])["co2"].transform('mean')
data["co2_growth_prct"].fillna(data["co2_growth_prct"].mean(),inplace=True)

print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48058 entries, 0 to 48057
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   country                      48058 non-null  object 
 1   year                         48058 non-null  int64  
 2   co2                          48058 non-null  float64
 3   co2_growth_prct              48058 non-null  float64
 4   population                   48058 non-null  float64
 5   ghg_per_capita               48058 non-null  float64
 6   land_use_change_co2          37022 non-null  float64
 7   temperature_change_from_co2  41724 non-null  float64
 8   gdp                          48058 non-null  float64
 9   iso_code                     48058 non-null  object 
dtypes: float64(7), int64(1), object(2)
memory usage: 3.7+ MB
None


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["ghg_per_capita"].fillna(global_median,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["gdp"].fillna(gdp_median,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [12]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document

In [14]:
clean_doc=[]
for index,row in data.iterrows():
    document=(
        f"The climate intelligence metrics for {row['country']} in {row['year']} show: "
        f"Total CO2 Emissions: {row['co2']:.2f}, "
        f"CO2 Growth (%): {row['co2_growth_prct']:.2f}%, "
        f"Population: {row['population']:.0f}, "
        f"GHG per Capita: {row['ghg_per_capita']:.2f}, "
        f"Land Use CO2: {row['land_use_change_co2']:.2f}, "
        f"GDP: {row['gdp']:.2f}."
    )

    doc=Document(
        page_content=document,
        metadata={
            "source": "owid-co2-data.csv",
                "country": row['country'],
                "year": int(row['year']), 
            }
    )
    
    clean_doc.append(doc)

pdf_loader = PyPDFLoader("esg_report_pdf.pdf")
pdf_docs = pdf_loader.load()

all_docs=clean_doc + pdf_docs

In [22]:
csv_rows=[]
for docs in all_docs[:500]:
    csv_rows.append(docs)
print(csv_rows)
    

[Document(metadata={'source': 'owid-co2-data.csv', 'country': 'Afghanistan', 'year': 1850}, page_content='The climate intelligence metrics for Afghanistan in 1850 show: Total CO2 Emissions: 391.27, CO2 Growth (%): 391.27%, Population: 3752993, GHG per Capita: 0.76, Land Use CO2: 2.98, GDP: 25979985920.00.'), Document(metadata={'source': 'owid-co2-data.csv', 'country': 'Afghanistan', 'year': 1851}, page_content='The climate intelligence metrics for Afghanistan in 1851 show: Total CO2 Emissions: 391.27, CO2 Growth (%): 391.27%, Population: 3767956, GHG per Capita: 0.76, Land Use CO2: 3.00, GDP: 25979985920.00.'), Document(metadata={'source': 'owid-co2-data.csv', 'country': 'Afghanistan', 'year': 1852}, page_content='The climate intelligence metrics for Afghanistan in 1852 show: Total CO2 Emissions: 391.27, CO2 Growth (%): 391.27%, Population: 3783940, GHG per Capita: 0.76, Land Use CO2: 3.02, GDP: 25979985920.00.'), Document(metadata={'source': 'owid-co2-data.csv', 'country': 'Afghanista