## Regression Project

### Problem Statement
Agricultural activities have a negative impact on the environment by contributing to CO2 emissions, which in turn drive climate change.

In [1]:
##these are the packages needed for data cleaning and eda
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

### Import the dataset

In [2]:
# Load the agricultural CO2 emissions CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="co2_emissions_from_agri.csv")


### Data Overview
After importing the data, I'm going to get a high level overview to understand its structure.

In [3]:
# Preview the first 5 rows of the dataframe
df.head(5)

Unnamed: 0,Area,Year,Savanna fires,Forest fires,Crop Residues,Rice Cultivation,Drained organic soils (CO2),Pesticides Manufacturing,Food Transport,Forestland,...,Manure Management,Fires in organic soils,Fires in humid tropical forests,On-farm energy use,Rural population,Urban population,Total Population - Male,Total Population - Female,total_emission,Average Temperature °C
0,Afghanistan,1990,14.7237,0.0557,205.6077,686.0,0.0,11.807483,63.1152,-2388.803,...,319.1763,0.0,0.0,,9655167.0,2593947.0,5348387.0,5346409.0,2198.963539,0.536167
1,Afghanistan,1991,14.7237,0.0557,209.4971,678.16,0.0,11.712073,61.2125,-2388.803,...,342.3079,0.0,0.0,,10230490.0,2763167.0,5372959.0,5372208.0,2323.876629,0.020667
2,Afghanistan,1992,14.7237,0.0557,196.5341,686.0,0.0,11.712073,53.317,-2388.803,...,349.1224,0.0,0.0,,10995568.0,2985663.0,6028494.0,6028939.0,2356.304229,-0.259583
3,Afghanistan,1993,14.7237,0.0557,230.8175,686.0,0.0,11.712073,54.3617,-2388.803,...,352.2947,0.0,0.0,,11858090.0,3237009.0,7003641.0,7000119.0,2368.470529,0.101917
4,Afghanistan,1994,14.7237,0.0557,242.0494,705.6,0.0,11.712073,53.9874,-2388.803,...,367.6784,0.0,0.0,,12690115.0,3482604.0,7733458.0,7722096.0,2500.768729,0.37225


In [4]:
# Swap every space in the column labels for an underscore (literal match, not a regex)
df.columns = df.columns.str.replace(" ", "_", regex=False)

In [5]:
# Preview the last 5 rows of the dataframe
df.tail(5)

Unnamed: 0,Area,Year,Savanna_fires,Forest_fires,Crop_Residues,Rice_Cultivation,Drained_organic_soils_(CO2),Pesticides_Manufacturing,Food_Transport,Forestland,...,Manure_Management,Fires_in_organic_soils,Fires_in_humid_tropical_forests,On-farm_energy_use,Rural_population,Urban_population,Total_Population_-_Male,Total_Population_-_Female,total_emission,Average_Temperature_°C
6960,Zimbabwe,2016,1190.0089,232.5068,70.9451,7.4088,0.0,75.0,251.1465,76500.2982,...,282.5994,0.0,0.0,417.315,10934468.0,5215894.0,6796658.0,7656047.0,98491.026347,1.12025
6961,Zimbabwe,2017,1431.1407,131.1324,108.6262,7.9458,0.0,67.0,255.7975,76500.2982,...,255.59,0.0,0.0,398.1644,11201138.0,5328766.0,6940631.0,7810471.0,97159.311553,0.0465
6962,Zimbabwe,2018,1557.583,221.6222,109.9835,8.1399,0.0,66.0,327.0897,76500.2982,...,257.2735,0.0,0.0,465.7735,11465748.0,5447513.0,7086002.0,7966181.0,97668.308205,0.516333
6963,Zimbabwe,2019,1591.6049,171.0262,45.4574,7.8322,0.0,73.0,290.1893,76500.2982,...,267.5224,0.0,0.0,444.2335,11725970.0,5571525.0,7231989.0,8122618.0,98988.062799,0.985667
6964,Zimbabwe,2020,481.9027,48.4197,108.3022,7.9733,0.0,73.0,238.7639,76500.2982,...,266.7316,0.0,0.0,444.2335,11980005.0,5700460.0,7385220.0,8284447.0,96505.221853,0.189


In [6]:
# Check the shape of the data; `df.shape` is a (number of rows, number of columns) tuple
df.shape

(6965, 31)

The dataframe consists of:
1. Rows = 6965
2. Columns = 31

In [7]:
# Print a summary of the columns: non-null count and dtype for each one
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6965 entries, 0 to 6964
Data columns (total 31 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Area                             6965 non-null   object 
 1   Year                             6965 non-null   int64  
 2   Savanna_fires                    6934 non-null   float64
 3   Forest_fires                     6872 non-null   float64
 4   Crop_Residues                    5576 non-null   float64
 5   Rice_Cultivation                 6965 non-null   float64
 6   Drained_organic_soils_(CO2)      6965 non-null   float64
 7   Pesticides_Manufacturing         6965 non-null   float64
 8   Food_Transport                   6965 non-null   float64
 9   Forestland                       6472 non-null   float64
 10  Net_Forest_conversion            6472 non-null   float64
 11  Food_Household_Consumption       6492 non-null   float64
 12  Food_Retail         

- Population values should be integers, temperature should be float

- Each column should have a total of 6965 non-null values. 

- Notice that some columns have null values that will need to be looked at. 

#### Change Area column name to Country

In [10]:
# The 'Area' column holds country names, so give it a clearer label
df = df.rename(columns={"Area": "Country"})

In [11]:
# Preview the first 5 rows to confirm the column is now labelled 'Country'
df.head(5)

Unnamed: 0,Country,Year,Savanna_fires,Forest_fires,Crop_Residues,Rice_Cultivation,Drained_organic_soils_(CO2),Pesticides_Manufacturing,Food_Transport,Forestland,...,Manure_Management,Fires_in_organic_soils,Fires_in_humid_tropical_forests,On-farm_energy_use,Rural_population,Urban_population,Total_Population_-_Male,Total_Population_-_Female,total_emission,Average_Temperature_°C
0,Afghanistan,1990,14.7237,0.0557,205.6077,686.0,0.0,11.807483,63.1152,-2388.803,...,319.1763,0.0,0.0,,9655167.0,2593947.0,5348387.0,5346409.0,2198.963539,0.536167
1,Afghanistan,1991,14.7237,0.0557,209.4971,678.16,0.0,11.712073,61.2125,-2388.803,...,342.3079,0.0,0.0,,10230490.0,2763167.0,5372959.0,5372208.0,2323.876629,0.020667
2,Afghanistan,1992,14.7237,0.0557,196.5341,686.0,0.0,11.712073,53.317,-2388.803,...,349.1224,0.0,0.0,,10995568.0,2985663.0,6028494.0,6028939.0,2356.304229,-0.259583
3,Afghanistan,1993,14.7237,0.0557,230.8175,686.0,0.0,11.712073,54.3617,-2388.803,...,352.2947,0.0,0.0,,11858090.0,3237009.0,7003641.0,7000119.0,2368.470529,0.101917
4,Afghanistan,1994,14.7237,0.0557,242.0494,705.6,0.0,11.712073,53.9874,-2388.803,...,367.6784,0.0,0.0,,12690115.0,3482604.0,7733458.0,7722096.0,2500.768729,0.37225


#### Check the number of NaN values in each column

In [12]:
# Count the missing (NaN) entries in every column
df.isna().sum()

Country                               0
Year                                  0
Savanna_fires                        31
Forest_fires                         93
Crop_Residues                      1389
Rice_Cultivation                      0
Drained_organic_soils_(CO2)           0
Pesticides_Manufacturing              0
Food_Transport                        0
Forestland                          493
Net_Forest_conversion               493
Food_Household_Consumption          473
Food_Retail                           0
On-farm_Electricity_Use               0
Food_Packaging                        0
Agrifood_Systems_Waste_Disposal       0
Food_Processing                       0
Fertilizers_Manufacturing             0
IPPU                                743
Manure_applied_to_Soils             928
Manure_left_on_Pasture                0
Manure_Management                   928
Fires_in_organic_soils                0
Fires_in_humid_tropical_forests     155
On-farm_energy_use                  956


In [47]:
# Percentage of missing values per column: NaN count divided by row count, times 100
column_mv = df.isna().sum().div(len(df)).mul(100)
column_mv

Country                             0.000000
Year                                0.000000
Savanna_fires                       0.445083
Forest_fires                        1.335248
Crop_Residues                      19.942570
Rice_Cultivation                    0.000000
Drained_organic_soils_(CO2)         0.000000
Pesticides_Manufacturing            0.000000
Food_Transport                      0.000000
Forestland                          7.078248
Net_Forest_conversion               7.078248
Food_Household_Consumption          6.791098
Food_Retail                         0.000000
On-farm_Electricity_Use             0.000000
Food_Packaging                      0.000000
Agrifood_Systems_Waste_Disposal     0.000000
Food_Processing                     0.000000
Fertilizers_Manufacturing           0.000000
IPPU                               10.667624
Manure_applied_to_Soils            13.323762
Manure_left_on_Pasture              0.000000
Manure_Management                  13.323762
Fires_in_o

Quite a few columns have missing values, but the percentage of missing values in each column is not high enough to justify dropping the entire column from the dataset.

Iaa

In [51]:
# Show the rows where 'Savanna_fires' is missing
df.loc[df["Savanna_fires"].isna()]


Unnamed: 0,Country,Year,Savanna_fires,Forest_fires,Crop_Residues,Rice_Cultivation,Drained_organic_soils_(CO2),Pesticides_Manufacturing,Food_Transport,Forestland,...,Manure_Management,Fires_in_organic_soils,Fires_in_humid_tropical_forests,On-farm_energy_use,Rural_population,Urban_population,Total_Population_-_Male,Total_Population_-_Female,total_emission,Average_Temperature_°C
2835,Holy See,1990,,,,9616.267306,0.0,11.481085,26.262663,0.0,...,,0.058149,,,0.0,768.0,340.0,370.0,18688.406793,0.905583
2836,Holy See,1991,,,,9654.499823,0.0,11.481085,26.262663,0.0,...,,0.058149,,,0.0,774.0,330.0,360.0,18726.64161,-0.098083
2837,Holy See,1992,,,,9616.267306,0.0,11.481085,26.262663,0.0,...,,0.058149,,,0.0,779.0,330.0,360.0,18684.924938,0.773167
2838,Holy See,1993,,,,9616.267306,0.0,11.481085,26.262663,0.0,...,,0.058149,,,0.0,778.0,330.0,360.0,18684.874838,0.564417
2839,Holy See,1994,,,,9649.123125,0.0,11.481085,26.262663,0.0,...,,0.058149,,,0.0,778.0,320.0,350.0,18717.728958,1.4445
2840,Holy See,1995,,,,9687.355642,0.0,11.481085,26.262663,0.0,...,,0.058149,,,0.0,780.0,320.0,350.0,18755.973575,0.267167
2841,Holy See,1996,,,,9687.355642,0.0,11.481085,26.262663,0.0,...,,0.058149,,,0.0,778.0,320.0,350.0,18755.981975,0.245167
2842,Holy See,1997,,,,9649.123125,0.0,11.481085,26.262663,0.0,...,,0.058149,,,0.0,782.0,320.0,340.0,18717.754458,0.889583
2843,Holy See,1998,,,,9649.123125,0.0,11.481085,26.262663,0.0,...,,0.058149,,,0.0,781.0,320.0,340.0,18717.784658,0.77825
2844,Holy See,1999,,,,9649.123125,0.0,11.481085,26.262663,0.0,...,,0.058149,,,0.0,781.0,310.0,340.0,18717.817658,0.9265
