## Understanding and Cleaning data
### Problem Statement:
Investingating Factor Affecting Gross Domestic Product (GDP)

In [1]:
import pandas as pd
import matplotlib.pyplot as pylt
import numpy as np
%matplotlib inline

### Import dataset

In [2]:
data = pd.read_csv("Drivers of GDP per Capita.csv")
data.head()

Unnamed: 0,Country Name,Country Code,year,Access to electricity (% of pop),Agricultural land (% of land area),Agricultural raw materials exports (% ),Agricultural raw materials imports (%),Agriculture value added (% of GDP),Arable land (% of land area),Average precipitation in depth (mm per year),CPIA gender equality rating (1=low to 6=high),"Death rate, crude (per 1,000 people)",Employment in agriculture (% of total employment),Fertilizer consumption (% of fertilizer production),"Literacy rate, adult total (% of people ages 15 and above)",Livestock production index,Mineral rents (% of GDP),Population,Rural population (% of total population),GDP per capita (US$)
0,Afghanistan,AFG,1960,,,,,,,,,32.219,,,,,,8996973.0,91.599,59.773194
1,Afghanistan,AFG,1961,,57.745918,,,,11.717673,,,31.649,,,,43.37,,9169410.0,91.316,59.860874
2,Afghanistan,AFG,1962,,57.837821,53.755852,0.966617,,11.794259,327.0,,31.093,,,,43.99,,9351441.0,91.024,58.458015
3,Afghanistan,AFG,1963,,57.914407,61.100605,1.394438,,11.870845,,,30.551,,,,47.03,,9543205.0,90.724,78.706388
4,Afghanistan,AFG,1964,,58.010906,54.437183,1.498234,,11.947431,,,30.022,,,,48.56,,9744781.0,90.414,82.095231


### Informative Summary

In [3]:
# data shape
data.shape

(16043, 20)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16043 entries, 0 to 16042
Data columns (total 20 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   Country Name                                                16043 non-null  object 
 1   Country Code                                                16043 non-null  object 
 2   year                                                        16043 non-null  int64  
 3   Access to electricity (% of pop)                            6153 non-null   float64
 4   Agricultural land (% of land area)                          12999 non-null  float64
 5   Agricultural raw materials exports (% )                     8911 non-null   float64
 6   Agricultural raw materials imports (%)                      8939 non-null   float64
 7   Agriculture value added (% of GDP)                          9670 non-null   float64
 

## Data Cleaning

### Filter your dataset such that you return a dataframe that has zero null values for the response variable `GDP per capita (US$)`

In [5]:
filtered_df = data.dropna(subset= ["GDP per capita (US$)"])

# Check if all null value have been droped
filtered_df["GDP per capita (US$)"].isnull().sum()

0

### Remove rows where 50 % or more predictor variables have null values.


In [6]:
response_var = ["Country Name", "Country Code", "year", "GDP per capita (US$)"]
# predictor variables
predictor_var = [col for col in filtered_df.columns if col not in response_var]
# calculate percentage for null value
percentage_null = filtered_df[predictor_var].isnull().sum(axis=1) / len(predictor_var)
# filter row with half or more null value
filtered_df = filtered_df[percentage_null < 0.5]
filtered_df.shape

(9435, 20)

### Remove four predictors that has the least non-null values

In [7]:
# least predictor variable with the least non-null value
predictor_least_non_null = filtered_df[predictor_var].count().nsmallest(4).index

# drop the variables
filtered_df = filtered_df.drop(columns=predictor_least_non_null)

In [8]:
filtered_df.shape

(9435, 16)

## import generalized row data

In [9]:
gen_row = pd.read_csv("Generalized Rows.csv")
gen_row.head()

Unnamed: 0,Country Code,Country Name
0,CSS,Caribbean small states
1,EAP,East Asia & Pacific (excluding high income)
2,EMU,Euro area
3,EUU,European Union
4,FCS,Fragile and conflict affected situations


### Within the dataset, there are rows that are generalized to region or continent
Remove rows that correspond to the data in this CSV file: gen_rows

In [10]:
# Identify the rows in the main dataset that match the generalized regions or continents
gen_country = gen_row["Country Name"].unique()
rows_to_remove = filtered_df[filtered_df['Country Name'].isin(gen_country)].index

# Remove those rows from the main dataset
filtered_df = filtered_df.drop(rows_to_remove)
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8628 entries, 2 to 16040
Data columns (total 16 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Country Name                                       8628 non-null   object 
 1   Country Code                                       8628 non-null   object 
 2   year                                               8628 non-null   int64  
 3   Access to electricity (% of pop)                   5055 non-null   float64
 4   Agricultural land (% of land area)                 8227 non-null   float64
 5   Agricultural raw materials exports (% )            7333 non-null   float64
 6   Agricultural raw materials imports (%)             7341 non-null   float64
 7   Agriculture value added (% of GDP)                 7500 non-null   float64
 8   Arable land (% of land area)                       8224 non-null   float64
 9   Death rate, 

### Most data released before 1990 have lot of null value

In [11]:
# Convert 'year' column to datetime type
filtered_df['year'] = pd.to_datetime(filtered_df['year'], format='%Y')

# drop data from 1990 and below
filtered_df = filtered_df[filtered_df["year"] >= '1990']
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5909 entries, 42 to 16040
Data columns (total 16 columns):
 #   Column                                             Non-Null Count  Dtype         
---  ------                                             --------------  -----         
 0   Country Name                                       5909 non-null   object        
 1   Country Code                                       5909 non-null   object        
 2   year                                               5909 non-null   datetime64[ns]
 3   Access to electricity (% of pop)                   5055 non-null   float64       
 4   Agricultural land (% of land area)                 5514 non-null   float64       
 5   Agricultural raw materials exports (% )            4826 non-null   float64       
 6   Agricultural raw materials imports (%)             4818 non-null   float64       
 7   Agriculture value added (% of GDP)                 5526 non-null   float64       
 8   Arable land (% of lan

In [12]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5909 entries, 42 to 16040
Data columns (total 16 columns):
 #   Column                                             Non-Null Count  Dtype         
---  ------                                             --------------  -----         
 0   Country Name                                       5909 non-null   object        
 1   Country Code                                       5909 non-null   object        
 2   year                                               5909 non-null   datetime64[ns]
 3   Access to electricity (% of pop)                   5055 non-null   float64       
 4   Agricultural land (% of land area)                 5514 non-null   float64       
 5   Agricultural raw materials exports (% )            4826 non-null   float64       
 6   Agricultural raw materials imports (%)             4818 non-null   float64       
 7   Agriculture value added (% of GDP)                 5526 non-null   float64       
 8   Arable land (% of lan

## Imputing Missing Values

In [13]:
df = filtered_df4.copy()
df.info()

NameError: name 'filtered_df4' is not defined

Create helper function `calculate_null_percentage`  for calculating the percentage of null values for that predictor variable with respect to the total total rows for that particular country.

In [None]:
def calculate_null_percentage(df, country_name, predictor_column):
    """
    Calculate the percentage of null values for a specific predictor variable
    with respect to the total rows for a particular country.

    Parameters:
    - df: DataFrame containing the data
    - country_name: Name of the country
    - predictor_column: Name of the predictor column

    Returns:
    - Percentage of null values for the predictor variable with respect to the total rows for the country
    """
    # Filter the DataFrame for the specified country
    country_df = df[df['Country Name'] == country_name]
    
    # Count the total number of rows for the country
    total_rows = len(country_df)
    
    # Count the number of null values for the predictor column
    null_count = country_df[predictor_column].isnull().sum()
    
    # Calculate the percentage of null values
    null_percentage = (null_count / total_rows) * 100
    
    return null_percentage

# Example:
percentage_null = calculate_null_percentage(df, 'Zimbabwe', 'Access to electricity (% of pop)')
print(percentage_null)

0.0


### Automation of imputing null value

In [None]:
def impute_missing_values(df, columns_to_impute):
    """
    Impute missing values in the specified columns with the mean value.

    Parameters:
    - df: DataFrame containing the data
    - columns_to_impute: List of column names to be imputed

    Returns:
    - DataFrame with null values filled
    """
    for column in columns_to_impute:
        # Calculate the mean value of the column
        mean_value = df[column].mean()
        
        # Fill null values with the mean value
        df[column].fillna(mean_value, inplace=True)
    
    return df


columns_to_impute = ['Access to electricity (% of pop)', 'Agricultural land (% of land area)', 'Agricultural raw materials exports (% )', 'Agricultural raw materials imports (%)', 'Agriculture value added (% of GDP)', 'Arable land (% of land area)', 'Death rate, crude (per 1,000 people)',  'Employment in agriculture (% of total employment)', 'Livestock production index']
imputed_df = impute_missing_values(df, columns_to_impute)
imputed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5909 entries, 42 to 16040
Data columns (total 16 columns):
 #   Column                                             Non-Null Count  Dtype         
---  ------                                             --------------  -----         
 0   Country Name                                       5909 non-null   object        
 1   Country Code                                       5909 non-null   object        
 2   year                                               5909 non-null   datetime64[ns]
 3   Access to electricity (% of pop)                   5909 non-null   float64       
 4   Agricultural land (% of land area)                 5909 non-null   float64       
 5   Agricultural raw materials exports (% )            5909 non-null   float64       
 6   Agricultural raw materials imports (%)             5909 non-null   float64       
 7   Agriculture value added (% of GDP)                 5909 non-null   float64       
 8   Arable land (% of lan

We need to perform a mean imputation for missing values per country
for columns that have less than 10% of missing values. Write a
function that takes the dataframe and predictor variables as input
parameters, uses the function in `calculate_null_percentage` above to calculate the percentage
of missing values for each predictor variable for each country. The
function should return a dictionary with countries as keys and a list of
predictors to be imputed for each country. 

In [None]:
def values_to_change(df,predictors):
    """
    Returns dictionary containing country: column name to impute missing value
    
    Parameters:
        - df: dataframe
        - predictors: list of column to impute
        
    Returns:
        - to_impute: (dict) cointain Country Name as Key and Columns to inpute as value
        
    """
    to_impute = {}
    for country in df['Country Name'].unique():
        col=[]
        for p in predictors:
            null_perc = calculate_null_percentage(df,country,p)
            if null_perc<10:
                col.append(p)
        to_impute[country] = col
    return to_impute
        
c=values_to_change(df, columns_to_impute)

In [None]:
c['Algeria']

['Access to electricity (% of pop)',
 'Agricultural land (% of land area)',
 'Agricultural raw materials exports (% )',
 'Agricultural raw materials imports (%)',
 'Agriculture value added (% of GDP)',
 'Arable land (% of land area)',
 'Death rate, crude (per 1,000 people)',
 'Employment in agriculture (% of total employment)',
 'Livestock production index']

In [None]:
imputed_df = pd.DataFrame()
for country in c.keys():
    country_data = df[df['Country Name'] == country].reindex()
    returned_df = impute_missing_values(country_data, c[country])
    imputed_df = pd.concat([imputed_df, returned_df], ignore_index=True)

In [None]:
imputed_df.dropna().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5909 entries, 0 to 5908
Data columns (total 16 columns):
 #   Column                                             Non-Null Count  Dtype         
---  ------                                             --------------  -----         
 0   Country Name                                       5909 non-null   object        
 1   Country Code                                       5909 non-null   object        
 2   year                                               5909 non-null   datetime64[ns]
 3   Access to electricity (% of pop)                   5909 non-null   float64       
 4   Agricultural land (% of land area)                 5909 non-null   float64       
 5   Agricultural raw materials exports (% )            5909 non-null   float64       
 6   Agricultural raw materials imports (%)             5909 non-null   float64       
 7   Agriculture value added (% of GDP)                 5909 non-null   float64       
 8   Arable land (% of 

In [None]:
# Read the regions dataset
regions_df = pd.read_csv('Region_dataset.csv')

# Merge the datasets on the 'Country Code' column
merged_df = pd.merge(imputed_df, regions_df, on='Country Code', how='left')

# Reorder the columns to make 'Region' the second column
merged_df = merged_df[['Country Name', 'Country Code', 'Region'] + [col for col in merged_df.columns if col not in ['Country Name', 'Country Code', 'Region']]]

In [None]:
cleaned_df = merged_df
cleaned_df.head()

Unnamed: 0,Country Name,Country Code,Region,year,Access to electricity (% of pop),Agricultural land (% of land area),Agricultural raw materials exports (% ),Agricultural raw materials imports (%),Agriculture value added (% of GDP),Arable land (% of land area),"Death rate, crude (per 1,000 people)",Employment in agriculture (% of total employment),Livestock production index,Mineral rents (% of GDP),Population,Rural population (% of total population),GDP per capita (US$)
0,Afghanistan,AFG,Asia,2002-01-01,78.98277,57.827099,4.087927,1.747928,38.627892,11.760561,11.048,64.318001,103.58,0.0,22600770.0,77.739,179.426611
1,Afghanistan,AFG,Asia,2003-01-01,78.98277,58.06758,4.087927,1.747928,37.418855,11.904543,10.704,63.647999,97.99,0.0,23680871.0,77.647,190.683814
2,Afghanistan,AFG,Asia,2004-01-01,78.98277,58.069111,4.087927,1.747928,29.721067,11.971939,10.356,63.298,105.03,0.0,24726684.0,77.5,211.382117
3,Afghanistan,AFG,Asia,2005-01-01,23.0,58.06758,4.087927,1.747928,31.114855,11.95509,10.003,62.244999,103.31,0.0,25654277.0,77.297,242.031285
4,Afghanistan,AFG,Asia,2006-01-01,28.560974,58.06758,4.087927,1.747928,28.635969,11.938241,9.645,61.453999,91.66,0.0,26433049.0,77.093,263.733692
