# Food project

## Import data, packages and useful functions from book_of_functions.py 

I wrote two functions to use as tools in my EDA workflow. They automate the extraction of archives into a chosen folder and give an overall check for NaN values in .csv files. 

They are not essential for the purpose of this project, but it seemed a good occasion to use them here. 

Here is a short explanation:

- **zip_extractor** asks for a folder name. If the folder doesn't exist, it is created. It then looks for *.zip files* under the current working directory, extracts them, and puts the contents in the provided folder or, if none is specified, in the current working directory;

- **csv_nan_reader** looks for *.csv files* in a target folder (provided by input), skipping 'checkpoint' files (created automatically by Jupyter), and reads them into pandas DataFrames. If a file does not have a .csv extension, it is reported. If the folder is empty or doesn't exist, the function reports that too. For each file the following are printed: 
    - filename and parent folder;
    - DataFrame shape;
    - number of total cells;
    - number of cells with missing values;
    - % of missing data;
    - rows and columns containing missing values;
    - rows and columns in the original dataset;
    - rows and columns remaining after the drop;
    - the effect of dropping rows and columns in terms of remaining data, and which method is best.

In [32]:
import pandas as pd
import numpy as np 
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Show up to 70 columns when a DataFrame is displayed.
pd.options.display.max_columns = 70

In [23]:
# %load book_of_functions.py
import pandas as pd
import numpy as np
import os
from zipfile import ZipFile


def zip_extractor():
    """Extract every .zip archive found under the current working directory.

    The user is prompted for a destination folder. The answer is passed
    through ``str.capitalize`` (first letter upper-case, the rest lower-case)
    and the folder is created when it does not exist yet. Pressing enter
    without typing a name extracts into the current working directory.

    The search is recursive: archives located in sub-folders of the current
    working directory are extracted as well.
    """

    destination = input('Where do you want to put the extracted files?\nPress enter if you want to extract files in the current path: ').capitalize()

    if destination:
        # exist_ok avoids the separate "is it already there?" check and also
        # copes with nested destinations such as "a/b".
        os.makedirs(destination, exist_ok=True)

    # Collect the archives before extracting anything, so that files written
    # by the extraction itself are never picked up by the directory walk.
    archives = []
    for dirname, _, filenames in os.walk(os.getcwd()):
        for filename in filenames:
            # Match on the real extension; a substring test would also catch
            # names such as "backup.zip.part".
            if filename.lower().endswith('.zip'):
                # os.walk yields bare file names: join them with their folder,
                # otherwise archives in sub-folders cannot be opened.
                archives.append(os.path.join(dirname, filename))

    for archive_path in archives:
        with ZipFile(archive_path, "r") as archive:
            archive.extractall(destination or os.curdir)

    print("Extraction: Done")
    
    
def _removal_percentage(kept, total):
    """Return the share (in %) of `total` that is lost when only `kept` remain.

    Returns 0.0 when `total` is zero, so empty DataFrames do not divide by zero.
    """
    if not total:
        return 0.0
    return round((1 - kept / total) * 100, 2)


def _report_missing(df, filename, dirname, datano):
    """Print the missing-value report of one DataFrame read from `filename`."""
    print(f'\n\nDataframe No: {datano}')
    print('_'*20+'Start'+'_'*20)
    print(f'This is the Dataset "{filename}" from folder "{dirname}"')
    print(filename,'shape',df.shape)

    # How many total missing values do we have?
    # DataFrame.size is rows * columns (np.product no longer exists in NumPy 2).
    total_cells = df.size
    print('Total cells:', total_cells)

    missing_mask = df.isnull()
    total_missing = int(missing_mask.sum().sum())
    print('Total cells with missings:', total_missing)

    missing_perc = round(total_missing / total_cells * 100, 2) if total_cells else 0.0
    print('Missing data:', missing_perc, '%')

    # Count rows containing missing values and check what happens if you drop rows
    rows_with_missing = df.index[missing_mask.any(axis=1)].to_list()
    print('\nTotal lenght of missing rows:', len(rows_with_missing))

    if len(rows_with_missing) > 10:
        # Long lists are abbreviated: first ten labels plus the last one.
        print(f'Rows containing missing values:\n{rows_with_missing[:10]}...[{rows_with_missing[-1]}]')
    else:
        print('Rows containing missing values:\n',rows_with_missing)

    drop_rows = df.dropna()
    rows_removal_perc = _removal_percentage(drop_rows.shape[0], df.shape[0])
    print(f"\nRows in original dataset: {df.shape[0]}")
    print(f"Rows remained after drop: {drop_rows.shape[0]}")
    print('\nDropping rows with NaN removed',rows_removal_perc,'% of the data!')
    print(f'Shape of Dataframe after rows manipulation: {drop_rows.shape}')

    # Count columns containing missing values and check what happens if you drop columns
    cols_with_missing = df.columns[missing_mask.any(axis=0)].to_list()
    print('\n\nTotal lenght of missing columns:', len(cols_with_missing))

    if len(cols_with_missing) > 5:
        # Long lists are abbreviated: first five names plus the last one.
        print(f'Columns containing missing values:\n{cols_with_missing[:5]}...[{cols_with_missing[-1]}]')
    else:
        print('Columns containing missing values:\n',cols_with_missing)

    drop_cols = df.dropna(axis=1)
    cols_removal_perc = _removal_percentage(drop_cols.shape[1], df.shape[1])
    print(f"\nColumns in original dataset: {df.shape[1]}")
    print(f"Columns remained after drop: {drop_cols.shape[1]}")
    print('\nDropping columns with NaN removed',cols_removal_perc,'% of the data!')
    print(f'Shape of Dataframe after columns manipulation: {drop_cols.shape}\n\n')

    # Compare rows drop vs cols drop and print what is better
    if rows_removal_perc < cols_removal_perc:
        print(f"Dropping rows with NaN is the best approach.({rows_removal_perc}% vs {cols_removal_perc}%)\n\n")
    elif rows_removal_perc > cols_removal_perc:
        print(f"Dropping columns with NaN is the best approach.({cols_removal_perc}% vs {rows_removal_perc}%)\n\n")
    else:
        print("Dropping rows or columns produced the same output.")
    print('-'*20+'End'+'-'*20+'\n\n')


def csv_nan_reader():
    """Print a missing-value report for every .csv file in a folder.

    The user is prompted for the folder name; the answer is passed through
    ``str.capitalize``. The folder is walked recursively, files whose name
    contains 'checkpoint' are skipped, and every .csv file is read with
    pandas. For each file the following are printed:

    - filename and parent folder
    - DataFrame shape
    - number of total cells
    - number of cells with missing values
    - % of missing data
    - rows and columns containing missing values
    - rows and columns in the original dataset
    - rows and columns remaining after the drop
    - the effect of dropping rows and columns in terms of remaining data,
      and which method is best.

    Files with another extension, empty .csv files, an empty folder and a
    folder that does not exist are each reported with a message instead of
    raising. Nothing is returned.
    """

    target_folder = input('Write here folder name: ').capitalize()

    # Guard clauses keep the main loop flat.
    if not os.path.isdir(target_folder):
        print("\nSelected folder doesn't exist in this path!")
        return

    if not os.listdir(target_folder):
        print('\nThis folder is empty!')
        return

    datano = 0
    for dirname, _, filenames in os.walk(target_folder):
        for filename in filenames:
            if 'checkpoint' in filename:
                continue

            name, file_ext = os.path.splitext(filename)
            if file_ext.lower() != '.csv':
                print(f"\nI found a file with different extension from csv. '{name}' is a '{file_ext}' file.")
                continue

            try:
                df = pd.read_csv(os.path.join(dirname, filename))
            except pd.errors.EmptyDataError:
                # A zero-byte file has nothing to parse; report it and go on
                # with the remaining files instead of aborting the whole scan.
                print(f"\nI found an empty csv file. '{filename}' has no columns to parse.")
                continue

            # Count only the files that were actually turned into DataFrames.
            datano += 1
            _report_missing(df, filename, dirname, datano)

    print('\nAll Done.')

In [None]:
# !kaggle datasets download -d dorbicycle/world-foodfeed-production
# !kaggle datasets download -d selfvivek/environment-impact-of-food-production
# zip_extractor()

In [111]:
# Load the FAO table once and work on an independent copy of it.
fao_df = pd.read_csv(filepath_or_buffer='Data/FAO.csv')
df = fao_df.copy(deep=True)

In [112]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,Y1969,Y1970,Y1971,Y1972,Y1973,Y1974,Y1975,Y1976,Y1977,Y1978,Y1979,Y1980,Y1981,Y1982,Y1983,Y1984,Y1985,Y1986,Y1987,Y1988,Y1989,Y1990,Y1991,Y1992,Y1993,Y1994,Y1995,Y1996,Y1997,Y1998,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,AFG,2,Afghanistan,2511,Wheat and products,5142,Food,1000 tonnes,33.94,67.71,1928.0,1904.0,1666.0,1950.0,2001.0,1808.0,2053.0,2045.0,2154.0,1819.0,1963.0,2215.0,2310.0,2335.0,2434.0,2512.0,2282.0,2454.0,2443.0,2129.0,2133.0,2068.0,1994.0,1851.0,1791.0,1683.0,2194.0,1801.0,1754.0,1640.0,1539.0,1582.0,1840.0,1855.0,1853.0,2177.0,2343.0,2407.0,2463.0,2600.0,2668.0,2776.0,3095.0,3249.0,3486.0,3704.0,4164.0,4252.0,4538.0,4605.0,4711.0,4810,4895
1,AFG,2,Afghanistan,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71,183.0,183.0,182.0,220.0,220.0,195.0,231.0,235.0,238.0,213.0,205.0,233.0,246.0,246.0,255.0,263.0,235.0,254.0,270.0,259.0,248.0,217.0,217.0,197.0,186.0,200.0,193.0,202.0,191.0,199.0,197.0,249.0,218.0,260.0,319.0,254.0,326.0,347.0,270.0,372.0,411.0,448.0,460.0,419.0,445.0,546.0,455.0,490.0,415.0,442.0,476.0,425,422
2,AFG,2,Afghanistan,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71,76.0,76.0,76.0,76.0,76.0,75.0,71.0,72.0,73.0,74.0,71.0,70.0,72.0,76.0,77.0,80.0,60.0,65.0,64.0,64.0,60.0,55.0,53.0,51.0,48.0,46.0,46.0,47.0,46.0,43.0,43.0,40.0,50.0,46.0,41.0,44.0,50.0,48.0,43.0,26.0,29.0,70.0,48.0,58.0,236.0,262.0,263.0,230.0,379.0,315.0,203.0,367,360
3,AFG,2,Afghanistan,2513,Barley and products,5142,Food,1000 tonnes,33.94,67.71,237.0,237.0,237.0,238.0,238.0,237.0,225.0,227.0,230.0,234.0,223.0,219.0,225.0,240.0,244.0,255.0,185.0,203.0,198.0,202.0,189.0,174.0,167.0,160.0,151.0,145.0,145.0,148.0,145.0,135.0,132.0,120.0,155.0,143.0,125.0,138.0,159.0,154.0,141.0,84.0,83.0,122.0,144.0,185.0,43.0,44.0,48.0,62.0,55.0,60.0,72.0,78,89
4,AFG,2,Afghanistan,2514,Maize and products,5521,Feed,1000 tonnes,33.94,67.71,210.0,210.0,214.0,216.0,216.0,216.0,235.0,232.0,236.0,200.0,201.0,216.0,228.0,231.0,234.0,240.0,228.0,234.0,228.0,226.0,210.0,199.0,192.0,182.0,173.0,170.0,154.0,148.0,137.0,144.0,126.0,90.0,141.0,150.0,159.0,108.0,90.0,99.0,72.0,35.0,48.0,89.0,63.0,120.0,208.0,233.0,249.0,247.0,195.0,178.0,191.0,200,200


In [115]:
# Year columns are labelled 'Y1961', 'Y1962', ...; strip the leading 'Y'.
# Only labels made of 'Y' followed by digits are selected, so an unrelated
# column that merely contains a capital 'Y' is never truncated.
old_keys = [col for col in df.columns if col.startswith('Y') and col[1:].isdigit()]
new_keys = [col[1:] for col in old_keys]

# One rename call with the full mapping instead of one call per column.
df.rename(columns=dict(zip(old_keys, new_keys)), inplace=True)

In [116]:
# Display the column labels of the working DataFrame.
df.columns

Index(['Area Abbreviation', 'Area Code', 'Area', 'Item Code', 'Item',
       'Element Code', 'Element', 'Unit', 'latitude', 'longitude', '1961',
       '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970',
       '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979',
       '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988',
       '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997',
       '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
       '2007', '2008', '2009', '2010', '2011', '2012', '2013'],
      dtype='object')

In [60]:
# Load the food-production table.
production_df = pd.read_csv(filepath_or_buffer='Data/Food_Production.csv')

Unnamed: 0,Food product,Land use change,Animal Feed,Farm,Processing,Transport,Packging,Retail,Total_emissions,Eutrophying emissions per 1000kcal (gPO₄eq per 1000kcal),Eutrophying emissions per kilogram (gPO₄eq per kilogram),Eutrophying emissions per 100g protein (gPO₄eq per 100 grams protein),Freshwater withdrawals per 1000kcal (liters per 1000kcal),Freshwater withdrawals per 100g protein (liters per 100g protein),Freshwater withdrawals per kilogram (liters per kilogram),Greenhouse gas emissions per 1000kcal (kgCO₂eq per 1000kcal),Greenhouse gas emissions per 100g protein (kgCO₂eq per 100g protein),Land use per 1000kcal (m² per 1000kcal),Land use per kilogram (m² per kilogram),Land use per 100g protein (m² per 100g protein),Scarcity-weighted water use per kilogram (liters per kilogram),Scarcity-weighted water use per 100g protein (liters per 100g protein),Scarcity-weighted water use per 1000kcal (liters per 1000 kilocalories)
0,Wheat & Rye (Bread),0.1,0.0,0.8,0.2,0.1,0.1,0.1,1.4,,,,,,,,,,,,,,
1,Maize (Meal),0.3,0.0,0.5,0.1,0.1,0.1,0.0,1.1,,,,,,,,,,,,,,
2,Barley (Beer),0.0,0.0,0.2,0.1,0.0,0.5,0.3,1.1,,,,,,,,,,,,,,
3,Oatmeal,0.0,0.0,1.4,0.0,0.1,0.1,0.0,1.6,4.281357,11.23,8.638462,183.911552,371.076923,482.4,0.945482,1.907692,2.897446,7.6,5.846154,18786.2,14450.92308,7162.104461
4,Rice,0.0,0.0,3.6,0.1,0.1,0.1,0.1,4.0,9.514379,35.07,49.394366,609.983722,3166.760563,2248.4,1.207271,6.267606,0.759631,2.8,3.943662,49576.3,69825.77465,13449.89148
5,Potatoes,0.0,0.0,0.2,0.0,0.1,0.0,0.0,0.3,4.754098,3.48,20.470588,80.737705,347.647059,59.1,0.628415,2.705882,1.202186,0.88,5.176471,2754.2,16201.17647,3762.568306
6,Cassava,0.6,0.0,0.2,0.0,0.1,0.0,0.0,0.9,0.708419,0.69,7.666667,,,0.0,1.355236,14.666667,1.858316,1.81,20.111111,0.0,,
7,Cane Sugar,1.2,0.0,0.5,0.0,0.8,0.1,0.0,2.6,4.820513,16.92,,176.666667,,620.1,0.911681,,0.581197,2.04,,16438.6,,4683.361823
8,Beet Sugar,0.0,0.0,0.5,0.2,0.6,0.1,0.0,1.4,1.541311,5.41,,62.022792,,217.7,0.51567,,0.521368,1.83,,9493.3,,2704.643875
9,Other Pulses,0.0,0.0,1.1,0.0,0.1,0.4,0.0,1.6,5.008798,17.08,7.977581,,203.503036,435.7,0.524927,0.836058,4.565982,15.57,7.272303,22477.4,10498.55208,
