<h3>Loading and Displaying the Data</h3>

In [18]:
import pandas as pd

# Columns retained for the analysis. The density header in the CSV
# contains a literal newline character, hence the '\n' in its name.
selected_columns = [
    'Country',
    'Density\n(P/Km2)',
    'Agricultural Land( %)',
    'Population',
    'Co2-Emissions',
    'Urban_population',
]

# Read the raw CSV into a DataFrame.
data_path = 'C:\\Users\\serge\\FinalYear\\DAV\\DAV_CA1_SERG\\world-data-2023.csv'
data = pd.read_csv(data_path)

# List every column the file provides.
print("Available columns in the dataset:", list(data.columns))

# Narrow the frame to the columns of interest and preview the first rows.
data = data.loc[:, selected_columns]
print(data.head())


Available columns in the dataset: ['Country', 'Density\n(P/Km2)', 'Abbreviation', 'Agricultural Land( %)', 'Land Area(Km2)', 'Armed Forces size', 'Birth Rate', 'Calling Code', 'Capital/Major City', 'Co2-Emissions', 'CPI', 'CPI Change (%)', 'Currency-Code', 'Fertility Rate', 'Forested Area (%)', 'Gasoline Price', 'GDP', 'Gross primary education enrollment (%)', 'Gross tertiary education enrollment (%)', 'Infant mortality', 'Largest city', 'Life expectancy', 'Maternal mortality ratio', 'Minimum wage', 'Official language', 'Out of pocket health expenditure', 'Physicians per thousand', 'Population', 'Population: Labor force participation (%)', 'Tax revenue (%)', 'Total tax rate', 'Unemployment rate', 'Urban_population', 'Latitude', 'Longitude']
       Country Density\n(P/Km2) Agricultural Land( %)  Population Co2-Emissions Urban_population
0  Afghanistan               60                58.10%  38,041,754         8,672        9,797,273
1      Albania              105                43.10%  

<h3>Cleaning the Data</h3>

In [19]:
# Clean the columns selected in the loading cell.
#
# The cell is written so that it can be re-run without errors: it no longer
# re-selects columns by their pre-rename names (which raised KeyError on a
# second run) and it only applies string parsing to columns that are still
# text (calling .str on an already-converted float column raised
# AttributeError).

# Replace the newline embedded in the density header. rename() ignores keys
# that are not present, so this is a no-op once the column has been renamed.
data = data.rename(columns={'Density\n(P/Km2)': 'Density (P/Km²)'})

# Percentage strings such as '58.10%' become fractions (0.581).
if not pd.api.types.is_numeric_dtype(data['Agricultural Land( %)']):
    data['Agricultural Land( %)'] = (
        data['Agricultural Land( %)'].str.rstrip('%').astype(float) / 100
    )

# Strip thousands separators so that '8,672' parses as 8672.0.
co2_emissions = data['Co2-Emissions']
if not pd.api.types.is_numeric_dtype(co2_emissions):
    co2_emissions = co2_emissions.str.replace(',', '', regex=False)
data['Co2-Emissions'] = co2_emissions.astype(float)

# NOTE(review): 'Density (P/Km²)', 'Population' and 'Urban_population' are
# not converted here and still hold comma-formatted text — confirm whether
# later cells parse them before using them numerically.

# Setting display options for better readability
pd.set_option('display.max_columns', None)  # Ensure all columns are shown
pd.set_option('display.width', 120)  # Set the display width for readability
pd.set_option('display.precision', 2)  # Set the precision for floating point numbers

# Report rows that have at least one missing value (mask computed once).
rows_with_missing = data.isnull().any(axis=1)
if rows_with_missing.any():
    print("There are empty rows in the dataset. Here are the details:")
    print(data[rows_with_missing])
else:
    print("No empty rows found.")

# Print the first five rows of the data
print(data.head())


There are empty rows in the dataset. Here are the details:
                            Country Density (P/Km²)  Agricultural Land( %)  Population  Co2-Emissions Urban_population
56                         Eswatini              67                    NaN   1,093,238            NaN              NaN
73                     Vatican City           2,003                    NaN         836            NaN              NaN
113                          Monaco          26,337                    NaN      38,964            NaN           38,964
120                           Nauru             541                    NaN      10,084            NaN              NaN
128                 North Macedonia              83                    NaN   1,836,713            NaN              NaN
133  Palestinian National Authority             847                    NaN         NaN            NaN              NaN
149                      San Marino             566                   0.17      33,860            NaN       

<h4>Dealing with rows that contain missing values</h4>

In [22]:
# Replace NaN in 'Co2-Emissions' with the column mean.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on data['Co2-Emissions']: that chained in-place form
# operates on a temporary Series, triggers a FutureWarning in recent pandas
# 2.x releases, and leaves `data` unchanged when Copy-on-Write is enabled.
# The mean is computed before the dropna below, so it includes rows that are
# removed afterwards.
data['Co2-Emissions'] = data['Co2-Emissions'].fillna(data['Co2-Emissions'].mean())

# Drop rows where there is no agricultural land or urban population data.
# dropna is called on the DataFrame itself, so the in-place form is well
# defined here and is kept as written.
data.dropna(subset=['Agricultural Land( %)', 'Urban_population'], inplace=True)

# Report rows that still have at least one missing value (mask computed once).
rows_with_missing = data.isnull().any(axis=1)
if rows_with_missing.any():
    print("There are empty rows in the dataset. Here are the details:")
    print(data[rows_with_missing])
else:
    print("No empty rows found.")


No empty rows found.
