In [292]:
import pandas as pd

In [293]:
df = pd.read_csv('world population.csv')
df.head(5)

Unnamed: 0,Rank,Country(or dependency),Population(2020),Yearly_Change,Net_Change,Density(P/Km²),Land_Area(Km²),Migrants(net),Fertility_Rate,Median_Age,Urban _Pop %,World_Share,Continent
0,1,China,1439323776,0.39 %,5540090,153,9388211,-348399,1.7,38,61 %,18.47 %,Asia
1,2,India,1380004385,0.99 %,13586631,464,2973190,-532687,2.2,28,35 %,17.70 %,Asia
2,3,United States,331002651,0.59 %,1937734,36,9147420,954806,1.8,38,83 %,4.25 %,North America
3,4,Indonesia,273523615,1.07 %,2898047,151,1811570,-98955,2.3,30,56 %,3.51 %,Asia
4,5,Pakistan,220892340,2.00 %,4327022,287,770880,-233379,3.6,23,35 %,2.83 %,Asia


In [294]:
df.isnull().sum().any()

True

There is some null values and punctuations. so we have to clean it

### preprocessing

* drop unnecessary column
* fill null values
* remove punctuation
* proper column data-type 

In [295]:
# actually we don't need this col so drop it 

df = df.drop('Rank', axis=1)

In [296]:
# fill all null values with 0 

df = df.fillna(0)

In [297]:
# remove punctuations

df.replace(',','', regex=True, inplace=True)

In [298]:
df.head(1)

Unnamed: 0,Country(or dependency),Population(2020),Yearly_Change,Net_Change,Density(P/Km²),Land_Area(Km²),Migrants(net),Fertility_Rate,Median_Age,Urban _Pop %,World_Share,Continent
0,China,1439323776,0.39 %,5540090,153,9388211,-348399,1.7,38,61 %,18.47 %,Asia


### Our dataset contains percentages and dollar signs. If we wanted, we could have removed these in the previous simple rules [ replace() ] . But we will apply different rules here.


In [299]:
# in our datasets their are some percentage, we have to remove them. and we are gonna using rstrip().
# well, basically rstrip() returns a copy of the string with trailing characters removed (based on the string argument passed)

df['Yearly_Change'] = df['Yearly_Change'].str.rstrip('%').astype('float64')
df['Urban _Pop %'] = df['Urban _Pop %'].str.rstrip('%')
df['World_Share'] = df['World_Share'].str.rstrip('%')

In [300]:
# but still our 'Urban_Pop % and Median_Age' is not free to go. this cols contain some non-numeric value. 
# we have clean it and make it numeric

df['Urban _Pop %'] = pd.to_numeric(df['Urban _Pop %'], errors='coerce')
df['Median_Age'] = pd.to_numeric(df['Median_Age'], errors='coerce')
df['Fertility_Rate'] = pd.to_numeric(df['Fertility_Rate'], errors='coerce')

In [301]:
# now fill this nan values with zero

import numpy as np

df['Urban _Pop %'] = df['Urban _Pop %'].replace(np.nan, 0)
df['Median_Age'] = df['Median_Age'].replace(np.nan, 0)
df['Fertility_Rate'] = df['Fertility_Rate'].replace(np.nan, 0)

In [302]:
# remove dollar '$' sign

df['Density(P/Km²)'] = [x.strip('$') for x in df['Density(P/Km²)']]

In [303]:
# now change data type

df['Population(2020)'] = df['Population(2020)'].astype('int64')
df['Net_Change'] = df['Net_Change'].astype('int64')
df['Yearly_Change'] = df['Yearly_Change'].astype('float64')
df['Density(P/Km²)'] = df['Yearly_Change'].astype('float64')
df['Land_Area(Km²)'] = df['Land_Area(Km²)'].astype('int64')
df['Migrants(net)'] = df['Migrants(net)'].astype('int64')
df['Fertility_Rate'] = df['Fertility_Rate'].astype('float64')
df['Median_Age'] = df['Median_Age'].astype('int64')
df['Urban _Pop %'] = df['Urban _Pop %'].astype('int64')
df['World_Share'] = df['World_Share'].astype('float64')

In [None]:
## Our Desired Dataset is now crystal clear

In [305]:
df.head(10)

Unnamed: 0,Country(or dependency),Population(2020),Yearly_Change,Net_Change,Density(P/Km²),Land_Area(Km²),Migrants(net),Fertility_Rate,Median_Age,Urban _Pop %,World_Share,Continent
0,China,1439323776,0.39,5540090,0.39,9388211,-348399,1.7,38,61,18.47,Asia
1,India,1380004385,0.99,13586631,0.99,2973190,-532687,2.2,28,35,17.7,Asia
2,United States,331002651,0.59,1937734,0.59,9147420,954806,1.8,38,83,4.25,North America
3,Indonesia,273523615,1.07,2898047,1.07,1811570,-98955,2.3,30,56,3.51,Asia
4,Pakistan,220892340,2.0,4327022,2.0,770880,-233379,3.6,23,35,2.83,Asia
5,Brazil,212559417,0.72,1509890,0.72,8358140,21200,1.7,33,88,2.73,South America
6,Nigeria,206139589,2.58,5175990,2.58,910770,-60000,5.4,18,52,2.64,Africa
7,Bangladesh,164689383,1.01,1643222,1.01,130170,-369501,2.1,28,39,2.11,Asia
8,Russia,145934462,0.04,62206,0.04,16376870,182456,1.8,40,74,1.87,Europe
9,Mexico,128932753,1.06,1357224,1.06,1943950,-60000,2.1,29,84,1.65,North America


#### Here are several ways to remove the dollar sign

In [291]:
#1. df['Density(P/Km²)'] = df['Density(P/Km²)'].apply(lambda x: float(x.split()[0].replace('$', '')))

#2. df['Density(P/Km²)'] = df['Density(P/Km²)'].apply(lambda x: x.strip('$'))

#3. df['Density(P/Km²)'] = df['Density(P/Km²)'].apply(lambda x: x.replace('$',''))

#4. df['Density(P/Km²)'] = [x.strip('$') for x in df['Density(P/Km²)']]

#5. df['Density(P/Km²)'] = [x.strip('$') for x in df['Density(P/Km²)']]

#6. df['Density(P/Km²)'] = [x[1:] for x in df['Density(P/Km²)']]