In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


#### Loading the Dataset

In [4]:
# Read the population table from the CSV file, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer="Population_Data.csv")
data.head(n=5)

Unnamed: 0,Region,Office Location Id,Indians,Foreigners,Indian_Male,Indian_Female,Foreigners_Male,Foreigners_Female,Total Population
0,Region 31,1,643596,2883782,440445,203151,2763718,72515,3527378
1,Region 17,9,319933,1501899,213477,106456,1449303,27671,1821832
2,Region 12,4,194379,650744,161803,32576,631660,10652,845123
3,Region 22,15,107360,470708,85343,22017,450267,6389,578068
4,Region 23,13,55351,329980,31796,23555,325105,3684,385331


In [5]:
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Region              38 non-null     object
 1   Office Location Id  38 non-null     int64 
 2   Indians             38 non-null     object
 3   Foreigners          38 non-null     object
 4   Indian_Male         38 non-null     object
 5   Indian_Female       38 non-null     object
 6   Foreigners_Male     38 non-null     object
 7   Foreigners_Female   38 non-null     object
 8   Total Population    38 non-null     object
dtypes: int64(1), object(8)
memory usage: 2.8+ KB


The numbers in the file contain commas as thousands separators, so pandas reads these columns as `object` (string) dtype instead of a numeric dtype.

### Converting features into Numeric

In [6]:
# Names of the count columns that have to be converted to a numeric dtype.
numeric = [
    'Indians',
    'Foreigners',
    'Indian_Male',
    'Indian_Female',
    'Foreigners_Male',
    'Foreigners_Female',
    'Total Population',
]

In [7]:
# Helper that removes comma thousands separators from a single cell value.

def cleaner(z):
    """Strip comma thousands separators from a string value.

    Parameters
    ----------
    z : str or any
        Cell value, e.g. ``"3,527,378"``.

    Returns
    -------
    str or any
        ``z`` with every comma removed when it is a string. Any other value
        (an already-converted number, NaN, None) is returned unchanged.
    """
    # Only strings can carry separators. Passing everything else through keeps
    # the cleaning step safe to re-run after the columns have been converted
    # to numbers, and tolerant of missing cells.
    if not isinstance(z, str):
        return z
    return z.replace(',', '')

In [8]:
# Strip the commas from every count column, one column at a time.
for column in data[numeric].columns:
    data[column] = data[column].map(cleaner)

In [9]:
# Preview the first rows after the comma removal.
data.head()

Unnamed: 0,Region,Office Location Id,Indians,Foreigners,Indian_Male,Indian_Female,Foreigners_Male,Foreigners_Female,Total Population
0,Region 31,1,643596,2883782,440445,203151,2763718,72515,3527378
1,Region 17,9,319933,1501899,213477,106456,1449303,27671,1821832
2,Region 12,4,194379,650744,161803,32576,631660,10652,845123
3,Region 22,15,107360,470708,85343,22017,450267,6389,578068
4,Region 23,13,55351,329980,31796,23555,325105,3684,385331


In [10]:
# Re-check the column dtypes after the comma removal.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Region              38 non-null     object
 1   Office Location Id  38 non-null     int64 
 2   Indians             38 non-null     object
 3   Foreigners          38 non-null     object
 4   Indian_Male         38 non-null     object
 5   Indian_Female       38 non-null     object
 6   Foreigners_Male     38 non-null     object
 7   Foreigners_Female   38 non-null     object
 8   Total Population    38 non-null     object
dtypes: int64(1), object(8)
memory usage: 2.8+ KB


Removing the commas leaves the values as strings, so the columns are still `object` dtype. They have to be converted to a numeric dtype explicitly.

### Converting data type to Numeric type explicitly

In [11]:
# Convert every cleaned count column to a numeric dtype. All columns are
# converted first and only then written back, so a failed conversion leaves
# the frame untouched. Finish by confirming the resulting dtypes.
data[numeric] = pd.DataFrame(
    {column: pd.to_numeric(data[column]) for column in numeric}
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Region              38 non-null     object
 1   Office Location Id  38 non-null     int64 
 2   Indians             38 non-null     int64 
 3   Foreigners          38 non-null     int64 
 4   Indian_Male         38 non-null     int64 
 5   Indian_Female       38 non-null     int64 
 6   Foreigners_Male     38 non-null     int64 
 7   Foreigners_Female   38 non-null     int64 
 8   Total Population    38 non-null     int64 
dtypes: int64(8), object(1)
memory usage: 2.8+ KB


### Verifying The Integrity of Data

In [13]:
# The Indians and Foreigners counts of every row should add up to that row's
# Total Population. The comparison is made row by row: a single aggregate
# difference can read 0 even when errors in different rows cancel each other.
# The result is the total absolute discrepancy, so 0 means every row is consistent.

(data['Indians'] + data['Foreigners'] - data['Total Population']).abs().sum()

0

In [15]:
# Same kind of check for the male/female breakdown: the summed gender counts
# minus the summed Total Population (a negative value means people are missing
# from the gender columns).

(
    data.loc[:, ['Indian_Male', 'Indian_Female', 'Foreigners_Male', 'Foreigners_Female']]
    .sum()
    .sum()
    - data['Total Population'].sum()
)

-112859

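The male and female counts together fall 112,859 short of the total population, so some people are not recorded as either male or female. We therefore create another feature, `others`, that holds this remainder for each row.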

In [16]:
# Whatever part of Total Population is not covered by the four male/female
# columns is stored, row by row, in a new 'others' column.

MF_sum = (
    data['Indian_Male']
    .add(data['Indian_Female'])
    .add(data['Foreigners_Female'])
    .add(data['Foreigners_Male'])
)

data['others'] = data['Total Population'].sub(MF_sum)

data.head()

Unnamed: 0,Region,Office Location Id,Indians,Foreigners,Indian_Male,Indian_Female,Foreigners_Male,Foreigners_Female,Total Population,others
0,Region 31,1,643596,2883782,440445,203151,2763718,72515,3527378,47549
1,Region 17,9,319933,1501899,213477,106456,1449303,27671,1821832,24925
2,Region 12,4,194379,650744,161803,32576,631660,10652,845123,8432
3,Region 22,15,107360,470708,85343,22017,450267,6389,578068,14052
4,Region 23,13,55351,329980,31796,23555,325105,3684,385331,1191


In [17]:
# Count the distinct values in each of the two identifier columns.
tuple(data[column].nunique() for column in ('Region', 'Office Location Id'))

(38, 38)

In [19]:
# Region, Office Location Id and Total Population are treated as uninformative
# for the analysis, so they are left out of the working copy `data1`.

data1 = data.drop(['Region', 'Office Location Id', 'Total Population'], axis=1)
data1.head()

Unnamed: 0,Indians,Foreigners,Indian_Male,Indian_Female,Foreigners_Male,Foreigners_Female,others
0,643596,2883782,440445,203151,2763718,72515,47549
1,319933,1501899,213477,106456,1449303,27671,24925
2,194379,650744,161803,32576,631660,10652,8432
3,107360,470708,85343,22017,450267,6389,14052
4,55351,329980,31796,23555,325105,3684,1191
