## Data Preprocessing of a Population (Dataset from Kaggle)

### Importing the libraries and the dataset

In [158]:
import warnings
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Install a global filter that ignores every warning raised after this point.
warnings.filterwarnings("ignore")


In [159]:
# Read the population CSV into a DataFrame and preview its first five rows.
source_csv = 'county_population_by_race.csv'
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,county,total_population_of_one_race,total_population_of_one_race_white_alone,total_population_of_one_race_black_or_african_american_alone,total_population_of_one_race_american_indian_and_alaska_native_alone,total_population_of_one_race_asian_alone,total_population_of_one_race_native_hawaiian_and_other_pacific_islander_alone,total_population_of_one_race_some_other_race_alone,total_population_of_two_or_more_races
0,"Autauga County, Alabama",55648,42160,11445,217,881,35,910,3157
1,"Baldwin County, Alabama",216743,189399,18217,1582,2067,143,5335,15024
2,"Barbour County, Alabama",24523,11317,11933,116,117,1,1039,700
3,"Bibb County, Alabama",21534,16555,4413,60,32,9,465,759
4,"Blount County, Alabama",55478,50663,845,337,178,24,3431,3656


In [160]:
# Print the index range, column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3222 entries, 0 to 3221
Data columns (total 9 columns):
 #   Column                                                                         Non-Null Count  Dtype 
---  ------                                                                         --------------  ----- 
 0   county                                                                         3222 non-null   object
 1   total_population_of_one_race                                                   3222 non-null   object
 2   total_population_of_one_race_white_alone                                       3222 non-null   object
 3   total_population_of_one_race_black_or_african_american_alone                   3222 non-null   object
 4   total_population_of_one_race_american_indian_and_alaska_native_alone           3222 non-null   object
 5   total_population_of_one_race_asian_alone                                       3222 non-null   object
 6   total_population_of_one_race_nati

In [161]:
# Show the dtype pandas assigned to each column on load.
df.dtypes

county                                                                           object
total_population_of_one_race                                                     object
total_population_of_one_race_white_alone                                         object
total_population_of_one_race_black_or_african_american_alone                     object
total_population_of_one_race_american_indian_and_alaska_native_alone             object
total_population_of_one_race_asian_alone                                         object
total_population_of_one_race_native_hawaiian_and_other_pacific_islander_alone    object
total_population_of_one_race_some_other_race_alone                               object
total_population_of_two_or_more_races                                            object
dtype: object

In [162]:
# (number of rows, number of columns) of the frame.
df.shape

(3222, 9)

### Checking the number of duplicate rows in the dataset

In [163]:
# Collect the rows that exactly repeat an earlier row (first occurrences are
# not flagged) and report how many there are. The row count is printed rather
# than the (rows, columns) shape tuple so the value matches the printed label.
dupli_rows = df[df.duplicated()]
print("No. of duplicated rows : ", len(dupli_rows))

No. of duplicated rows :  (0, 9)


### Dropping the duplicate rows

In [147]:
# Keep only the rows that are not repeats of an earlier row, then show the
# resulting (rows, columns) size.
df = df.loc[~df.duplicated(keep="first")]
df.shape

(3222, 9)

### Checking if there are any null values present

In [164]:
# Per-column tally of missing entries (isna is an alias of isnull).
print(df.isna().sum())

county                                                                           0
total_population_of_one_race                                                     0
total_population_of_one_race_white_alone                                         0
total_population_of_one_race_black_or_african_american_alone                     0
total_population_of_one_race_american_indian_and_alaska_native_alone             0
total_population_of_one_race_asian_alone                                         0
total_population_of_one_race_native_hawaiian_and_other_pacific_islander_alone    0
total_population_of_one_race_some_other_race_alone                               0
total_population_of_two_or_more_races                                            0
dtype: int64


### Renaming the columns

In [165]:
# Map each long source column name to the short label used in the rest of the
# notebook, apply the mapping, and preview the first five rows.
short_labels = {
    'total_population_of_one_race': 'One Race',
    'total_population_of_one_race_white_alone': 'White races',
    'total_population_of_one_race_black_or_african_american_alone': 'Black races',
    'total_population_of_one_race_american_indian_and_alaska_native_alone': 'Native american',
    'total_population_of_one_race_asian_alone': 'Asian Races',
    'total_population_of_one_race_native_hawaiian_and_other_pacific_islander_alone': 'Hawaiian races',
    'total_population_of_one_race_some_other_race_alone': 'other races',
    'total_population_of_two_or_more_races': 'two or more mixed races',
}
df = df.rename(columns=short_labels)
df.head()


Unnamed: 0,county,One Race,White races,Black races,Native american,Asian Races,Hawaiian races,other races,two or more mixed races
0,"Autauga County, Alabama",55648,42160,11445,217,881,35,910,3157
1,"Baldwin County, Alabama",216743,189399,18217,1582,2067,143,5335,15024
2,"Barbour County, Alabama",24523,11317,11933,116,117,1,1039,700
3,"Bibb County, Alabama",21534,16555,4413,60,32,9,465,759
4,"Blount County, Alabama",55478,50663,845,337,178,24,3431,3656


In [166]:
# List the distinct values found in the county column.
df['county'].unique()

array(['Autauga County, Alabama', 'Baldwin County, Alabama',
       'Barbour County, Alabama', ..., 'Yabucoa Municipio, Puerto Rico',
       'Yauco Municipio, Puerto Rico', 'United States'], dtype=object)

### Changing the datatypes

In [167]:

# Convert the population counts from text to numbers.
#
# The columns are loaded with object dtype; commas (presumably thousands
# separators) are stripped before casting. The cast is to float, which is the
# dtype the rest of the notebook already works with.
#
# NOTE(review): 'One Race' is not converted here because its conversion was
# commented out in the original cell, so it remains an object column.
# Confirm whether it should be added to the list below.
population_columns = [
    'White races',
    'Black races',
    'Native american',
    'Asian Races',
    'Hawaiian races',
    'other races',
    'two or more mixed races',
]

for column in population_columns:
    # astype(str) makes the cell safe to re-run: an already converted float
    # column is turned back into text instead of failing on a missing
    # `.replace` method.
    df[column] = (
        df[column]
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

In [168]:
# Re-check the column dtypes after the conversion step.
df.dtypes

county                      object
One Race                    object
White races                float64
Black races                float64
Native american            float64
Asian Races                float64
Hawaiian races             float64
other races                float64
two or more mixed races    float64
dtype: object

### Data description

In [169]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,White races,Black races,Native american,Asian Races,Hawaiian races,other races,two or more mixed races
count,3222.0,3222.0,3222.0,3222.0,3222.0,3222.0,3222.0
mean,126975.5,25585.7,2319.1,12345.16,428.468343,17588.38,21518.83
std,3600920.0,725953.2,65838.57,352959.1,12351.096604,495755.0,598217.4
min,27.0,0.0,0.0,0.0,0.0,0.0,6.0
25%,7771.25,90.0,48.0,34.0,2.0,135.0,523.25
50%,19382.0,842.5,147.0,129.5,9.0,540.0,1472.0
75%,52524.5,5305.5,543.25,791.25,37.0,2727.25,5524.0
max,204277300.0,41104200.0,3727135.0,19886050.0,689966.0,27915720.0,33848940.0


## Analysing and Visualising the Data

### Checking the Correlation

In [170]:
# Pairwise correlation matrix of the numeric population columns.
#
# The numeric columns are selected explicitly: the frame still contains text
# columns ('county', 'One Race'), and pandas >= 2.0 no longer drops those
# silently, so a bare df.corr() raises on them.
#
# NOTE(review): the county column also contains a 'United States' entry; if
# that row is a national total it will dominate these coefficients. Confirm
# whether it should be excluded before interpreting the matrix.
data_corr = df.select_dtypes(include='number').corr()
data_corr

Unnamed: 0,White races,Black races,Native american,Asian Races,Hawaiian races,other races,two or more mixed races
White races,1.0,0.998575,0.998397,0.995135,0.98538,0.994807,0.998501
Black races,0.998575,1.0,0.997145,0.994643,0.983296,0.994778,0.997891
Native american,0.998397,0.997145,1.0,0.995878,0.985065,0.996838,0.998343
Asian Races,0.995135,0.994643,0.995878,1.0,0.986854,0.997223,0.99656
Hawaiian races,0.98538,0.983296,0.985065,0.986854,1.0,0.982743,0.985997
other races,0.994807,0.994778,0.996838,0.997223,0.982743,1.0,0.997657
two or more mixed races,0.998501,0.997891,0.998343,0.99656,0.985997,0.997657,1.0


In [172]:
# Remove every converted race column except 'Asian Races' (presumably because
# the correlation matrix shows them to be almost perfectly correlated; verify
# the intent), then preview the first fifteen rows.
redundant_columns = [
    'White races',
    'Black races',
    'Native american',
    'Hawaiian races',
    'other races',
    'two or more mixed races',
]

# Reassign instead of mutating in place, and ignore labels that are already
# gone, so re-running the cell does not raise KeyError.
df = df.drop(columns=redundant_columns, errors='ignore')
df.head(15)

Unnamed: 0,county,One Race,Asian Races
0,"Autauga County, Alabama",55648,881.0
1,"Baldwin County, Alabama",216743,2067.0
2,"Barbour County, Alabama",24523,117.0
3,"Bibb County, Alabama",21534,32.0
4,"Blount County, Alabama",55478,178.0
5,"Bullock County, Alabama",10109,9.0
6,"Butler County, Alabama",18464,143.0
7,"Calhoun County, Alabama",110412,1190.0
8,"Chambers County, Alabama",33562,401.0
9,"Cherokee County, Alabama",24051,56.0
