# Data Cleaning and Exploration Exercise

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import glob as gb

files = gb.glob('../data/states*.csv')
df_list = [pd.read_csv(file) for file in files]
data = pd.concat(df_list).reset_index()
data.head()

Unnamed: 0.1,index,Unnamed: 0,State,TotalPop,Hispanic,White,Black,Native,Asian,Pacific,Income,GenderPop
0,0,0,Missouri,6045448,4.037247838616718%,77.508069164265%,14.122118155619594%,0.36332853025936646%,1.6244956772334296%,0.10165706051873193%,$49763.98772563177,2964003M_3081445F
1,1,1,Montana,1014699,3.2688888888888896%,86.41555555555554%,0.4292592592592591%,7.0607407407407425%,0.5703703703703705%,0.07222222222222222%,$47645.682835820895,510163M_F
2,2,2,Nebraska,1869365,9.203759398496235%,81.13947368421056%,4.956203007518794%,0.8644736842105263%,1.8590225563909788%,0.05714285714285715%,$55916.469696969696,929606M_939759F
3,3,3,Nevada,2798636,27.100883652430046%,53.23932253313698%,7.739617083946994%,1.0871870397643593%,7.095729013254786%,0.5745213549337267%,$55526.525073746314,1407735M_1390901F
4,4,4,New Hampshire,1324201,3.3219178082191796%,91.31917808219184%,1.2277397260273974%,0.14280821917808229%,2.191438356164382%,0.016095890410958904%,$68728.8595890411,653484M_670717F


In [2]:
data = data[['State', 'TotalPop', 'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific', 'Income', 'GenderPop']]
data.head()

Unnamed: 0,State,TotalPop,Hispanic,White,Black,Native,Asian,Pacific,Income,GenderPop
0,Missouri,6045448,4.037247838616718%,77.508069164265%,14.122118155619594%,0.36332853025936646%,1.6244956772334296%,0.10165706051873193%,$49763.98772563177,2964003M_3081445F
1,Montana,1014699,3.2688888888888896%,86.41555555555554%,0.4292592592592591%,7.0607407407407425%,0.5703703703703705%,0.07222222222222222%,$47645.682835820895,510163M_F
2,Nebraska,1869365,9.203759398496235%,81.13947368421056%,4.956203007518794%,0.8644736842105263%,1.8590225563909788%,0.05714285714285715%,$55916.469696969696,929606M_939759F
3,Nevada,2798636,27.100883652430046%,53.23932253313698%,7.739617083946994%,1.0871870397643593%,7.095729013254786%,0.5745213549337267%,$55526.525073746314,1407735M_1390901F
4,New Hampshire,1324201,3.3219178082191796%,91.31917808219184%,1.2277397260273974%,0.14280821917808229%,2.191438356164382%,0.016095890410958904%,$68728.8595890411,653484M_670717F


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 10 columns):
State        60 non-null object
TotalPop     60 non-null int64
Hispanic     60 non-null object
White        60 non-null object
Black        60 non-null object
Native       60 non-null object
Asian        60 non-null object
Pacific      55 non-null object
Income       60 non-null object
GenderPop    60 non-null object
dtypes: int64(1), object(9)
memory usage: 4.8+ KB


In [4]:
# convert population percentages into floats
def string_to_numbers(col):
    temp = col.replace('[\%,]', '', regex=True) # returns a series
    return pd.to_numeric(temp)

data['Hispanic'] = string_to_numbers(data.Hispanic)
data['White'] = string_to_numbers(data.White)
data['Black'] = string_to_numbers(data.Black)
data['Native'] = string_to_numbers(data.Native)
data['Asian'] = string_to_numbers(data.Asian)
data['Pacific'] = string_to_numbers(data.Pacific)

In [5]:
# convert income into a float
income_series = data.Income.replace('[\$,]', '', regex=True)
data['Income'] = pd.to_numeric(income_series)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 10 columns):
State        60 non-null object
TotalPop     60 non-null int64
Hispanic     60 non-null float64
White        60 non-null float64
Black        60 non-null float64
Native       60 non-null float64
Asian        60 non-null float64
Pacific      55 non-null float64
Income       60 non-null float64
GenderPop    60 non-null object
dtypes: float64(7), int64(1), object(2)
memory usage: 4.8+ KB


In [6]:
data.head()

Unnamed: 0,State,TotalPop,Hispanic,White,Black,Native,Asian,Pacific,Income,GenderPop
0,Missouri,6045448,4.037248,77.508069,14.122118,0.363329,1.624496,0.101657,49763.987726,2964003M_3081445F
1,Montana,1014699,3.268889,86.415556,0.429259,7.060741,0.57037,0.072222,47645.682836,510163M_F
2,Nebraska,1869365,9.203759,81.139474,4.956203,0.864474,1.859023,0.057143,55916.469697,929606M_939759F
3,Nevada,2798636,27.100884,53.239323,7.739617,1.087187,7.095729,0.574521,55526.525074,1407735M_1390901F
4,New Hampshire,1324201,3.321918,91.319178,1.22774,0.142808,2.191438,0.016096,68728.859589,653484M_670717F


In [7]:
# split 'GenderPop' into male and female
male_female_split = data.GenderPop.str.split('_')
male_female_split[:]

0       [2964003M, 3081445F]
1               [510163M, F]
2         [929606M, 939759F]
3       [1407735M, 1390901F]
4         [653484M, 670717F]
5       [4343027M, 4561386F]
6       [2341093M, 2489527F]
7         [384160M, 349215F]
8       [3299088M, 3342840F]
9       [1451913M, 1506295F]
10    [19087135M, 19334329F]
11      [2648667M, 2630239F]
12        [510388M, 543273F]
13      [2322409M, 2455167F]
14        [423477M, 419713F]
15      [3167756M, 3331859F]
16    [13171316M, 13367298F]
17      [1459229M, 1444150F]
18             [2872643M, F]
19      [3249650M, 3455936F]
20      [4861973M, 5038598F]
21      [2692166M, 2727005F]
22      [1451723M, 1536358F]
23      [2964003M, 3081445F]
24      [4343027M, 4561386F]
25      [1032414M, 1051703F]
26     [9541801M, 10131373F]
27      [4795408M, 5049925F]
28        [367963M, 353677F]
29      [5662893M, 5913084F]
30      [5662893M, 5913084F]
31      [1906944M, 1942789F]
32      [1948453M, 1990780F]
33      [6245344M, 6534215F]
34      [17138

In [8]:
data['Male'] = male_female_split.str.get(0)
data['Female'] = male_female_split.str.get(1)
data.head()

Unnamed: 0,State,TotalPop,Hispanic,White,Black,Native,Asian,Pacific,Income,GenderPop,Male,Female
0,Missouri,6045448,4.037248,77.508069,14.122118,0.363329,1.624496,0.101657,49763.987726,2964003M_3081445F,2964003M,3081445F
1,Montana,1014699,3.268889,86.415556,0.429259,7.060741,0.57037,0.072222,47645.682836,510163M_F,510163M,F
2,Nebraska,1869365,9.203759,81.139474,4.956203,0.864474,1.859023,0.057143,55916.469697,929606M_939759F,929606M,939759F
3,Nevada,2798636,27.100884,53.239323,7.739617,1.087187,7.095729,0.574521,55526.525074,1407735M_1390901F,1407735M,1390901F
4,New Hampshire,1324201,3.321918,91.319178,1.22774,0.142808,2.191438,0.016096,68728.859589,653484M_670717F,653484M,670717F


In [9]:
# convert 'Male' and 'Female' columns into integers
male_series = data.Male.str.replace('M', '')
data['Male'] = pd.to_numeric(male_series)

female_series = data.Female.str.replace('F', '')
data['Female'] = pd.to_numeric(female_series)

In [10]:
# sum the number of 'NaN' values in the dataset 
data.isnull().sum()

State        0
TotalPop     0
Hispanic     0
White        0
Black        0
Native       0
Asian        0
Pacific      5
Income       0
GenderPop    0
Male         0
Female       3
dtype: int64

In [11]:
data = data.fillna(0)
data.isnull().sum()

State        0
TotalPop     0
Hispanic     0
White        0
Black        0
Native       0
Asian        0
Pacific      0
Income       0
GenderPop    0
Male         0
Female       0
dtype: int64

In [12]:
def fill_in_pop(row):
    if row.Female == 0:
        return row.TotalPop - row.Male
    else:
        return row.Female

# fill in the missing female population numbers
data['Female'] = data.apply(fill_in_pop, axis=1)

In [13]:
# 'melt' male and female columns into 'Gender' and 'GenderPop'
data = pd.melt(frame=data, id_vars=['State', 'TotalPop', 'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific', 'Income'],
              value_vars=['Male', 'Female'], var_name='Gender', value_name='GenderPop')

In [14]:
data.sample(10)

Unnamed: 0,State,TotalPop,Hispanic,White,Black,Native,Asian,Pacific,Income,Gender,GenderPop
37,Vermont,626604,1.60929,93.98306,0.980874,0.301639,1.238798,0.030601,55602.967213,Male,308573.0
113,Iowa,3093526,5.303645,87.719684,3.256987,0.289793,1.699392,0.055164,53017.753041,Female,1558931.0
47,Georgia,10006693,8.418242,54.286306,32.088298,0.187583,3.097649,0.046602,50811.082051,Male,4883331.0
17,Utah,2903379,13.468376,79.406838,1.017949,1.081368,2.196068,0.825983,63488.917808,Male,1459229.0
106,Florida,19645772,21.338543,59.083749,15.165676,0.210451,2.283174,0.05151,50690.194987,Female,10045763.0
63,Nevada,2798636,27.100884,53.239323,7.739617,1.087187,7.095729,0.574521,55526.525074,Female,1390901.0
52,Indiana,6568645,6.536744,78.431894,11.186977,0.194086,1.578272,0.032625,48616.227848,Male,3235263.0
108,Georgia,10006693,8.418242,54.286306,32.088298,0.187583,3.097649,0.046602,50811.082051,Female,5123362.0
2,Nebraska,1869365,9.203759,81.139474,4.956203,0.864474,1.859023,0.057143,55916.469697,Male,929606.0
80,Michigan,9900571,4.634993,72.381722,17.633103,0.484411,2.42311,0.019549,51201.830037,Female,5038598.0


In [15]:
# 'melt' different races into a single column
data = pd.melt(frame=data, id_vars=['State', 'TotalPop', 'Income', 'Gender', 'GenderPop'],
        value_vars=['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific'],
              var_name='Race', value_name='RacePopPercentage')

In [16]:
data.sample(10)

Unnamed: 0,State,TotalPop,Income,Gender,GenderPop,Race,RacePopPercentage
263,Missouri,6045448,49763.987726,Male,2964003.0,Black,14.122118
232,Indiana,6568645,48616.227848,Female,3333382.0,White,78.431894
10,California,38421464,67264.782303,Male,19087135.0,Hispanic,37.291875
715,Kansas,2892987,53885.612648,Female,1453125.0,Pacific,0.0
215,Rhode Island,1053661,59125.270833,Female,543273.0,White,74.325417
702,Colorado,5278906,64657.801787,Female,2630239.0,Pacific,0.0
22,Mississippi,2988081,38909.919207,Male,1451723.0,Hispanic,2.842401
502,Mississippi,2988081,38909.919207,Male,1451723.0,Asian,0.876444
330,Ohio,11575977,49655.248466,Female,5913084.0,Black,16.207276
497,Utah,2903379,63488.917808,Male,1459229.0,Asian,2.196068
