## Hands-On Data Preprocessing in Python
Learn how to effectively prepare data for successful data analytics
    
    AUTHOR: Dr. Roy Jafari 

# Chapter 9: Data Cleaning Level Ⅰ - Cleaning up the table

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data Cleaning Level Ⅰ - Cleaning up the table  
### Example 1 – Unwise data collection

In [2]:
from os import listdir

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of everything built from FileNames the same on every run and every machine.
FileNames = sorted(listdir('Speeches'))
print(FileNames)

['BattleCreekDec19_2019.txt', 'BemidjiSep18_2020.txt', 'CharlestonFeb28_2020.txt', 'CharlotteMar2_2020.txt', 'CincinnatiAug1_2019.txt', 'ColoradorSpringsFeb20_2020.txt', 'DallasOct17_2019.txt', 'DesMoinesJan30_2020.txt', 'FayettevilleSep19_2020.txt', 'FayettevilleSep9_2019.txt', 'FreelandSep10_2020.txt', 'GreenvilleJul17_2019.txt', 'HendersonSep13_2020.txt', 'HersheyDec10_2019.txt', 'LasVegasFeb21_2020.txt', 'LatrobeSep3_2020.txt', 'LexingtonNov4_2019.txt', 'MilwaukeeJan14_2020.txt', 'MindenSep12_2020.txt', 'MinneapolisOct10_2019.txt', 'MosineeSep17_2020.txt', 'NewHampshireAug15_2019.txt', 'NewHampshireAug28_2020.txt', 'NewHampshireFeb10_2020.txt', 'NewMexicoSep16_2019.txt', 'OhioSep21_2020.txt', 'PhoenixFeb19_2020.txt', 'PittsburghSep22_2020.txt', 'TexasSep23_2019.txt', 'ToledoJan9_2020.txt', 'TulsaJun20_2020.txt', 'TupeloNov1_2019.txt', 'WildwoodJan28_2020.txt', 'Winston-SalemSep8_2020.txt', 'YumaAug18_2020.txt']


In [3]:
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# re-imported here.
# Pre-allocate an all-NaN frame with one row per entry in FileNames; the two
# columns are filled in by the loop in a later cell.
speech_df = pd.DataFrame(index=range(len(FileNames)),columns=['File Name','The Content'])
print(speech_df)

   File Name The Content
0        NaN         NaN
1        NaN         NaN
2        NaN         NaN
3        NaN         NaN
4        NaN         NaN
5        NaN         NaN
6        NaN         NaN
7        NaN         NaN
8        NaN         NaN
9        NaN         NaN
10       NaN         NaN
11       NaN         NaN
12       NaN         NaN
13       NaN         NaN
14       NaN         NaN
15       NaN         NaN
16       NaN         NaN
17       NaN         NaN
18       NaN         NaN
19       NaN         NaN
20       NaN         NaN
21       NaN         NaN
22       NaN         NaN
23       NaN         NaN
24       NaN         NaN
25       NaN         NaN
26       NaN         NaN
27       NaN         NaN
28       NaN         NaN
29       NaN         NaN
30       NaN         NaN
31       NaN         NaN
32       NaN         NaN
33       NaN         NaN
34       NaN         NaN


In [4]:
# Fill one row per file: the file's name and the first line of its text.
for i, f_name in enumerate(FileNames):
    # The context manager closes the file even if reading raises.
    with open('Speeches/' + f_name, "r", encoding='utf-8') as f:
        # readline() reads only the first line and yields '' for an empty file,
        # whereas readlines()[0] loads every line and raises IndexError when empty.
        first_line = f.readline()

    speech_df.at[i, 'File Name'] = f_name
    speech_df.at[i, 'The Content'] = first_line

In [5]:
# Rename with an explicit old -> new mapping instead of positional assignment,
# so each new label is tied to the column it replaces regardless of column order.
speech_df = speech_df.rename(columns={'File Name': 'FileName', 'The Content': 'Content'})

In [6]:
# Display the table of file names and their text content.
speech_df

Unnamed: 0,FileName,Content
0,BattleCreekDec19_2019.txt,Thank you. Thank you. Thank you to Vice Presid...
1,BemidjiSep18_2020.txt,There's a lot of people. That's great. Thank y...
2,CharlestonFeb28_2020.txt,Thank you. Thank you. Thank you. All I can say...
3,CharlotteMar2_2020.txt,"I want to thank you very much. North Carolina,..."
4,CincinnatiAug1_2019.txt,Thank you all. Thank you very much. Thank you ...
5,ColoradorSpringsFeb20_2020.txt,"Hello Colorado. We love Colorado, most beautif..."
6,DallasOct17_2019.txt,Thank you. Thank you very much. Hello Dallas. ...
7,DesMoinesJan30_2020.txt,I worked so hard for this state. I worked so h...
8,FayettevilleSep19_2020.txt,"What a crowd, what a crowd. Get those people o..."
9,FayettevilleSep9_2019.txt,Thank you everybody. Thank you and Vice Presi...


### Example 2 – Reindexing (multi-level indexing)

In [7]:
# Read TempData.csv into a DataFrame and display it.
air_df = pd.read_csv('TempData.csv')
air_df

Unnamed: 0,Temp,Year,Month,Day,Time
0,79.0,2016,1,1,00:00:00
1,79.0,2016,1,1,00:30:00
2,79.0,2016,1,1,01:00:00
3,77.0,2016,1,1,01:30:00
4,78.0,2016,1,1,02:00:00
...,...,...,...,...,...
20448,77.0,2016,12,31,22:00:00
20449,77.0,2016,12,31,22:30:00
20450,77.0,2016,12,31,23:00:00
20451,77.0,2016,12,31,23:00:00


In [8]:
# Drop the Year column into a new frame; air_df itself is left unchanged.
air2016_df = air_df.drop(columns=['Year'])

In [9]:
# Rebuild from air_df rather than mutating air2016_df in place: with
# inplace=True a second run of this cell raises KeyError because Month/Day/Time
# have already moved into the index. Chaining from air_df keeps the cell
# safe to re-run and yields the same three-level (Month, Day, Time) index.
air2016_df = air_df.drop(columns=['Year']).set_index(['Month','Day','Time'])

In [10]:
# Display the frame, now indexed by Month, Day and Time.
air2016_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Temp
Month,Day,Time,Unnamed: 3_level_1
1,1,00:00:00,79.0
1,1,00:30:00,79.0
1,1,01:00:00,79.0
1,1,01:30:00,77.0
1,1,02:00:00,78.0
...,...,...,...
12,31,22:00:00,77.0
12,31,22:30:00,77.0
12,31,23:00:00,77.0
12,31,23:00:00,77.0


In [11]:
# Select by the three index levels in order: Month=2, Day=24, Time='00:30:00'.
air2016_df.loc[2,24,'00:30:00']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Temp
Month,Day,Time,Unnamed: 3_level_1
2,24,00:30:00,77.0


### Example 3 – Intuitive but long column titles

In [12]:
# Read the survey responses from CSV and preview the first row; the column
# labels are the full survey question texts.
response_df = pd.read_csv('OSMI Mental Health in Tech Survey 2019.csv')
response_df.head(1)

Unnamed: 0,*Are you self-employed?*,How many employees does your company or organization have?,Is your employer primarily a tech company/organization?,Is your primary role within your company related to tech/IT?,Does your employer provide mental health benefits as part of healthcare coverage?,Do you know the options for mental health care available under your employer-provided health coverage?,"Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?",Does your employer offer resources to learn more about mental health disorders and options for seeking help?,Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?,"If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?",...,Briefly describe what you think the industry as a whole and/or employers could do to improve mental health support for employees.,"If there is anything else you would like to tell us that has not been covered by the survey questions, please use this space to do so.",Would you be willing to talk to one of us more extensively about your experiences with mental health issues in the tech industry? (Note that all interview responses would be used _anonymously_ and only with your permission.),What is your age?,What is your gender?,What country do you *live* in?,What US state or territory do you *live* in?,What is your race?,What country do you *work* in?,What US state or territory do you *work* in?
0,False,26-100,True,True,I don't know,No,Yes,Yes,I don't know,Very easy,...,,,False,25,Male,United States of America,Nebraska,White,United States of America,Nebraska


In [13]:
# Selecting a single column requires spelling out the full question text as its label.
response_df['Do you know the options for mental health care available under your employer-provided health coverage?']

0       No
1       No
2       No
3       No
4       No
      ... 
347     No
348    NaN
349    NaN
350     No
351    Yes
Name: Do you know the options for mental health care available under your employer-provided health coverage?, Length: 352, dtype: object

In [14]:
# One short key (Q1, Q2, ...) per survey column. The count is taken from the
# frame itself rather than hard-coded, so the keys always line up with the
# columns and the Series below cannot fail with a length mismatch.
keys = ['Q{}'.format(i) for i in range(1, len(response_df.columns) + 1)]

# Lookup table from each short key back to the original column label.
columns_dic = pd.Series(response_df.columns,index=keys)

In [15]:
# Look up the original question text stored under the short key Q4.
columns_dic['Q4']

'Is your primary role within your company related to tech/IT?'

In [16]:
# Replace the column labels with the short keys; assignment is positional,
# so keys[k] becomes the label of the k-th column.
response_df.columns = keys

In [17]:
# Preview the first row under the new short column labels.
response_df.head(1)

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q73,Q74,Q75,Q76,Q77,Q78,Q79,Q80,Q81,Q82
0,False,26-100,True,True,I don't know,No,Yes,Yes,I don't know,Very easy,...,,,False,25,Male,United States of America,Nebraska,White,United States of America,Nebraska
