Cleaning Data - Casting Datatypes and Handling Missing Values

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Example dataframe used throughout the first half of the notebook.
# It deliberately mixes three kinds of "missing": None, np.nan, and the
# placeholder strings 'NA' / 'Missing'. Ages are stored as strings.
people = dict(
    first=["Anish", "Ramish", None, "Samish", np.nan, "Bamish", np.nan, 'NA'],
    last=["Khadka", "Mainali", "Shrestha", "Karki", "Mainali", np.nan, np.nan, 'Missing'],
    email=[
        None,
        "mainaliramish89@gmail.com",
        "shresthasamish28@gmail.com",
        None,
        "bamishkarki819@gmail.com",
        "bamishmainali78@gmail.com",
        np.nan,
        'Missing',
    ],
    age=['23', '24', '22', '26', '27', None, None, 'Missing'],
)
mydf = pd.DataFrame(data=people)
mydf

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
6,,,,
7,,Missing,Missing,Missing


In [3]:
# Drop every row that has at least one missing value (None/NaN) in any column.
# Row 7 survives because 'NA' and 'Missing' are ordinary strings, not real
# missing values.
mydf.dropna()

Unnamed: 0,first,last,email,age
1,Ramish,Mainali,mainaliramish89@gmail.com,24
7,,Missing,Missing,Missing


In [4]:
# The same drop with the arguments spelled out:
#   axis='index' -> rows (not columns) are the candidates for removal
#   how='any'    -> a row is dropped as soon as one of its values is missing
#   how='all'    -> (alternative) a row is dropped only when every value is missing
mydf.dropna(axis='index', how='any')

Unnamed: 0,first,last,email,age
1,Ramish,Mainali,mainaliramish89@gmail.com,24
7,,Missing,Missing,Missing


In [5]:
# how='all': only rows in which every column is missing are removed (row 6 here).
mydf.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
7,,Missing,Missing,Missing


In [6]:
# axis='columns' applies the same test to columns instead of rows.
mydf.dropna(axis='columns', how='all') # nothing is dropped: no column is entirely missing

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
6,,,,
7,,Missing,Missing,Missing


In [7]:
# subset restricts the missing-value check to the listed columns: keep only the
# rows that have an email, whatever the other columns contain.
# Row 7 is kept because its email is the string 'Missing', not a real NaN.
mydf.dropna(axis='index', how='any', subset=['email'])

Unnamed: 0,first,last,email,age
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
7,,Missing,Missing,Missing


In [8]:
# Drop a row only when both 'last' and 'email' are missing (how='all' over the subset).
# With how='any' a row would be dropped if either of the two were missing.
# dropna returns a new frame; mydf itself stays unchanged unless the result is
# assigned back (or inplace=True is passed).
mydf.dropna(axis='index', how='all', subset=['last', 'email'])

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
7,,Missing,Missing,Missing


In [9]:
# 'NA' and 'Missing' are placeholder strings rather than real missing values,
# so dropna/isna ignore them. Convert both to NaN in a single in-place pass.
mydf.replace(['NA', 'Missing'], np.nan, inplace=True)
mydf

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23.0
1,Ramish,Mainali,mainaliramish89@gmail.com,24.0
2,,Shrestha,shresthasamish28@gmail.com,22.0
3,Samish,Karki,,26.0
4,,Mainali,bamishkarki819@gmail.com,27.0
5,Bamish,,bamishmainali78@gmail.com,
6,,,,
7,,,,


In [10]:
# Re-run the subset drop now that the placeholders are real NaN:
# row 7 is missing both 'last' and 'email', so it is removed along with row 6.
mydf.dropna(axis='index', how='all', subset=['last', 'email'])

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23.0
1,Ramish,Mainali,mainaliramish89@gmail.com,24.0
2,,Shrestha,shresthasamish28@gmail.com,22.0
3,Samish,Karki,,26.0
4,,Mainali,bamishkarki819@gmail.com,27.0
5,Bamish,,bamishmainali78@gmail.com,


In [11]:
# Boolean mask with the same shape as the frame: True wherever a value is missing.
mydf.isna()

Unnamed: 0,first,last,email,age
0,False,False,True,False
1,False,False,False,False
2,True,False,False,False
3,False,False,True,False
4,True,False,False,False
5,False,True,False,True
6,True,True,True,True
7,True,True,True,True


In [12]:
# fillna substitutes the given value for every missing entry.
# The filled frame is only displayed here; it is not assigned back to mydf.
mydf.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,Anish,Khadka,MISSING,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,MISSING,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,MISSING,26
4,MISSING,Mainali,bamishkarki819@gmail.com,27
5,Bamish,MISSING,bamishmainali78@gmail.com,MISSING
6,MISSING,MISSING,MISSING,MISSING
7,MISSING,MISSING,MISSING,MISSING


In [13]:
# Any fill value works, but a single value is applied to every column:
# the name and email columns receive 0 as well, not just age.
mydf.fillna(0)

Unnamed: 0,first,last,email,age
0,Anish,Khadka,0,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,0,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,0,26
4,0,Mainali,bamishkarki819@gmail.com,27
5,Bamish,0,bamishmainali78@gmail.com,0
6,0,0,0,0
7,0,0,0,0


In [14]:
# Casting data types: inspect the current dtype of every column first.
# 'age' was built from strings, so it is reported as object rather than a numeric dtype.
mydf.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [15]:
# We want age to be numeric data.
# NaN is itself a float, which is why a column containing NaN can be cast to
# float but not to a plain int.
type(np.nan) # NaN is actually a float


float

In [16]:
# mydf['age'] = mydf['age'].astype(int) # won't work because of NaN values
# Cast the numeric strings to float instead: float can represent the NaN entries.
mydf['age'] = mydf['age'].astype(float)
# Confirm that 'age' is now float64 while the other columns stay object.
mydf.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [17]:
# To cast every column of a dataframe at once (every column must be convertible):
# mydf.astype(float)

In [18]:
# With age stored as float the mean can be computed; the NaN entries are
# skipped, so this is the average of the five known ages.
mydf['age'].mean()

24.4

In [24]:
# Real data: the Stack Overflow 2019 developer survey.
# Placeholder strings that should count as missing are declared up front and
# handed to the reader, so they arrive as NaN and no later replace() is needed.
na_vals = ['NA', 'Missing', ' ']
df = pd.read_csv(
    'Data/stack-overflow-developer-survey-2019/survey_results_public.csv',
    index_col='Respondent',
    na_values=na_vals,
)
# The schema file is indexed by 'Column'.
schema_df = pd.read_csv(
    'Data/stack-overflow-developer-survey-2019/survey_results_schema.csv',
    index_col='Column',
)

In [25]:
# Let pandas display up to 85 columns and 85 rows before truncating output.
for _option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option, 85)

In [26]:
# Goal: find the average number of years of coding experience among the survey respondents.
# Start by looking at the first 100 answers in the YearsCode column.
df['YearsCode'].head(100)

Respondent
1        4
2      NaN
3        3
4        3
5       16
      ... 
96      19
97       7
98       5
99       6
100     33
Name: YearsCode, Length: 100, dtype: object

In [28]:
# df['YearsCode'] = df['YearsCode'].astype(float)  # Won't work
# The column is object dtype, so list its distinct values to find the
# entries that are not plain numbers.
df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [30]:
# Make corrections: map the two text answers to numbers so the column can be
# cast to a numeric dtype afterwards.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['YearsCode']. That form operates on a Series
# selected from the frame; under pandas' Copy-on-Write rules it updates a
# temporary object and leaves df unchanged.
df['YearsCode'] = df['YearsCode'].replace({
    'Less than 1 year': 0,      # less than a year counts as zero
    'More than 50 years': 51,   # just above 50
})

In [31]:
# Check the column dtypes: YearsCode is still listed as object after the replacements.
df.dtypes

MainBranch                 object
Hobbyist                   object
OpenSourcer                object
OpenSource                 object
Employment                 object
Country                    object
Student                    object
EdLevel                    object
UndergradMajor             object
EduOther                   object
OrgSize                    object
DevType                    object
YearsCode                  object
Age1stCode                 object
YearsCodePro               object
CareerSat                  object
JobSat                     object
MgrIdiot                   object
MgrMoney                   object
MgrWant                    object
JobSeek                    object
LastHireDate               object
LastInt                    object
FizzBuzz                   object
JobFactors                 object
ResumeUpdate               object
CurrencySymbol             object
CurrencyDesc               object
CompTotal                 float64
CompFreq      

In [32]:
# YearsCode is still an object column. Cast it to float
# (float rather than int, because the column also contains NaN).
df['YearsCode'] = df['YearsCode'].astype(float)

In [33]:
# Average years of coding experience across the respondents who answered.
df['YearsCode'].mean()

11.662114216834588

In [34]:
# Median years of coding experience; it comes out lower than the mean (9.0 vs. about 11.7).
df['YearsCode'].median()

9.0

In [None]:
# Knowing how to handle messy data is the real skill; the specific functions can always be looked up.