# Cleaning Data - Casting Datatypes and Handling Missing Values

In [1]:
import pandas as pd
import numpy as np

In [2]:
people = {
    "first":['vikas','rakesh','mahesh', 'saurabh', np.nan, None, 'NA'],
    "last":['sharma','puri','jain', 'sharma', np.nan, np.nan, 'Missing'],
    "email":["vikas_sharma@gmail.com","rakesh_puri@gmail.com","mahesh_jain@gmail.com", None, np.nan, 'Anonymous@emial.com', 'NA'],
    "age": ['33', '55', '63', '36', None, None, 'Missing']
}

In [3]:
df_dict = pd.DataFrame(people)
df_dict

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
3,saurabh,sharma,,36
4,,,,
5,,,Anonymous@emial.com,
6,,Missing,,Missing


In [4]:
# df = pd.read_csv('D:/GIT_Repositories/pandas/survey_results_public.csv', index_col = 'Respondent')
# schema_df = pd.read_csv('D:/GIT_Repositories/pandas/survey_results_schema.csv', index_col = 'Column')

# pd.set_option('display.max_columns', 85)
# pd.set_option('display.max_rows', 85)

In [5]:
df_dict.dropna()

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
6,,Missing,,Missing


### How dropna works ????

#### axis = 'index'   => drop rows
#### axis = 'columns' => drop columns
#### how = 'any' => If any NA values are present, drop that row or column. 
i.e. drop row, even if one of the values of a row is 'NaN' or None 
#### how = 'all' => drop row, ONLY if all the values of row are 'NaN' or None
i.e. If all values are NA, drop that row or column.

In [6]:
# default parameters 

# index => drop rows,  how = 'any' => even if one of the values of a row is NaN or none- drop row

df_dict.dropna(axis = 'index', how = 'any' ) 

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
6,,Missing,,Missing


In [7]:
df_dict.dropna(axis = 'index', how = 'all' )   # only row with index = 4 would be dropped as it has all missing values

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
3,saurabh,sharma,,36
5,,,Anonymous@emial.com,
6,,Missing,,Missing


In [8]:
# axis = 'columns'

df_dict.dropna(axis = 'columns', how = 'all' )    # since none of the columns has ALL Missing/NaN/None values -- dataframe is returned as is

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
3,saurabh,sharma,,36
4,,,,
5,,,Anonymous@emial.com,
6,,Missing,,Missing


In [9]:
# Every column has at least one NaN/None value, so with how = 'any' all columns are dropped
# --- only the index remains (a DataFrame with no columns)

df_dict.dropna(axis = 'columns', how = 'any' ) 

0
1
2
3
4
5
6


### Scenario: I'm analysing the data, and it is fine if a row doesn't have a first name or last name, but we really need the email address — any row without an email address should be dropped

In [10]:
df_dict

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
3,saurabh,sharma,,36
4,,,,
5,,,Anonymous@emial.com,
6,,Missing,,Missing


In [11]:
# subset parameter: only the listed columns are checked for missing values

df_dict.dropna(axis = 'index', how = 'any', subset = ['email'] )   # drop row if its 'email' is NaN or None 

# With a single-column subset, how = 'any' has no extra effect: the check runs only over the subset,
# so how = 'all' would return the same result in this case.

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
5,,,Anonymous@emial.com,
6,,Missing,,Missing


In [12]:
df_dict.dropna(axis = 'index', how = 'all', subset = ['last', 'email'] )   # drop row if both last & email are none or NaN

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
3,saurabh,sharma,,36
5,,,Anonymous@emial.com,
6,,Missing,,Missing


In [13]:
# drop if any of last name or email is ---> NaN or None

df_dict.dropna(axis = 'index', how = 'any', subset = ['last', 'email'] )

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
6,,Missing,,Missing


In [14]:
# default is ----> how = 'any'  ;  

df_dict.dropna(axis = 'index', subset = ['last', 'email'] )   # drop if any of last name or email is ---> NaN or None

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
6,,Missing,,Missing


In [15]:
# default is ----> axis = 'index'

df_dict.dropna(subset = ['last', 'email'] )     # drop if any of last name or email is ---> NaN or None

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
6,,Missing,,Missing


# How to handle custom missing values ????

### Whoever prepared the data might have entered strings such as 'NA' or 'Missing' for fields with no value

In [16]:
# After the DataFrame has been built: replace the custom missing-value markers
# ('NA' and 'Missing') with numpy's nan so that dropna/isna/fillna recognise them.
# Both markers are handled in a single replace call, and the result is assigned back
# to df_dict rather than mutating it with inplace=True.

df_dict = df_dict.replace(['NA', 'Missing'], np.nan)
df_dict

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33.0
1,rakesh,puri,rakesh_puri@gmail.com,55.0
2,mahesh,jain,mahesh_jain@gmail.com,63.0
3,saurabh,sharma,,36.0
4,,,,
5,,,Anonymous@emial.com,
6,,,,


In [17]:
# drop if any of last name or email is ---> NaN or None

df_dict.dropna(axis = 'index', how = 'any', subset = ['last', 'email'] )

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63


In [18]:
# drop the row only if both last name and email are ---> NaN or None

df_dict.dropna(axis = 'index', how = 'all', subset = ['last', 'email'] )  

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33.0
1,rakesh,puri,rakesh_puri@gmail.com,55.0
2,mahesh,jain,mahesh_jain@gmail.com,63.0
3,saurabh,sharma,,36.0
5,,,Anonymous@emial.com,


In [19]:
# how can we know if certain values of row/column are treated as nan ??

# to check which values are ---> nan ??

df_dict.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


# Sometimes, while working with numeric data, we want to replace 'nan' with a pre-defined or default value

ex: If a student did not attend an exam, it is better to record 0 (zero) instead of nan, which helps in calculating grades from totals

In [20]:
df_dict

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33.0
1,rakesh,puri,rakesh_puri@gmail.com,55.0
2,mahesh,jain,mahesh_jain@gmail.com,63.0
3,saurabh,sharma,,36.0
4,,,,
5,,,Anonymous@emial.com,
6,,,,


In [21]:
df_dict.fillna(0)   #  note:  zero is suitable only for numeric fields

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
3,saurabh,sharma,0,36
4,0,0,0,0
5,0,0,Anonymous@emial.com,0
6,0,0,0,0


In [22]:
df_dict.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33
1,rakesh,puri,rakesh_puri@gmail.com,55
2,mahesh,jain,mahesh_jain@gmail.com,63
3,saurabh,sharma,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@emial.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [23]:
df_dict   # without inplace=True (or assigning the result back), fillna leaves the original dataframe unchanged

Unnamed: 0,first,last,email,age
0,vikas,sharma,vikas_sharma@gmail.com,33.0
1,rakesh,puri,rakesh_puri@gmail.com,55.0
2,mahesh,jain,mahesh_jain@gmail.com,63.0
3,saurabh,sharma,,36.0
4,,,,
5,,,Anonymous@emial.com,
6,,,,


# how do we know the data types of columns in a dataframe ???

In [24]:
df_dict.dtypes    # dtype 'object' ---> non-numeric (here: strings and missing values), so numeric operations won't work

first    object
last     object
email    object
age      object
dtype: object

# how to calculate AVERAGE age??

In [25]:
df_dict['age'].mean()   # raises TypeError: 'age' holds strings (dtype object), not numbers

TypeError: can only concatenate str (not "int") to str

In [26]:
type(np.nan)

float

In [27]:
df_dict['age'] = df_dict['age'].astype(int)   
# ERROR because the 'age' column has missing values (None / NaN): NaN is a float, and missing values cannot be converted to int

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

In [28]:
df_dict['age'] = df_dict['age'].astype(float)   

In [29]:
df_dict.dtypes   # age converted to float

first     object
last      object
email     object
age      float64
dtype: object

In [30]:
df_dict['age'].mean()

np.float64(46.75)

# How to replace CUSTOM MISSING values while loading the CSV ???

In [31]:
# Custom strings that should be treated as missing (NaN) while the CSV is being loaded
na_vals = ['NA', 'Missing']

# NOTE(review): absolute, machine-specific path --- consider a configurable data directory so the notebook runs elsewhere
df = pd.read_csv('D:/GIT_Repositories/pandas/survey_results_public.csv', index_col = 'Respondent', na_values = na_vals)
df.head()

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy


# How to find average years of experience of survey participants ???

In [32]:
df['YearsCode'].head(10)

Respondent
1       4
2     NaN
3       3
4       3
5      16
6      13
7       6
8       8
9      12
10     12
Name: YearsCode, dtype: object

In [33]:
df['YearsCode'].mean()

TypeError: can only concatenate str (not "int") to str

In [36]:
df['YearsCode'] = df['YearsCode'].astype(float)   # STILL fails because the column has some non-numeric string values

ValueError: could not convert string to float: 'Less than 1 year'

In [37]:
# column has some strings -- beyond numeric and nan values

# to know all unique values of a column ??????

df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [43]:
# Another way --- list the unique values along with their frequency

# since here we just need to know what the unique values are --- unique() is sufficient..

# just for info; otherwise we don't need the count of unique values here

df['YearsCode'].value_counts()   # ===== > DOESN'T COUNT  'nan'  values (by default)

YearsCode
5                     7047
10                    6777
6                     6179
4                     5729
8                     5361
7                     5320
3                     5179
2                     3974
15                    3942
20                    3636
12                    3530
9                     3360
11                    2265
14                    2126
13                    2036
18                    1900
1                     1814
25                    1657
16                    1593
30                    1532
Less than 1 year      1367
17                    1349
19                    1018
22                    1016
35                     873
23                     745
21                     715
24                     693
40                     497
28                     465
32                     420
26                     409
27                     408
33                     353
38                     340
34                     327
37                

In [44]:
df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [45]:
'''
replce the below 2 string values  -

1. 'Less than 1 year'     =====================> consider as 0 years
2. 'More than 50 years'   =====================> consider as 51 years

with numbers so that it helps derive average coding experience of the participants

'''

"\nreplce the below 2 string values  -\n1. 'Less than 1 year'    &\n2. 'More than 50 years' \nwith numbers so that it helps derive average coding experience of the participants\n"

In [46]:
# 1. 'Less than 1 year'     =====================> consider as 0 years
#
# Assign the result back to the column instead of calling replace(..., inplace=True)
# on df['YearsCode']: the chained inplace call acts on an intermediate Series, which
# raises a FutureWarning in pandas 2.2 and no longer updates df under Copy-on-Write.

df['YearsCode'] = df['YearsCode'].replace('Less than 1 year', 0)

In [49]:
# 2. 'More than 50 years'   =====================> consider as 51 years
#
# Assign the result back to the column instead of calling replace(..., inplace=True)
# on df['YearsCode']: the chained inplace call acts on an intermediate Series, which
# raises a FutureWarning in pandas 2.2 and no longer updates df under Copy-on-Write.

df['YearsCode'] = df['YearsCode'].replace('More than 50 years', 51)

In [48]:
df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 0, '30', '9', '26', '40', '19', '15', '20', '28',
       '25', '1', '22', '11', '33', '50', '41', '18', '34', '24', '23',
       '42', '27', '21', '36', '32', '39', '38', '31', '37', 51, '29',
       '44', '45', '48', '46', '43', '47', '49'], dtype=object)

In [50]:
# now convert the column to float

df['YearsCode'] = df['YearsCode'].astype(float)   # won't fail this time: the two text values were replaced with numbers

In [52]:
df['YearsCode'].mean()                 # AVG coding experience of participants is  ===>  ~11.66 years

np.float64(11.662114216834588)

In [53]:
df['YearsCode'].median()

np.float64(9.0)