In [54]:
import pandas as pd
import numpy as np

In [55]:
# Sample records that mix real missing markers (np.nan / None) with the
# placeholder strings 'NA' and 'Missing'. Ages are stored as strings.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMShafer@gmail.com',
        'JaneDoe@gmail.com',
        'JohnDoe@gmail.com',
        None,
        np.nan,
        'Anonymous@gmail.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [56]:
# Build a DataFrame from the `people` dict: each key becomes a column.
df = pd.DataFrame(people)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMShafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@gmail.com,
6,,Missing,,Missing


In [57]:
df.dropna() 
# The defaults are `axis='index', how='any'`.
# `axis` specifies whether rows ('index') or columns should be dropped.
# `how` specifies whether a row/column is dropped when any ('any') or all ('all') of its values are missing.

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMShafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
6,,Missing,,Missing


In [58]:
# Drop only the rows in which every value is missing.
df.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMShafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
5,,,Anonymous@gmail.com,
6,,Missing,,Missing


- We can also specify which columns should be checked for missing values before rows are dropped. We pass a list of those columns to the `subset` keyword argument.

In [59]:
df.dropna(axis='index', how='any', subset=['email']) # Drop the rows that don't have an email.
# 'NA' and 'Missing' are not dropped because they are ordinary strings, not actual missing values (`np.nan`/`None`).
# To turn our custom 'NA' and 'Missing' markers into NaN, we can use the .replace() method.

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMShafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
5,,,Anonymous@gmail.com,
6,,Missing,,Missing


In [60]:
df.isna() # True marks the values that pandas classifies as missing (NaN/None).

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,False,False,False,False


In [61]:
df.fillna(0) # We can fill every missing value with a chosen string or number, in this case 0.
# fillna returns a new DataFrame; assign the result (or pass `inplace=True`) if you want the change to be permanent.

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMShafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@gmail.com,0
6,,Missing,,Missing


In [62]:
# Turn the placeholder strings into real NaN values (in place) so that
# pandas' missing-data methods recognise them.
df.replace(['Missing', 'NA'], np.nan, inplace=True)

- ##  Casting data types

In [63]:
# Inspect the column dtypes: every column, including `age`, is stored as `object`.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

- Let's cast our `age` column to a float type. We might prefer `int`, but the `age` column contains some `nan` values, and casting it to `int` would raise an error because `nan` cannot be converted to `int`. Under the hood, `nan` is of type `float`.
 Check it out below.

In [64]:
# np.nan is itself a float, which is why a column holding NaN can be cast to float.
type(np.nan)

float

-  Now, let's convert the `age` column to type `float`. Don't forget that `nan` has a type of `float` under the hood.

In [65]:
# Cast the string ages to float (not int) because the column contains NaN.
df['age'] = df['age'].astype(float)

In [66]:
# NaN entries are skipped, so this averages only the four known ages.
df['age'].mean()

46.75

In [67]:
# Confirm that the cast took effect.
df['age'].dtype

dtype('float64')

- #### Let's do some work on a bigger data set.

In [68]:
# Extra strings that read_csv should treat as missing values.
na_vals = ['NAN', 'NA']
devs_data = pd.read_csv(
    '../../../dev_survey.csv',
    index_col='ResponseId',
    na_values=na_vals,
)
# List each distinct value found in the column (every value appears once).
devs_data['YearsCode'].unique()

array([nan, '14', '20', '8', '15', '3', '1', '6', '37', '5', '12', '22',
       '11', '4', '7', '13', '36', '2', '25', '10', '40', '16', '27',
       '24', '19', '9', '17', '18', '26', 'More than 50 years', '29',
       '30', '32', 'Less than 1 year', '48', '45', '38', '39', '28', '23',
       '43', '21', '41', '35', '50', '33', '31', '34', '46', '44', '42',
       '47', '49'], dtype=object)

In [69]:
# Map the two textual answers onto numbers, then cast the whole column to
# float (float rather than int because the column also contains NaN).
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `devs_data['YearsCode']`: that chained
# in-place call operates on an intermediate Series, triggers a warning in
# recent pandas versions, and leaves `devs_data` unchanged under Copy-on-Write.
devs_data['YearsCode'] = (
    devs_data['YearsCode']
    .replace({'Less than 1 year': 0, 'More than 50 years': 51})
    .astype(float)
)

In [70]:
# Check the distinct values again: they should now all be floats (or NaN).
devs_data['YearsCode'].unique()

array([nan, 14., 20.,  8., 15.,  3.,  1.,  6., 37.,  5., 12., 22., 11.,
        4.,  7., 13., 36.,  2., 25., 10., 40., 16., 27., 24., 19.,  9.,
       17., 18., 26., 51., 29., 30., 32.,  0., 48., 45., 38., 39., 28.,
       23., 43., 21., 41., 35., 50., 33., 31., 34., 46., 44., 42., 47.,
       49.])

In [72]:
# With a numeric dtype the column can now be averaged.
devs_data['YearsCode'].mean()

12.251466760464298