# Handling missing values

## fillna fills NaN values using the specified method

## By default: `DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None)`

## value:
The value used to fill the missing (null) entries.
It can be a scalar, dict, Series, or DataFrame.

## method:

Method to use for filling holes in reindexed Series.

**pad / ffill**: propagate last valid observation forward to next.
(Top To Bottom)

**backfill / bfill**: use next valid observation to fill gap.
(Bottom To Top)

## axis:
The axis along which missing values are filled. It takes an int or a string value:

**rows**: Input can be 0 or ‘index’.

**columns**: Input can be 1 or ‘columns’.

## inplace:
It is a boolean; if True, the changes are made in the DataFrame itself instead of returning a new one.

## limit:
int, default None

If the method is specified, this is the maximum number of consecutive NaN values to forward/backward fill.

Must be greater than 0 if not None.

In [1]:
import pandas as pd

In [7]:
# Load the sample marks table. skipfooter=1 drops the last line of the file;
# that option is not supported by the C parser, hence engine='python'.
data = pd.read_csv('data_m.csv', engine='python', skipfooter=1)
data

Unnamed: 0,Name,Marks,Grades
0,Priyang,98.0,
1,Aadhya,,AB
2,Krisha,99.0,AA
3,Vedant,87.0,
4,Parshv,90.0,AC
5,Mittal,,BA
6,Archana,82.0,BB


## Fill all NaN values with 0

In [29]:
# Replace every missing entry, in every column, with the scalar 0
# (returns a new DataFrame; `data` itself is left unchanged)
data.fillna(value = 0)

Unnamed: 0,Name,Marks,Grades
0,Priyang,98.0,0
1,Aadhya,0.0,AB
2,Krisha,99.0,AA
3,Vedant,87.0,0
4,Parshv,90.0,AC
5,Mittal,0.0,BA
6,Archana,82.0,BB


## Fill all NaN values with the mean value

In [24]:
# Small city/temperature table with deliberate gaps in 'Temp'
# (the None entries become NaN in the numeric column)
df = pd.DataFrame(
    dict(
        City=['Tonalá', 'Guadalajara', 'Nueva Escocia', 'Nueva York',
              'Delhi', 'San Luis Potosí'],
        Temp=[34.0, None, None, 40.0, None, 45.0],
    )
)
df

# Save the table to CSV without writing the index column
df.to_csv('data_new.csv', index=False)

In [30]:
# Fill the gaps with the mean of the observed temperatures
# (the mean is computed over the non-missing values only)
df.fillna(value = df['Temp'].mean())

Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,39.666667
2,Nueva Escocia,39.666667
3,Nueva York,40.0
4,Delhi,39.666667
5,San Luis Potosí,45.0


## Fill all NaN elements in columns 'Marks' and 'Grades' with 97 and 'AA' respectively

In [18]:
# A dict maps each column name to its own replacement value
data.fillna({'Marks':97, 'Grades':'AA'})

Unnamed: 0,Name,Marks,Grades
0,Priyang,98.0,AA
1,Aadhya,97.0,AB
2,Krisha,99.0,AA
3,Vedant,87.0,AA
4,Parshv,90.0,AC
5,Mittal,97.0,BA
6,Archana,82.0,BB


## Propagate non-null values forward (fill each NaN with the last valid observation, from top to bottom)

In [28]:
# Reload the city/temperature table from the CSV file
df = pd.read_csv('data_new.csv')
df

Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,
2,Nueva Escocia,
3,Nueva York,40.0
4,Delhi,
5,San Luis Potosí,45.0


In [33]:
# Forward fill through the `method` argument: each NaN takes the last valid value above it
# NOTE(review): recent pandas versions deprecate `method=` in fillna — confirm against the installed version
df.fillna(method = 'ffill')

  df.fillna(method = 'ffill')


Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,34.0
2,Nueva Escocia,34.0
3,Nueva York,40.0
4,Delhi,40.0
5,San Luis Potosí,45.0


In [34]:
# This can also be done this way ('pad' is another name for 'ffill')
df.fillna(method = 'pad')

  df.fillna(method = 'pad')


Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,34.0
2,Nueva Escocia,34.0
3,Nueva York,40.0
4,Delhi,40.0
5,San Luis Potosí,45.0


In [36]:
# The two previous forms are not recommended; it is better to use this one
df.ffill()

Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,34.0
2,Nueva Escocia,34.0
3,Nueva York,40.0
4,Delhi,40.0
5,San Luis Potosí,45.0


In [49]:
# By default, the fill is done along the rows (axis = 'index')
df.ffill(axis = 'index')

Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,34.0
2,Nueva Escocia,34.0
3,Nueva York,40.0
4,Delhi,40.0
5,San Luis Potosí,45.0


In [50]:
# But it can also be done along the columns, in this case from left to right
df.ffill(axis = 'columns')

Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,Guadalajara
2,Nueva Escocia,Nueva Escocia
3,Nueva York,40.0
4,Delhi,Delhi
5,San Luis Potosí,45.0


## Propagate non-null values backwards (fill each NaN with the next valid observation, from bottom to top)

In [37]:
# Backward fill through the `method` argument: each NaN takes the next valid value below it
# NOTE(review): recent pandas versions deprecate `method=` in fillna — confirm against the installed version
df.fillna(method = 'backfill')

  df.fillna(method = 'backfill')


Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,40.0
2,Nueva Escocia,40.0
3,Nueva York,40.0
4,Delhi,45.0
5,San Luis Potosí,45.0


In [38]:
# It can also be done this way ('bfill' is another name for 'backfill')
df.fillna(method = 'bfill')

  df.fillna(method = 'bfill')


Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,40.0
2,Nueva Escocia,40.0
3,Nueva York,40.0
4,Delhi,45.0
5,San Luis Potosí,45.0


In [39]:
# The two previous forms are not recommended; it is better to use this one
df.bfill()

Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,40.0
2,Nueva Escocia,40.0
3,Nueva York,40.0
4,Delhi,45.0
5,San Luis Potosí,45.0


In [52]:
# By default, the fill is done along the rows (axis = 'index')
data.bfill(axis = 'index')

Unnamed: 0,Name,Marks,Grades
0,Priyang,98.0,AB
1,Aadhya,99.0,AB
2,Krisha,99.0,AA
3,Vedant,87.0,AC
4,Parshv,90.0,AC
5,Mittal,82.0,BA
6,Archana,82.0,BB


In [53]:
# But it can also be done along the columns, from right to left
data.bfill(axis = 'columns')

Unnamed: 0,Name,Marks,Grades
0,Priyang,98.0,
1,Aadhya,AB,AB
2,Krisha,99.0,AA
3,Vedant,87.0,
4,Parshv,90.0,AC
5,Mittal,BA,BA
6,Archana,82.0,BB


## Inplace

In [55]:
# inplace=True applies the changes to the DataFrame itself instead of
# returning a new one
df1 = pd.read_csv('data_new.csv')
# Take the mean from df1's own 'Temp' column so this cell does not depend on
# the state of `df` left over from earlier cells
df1.fillna(value=df1['Temp'].mean(), inplace=True)
df1

Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,39.666667
2,Nueva Escocia,39.666667
3,Nueva York,40.0
4,Delhi,39.666667
5,San Luis Potosí,45.0


## limit
## Fill at most `limit` consecutive NaN elements in each gap

In [58]:
# Fill at most one consecutive NaN after each valid value;
# any further NaN in the same gap is left as NaN
df.ffill(limit = 1)

Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,34.0
2,Nueva Escocia,
3,Nueva York,40.0
4,Delhi,40.0
5,San Luis Potosí,45.0


In [60]:
# Fill at most two consecutive NaN values after each valid value
df.ffill(limit = 2)

Unnamed: 0,City,Temp
0,Tonalá,34.0
1,Guadalajara,34.0
2,Nueva Escocia,34.0
3,Nueva York,40.0
4,Delhi,40.0
5,San Luis Potosí,45.0
