# Handling Missing Values

In [2]:
import pandas as pd 
import numpy as np

In [3]:
# Load the ramen ratings dataset from the CSV file next to this notebook.
df = pd.read_csv('ramen-ratings.csv')

# Preview the first five rows.
df.head()

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1.0,
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,
4,2576,Ching's Secret,Singapore Curry,Pack,India,3.75,


In [4]:
# Boolean mask: True where 'Top Ten' holds a value, False where it is NaN.
df['Top Ten'].notna()

0       False
1       False
2       False
3       False
4       False
        ...  
2575    False
2576    False
2577    False
2578    False
2579    False
Name: Top Ten, Length: 2580, dtype: bool

In [5]:
# Number of NaN values in each column (sum runs down the rows by default).
df.isna().sum()

Review #       0
Brand          0
Variety        0
Style          2
Country        0
Stars          0
Top Ten     2539
dtype: int64

In [6]:
df.shape  # (number of rows, number of columns)

(2580, 7)

In [7]:
# Fraction of NaN values in each column: the mean of a boolean mask is
# the count of True values divided by the number of rows.
df.isna().mean()

Review #    0.000000
Brand       0.000000
Variety     0.000000
Style       0.000775
Country     0.000000
Stars       0.000000
Top Ten     0.984109
dtype: float64

Note that if we dropped the rows where `Top Ten` is null, almost all of the data (about 98% of the rows) would be dropped, whereas dropping the rows where `Style` is null would remove only 2 rows.

## dropna()

Drop the rows that contain NaN

In [8]:
# Drop every row that has at least one NaN in any column.
# Returns a new frame; df itself is left unchanged.
df.dropna(axis='index', how='any')

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
616,1964,MAMA,Instant Noodles Coconut Milk Flavour,Pack,Myanmar,5.0,2016 #10
633,1947,Prima Taste,Singapore Laksa Wholegrain La Mian,Pack,Singapore,5.0,2016 #1
655,1925,Prima,Juzz's Mee Creamy Chicken Flavour,Pack,Singapore,5.0,2016 #8
673,1907,Prima Taste,Singapore Curry Wholegrain La Mian,Pack,Singapore,5.0,2016 #5
752,1828,Tseng Noodles,Scallion With Sichuan Pepper Flavor,Pack,Taiwan,5.0,2016 #9
891,1689,Wugudaochang,Tomato Beef Brisket Flavor Purple Potato Noodle,Pack,China,5.0,2016 #7
942,1638,A-Sha Dry Noodle,Veggie Noodle Tomato Noodle With Vine Ripened ...,Pack,Taiwan,5.0,2015 #10
963,1617,MyKuali,Penang Hokkien Prawn Noodle (New Improved Taste),Pack,Malaysia,5.0,2015 #7
995,1585,CarJEN,Nyonya Curry Laksa,Pack,Malaysia,5.0,2015 #4
1059,1521,Maruchan,Gotsumori Sauce Yakisoba,Tray,Japan,5.0,2015 #9


In [9]:
# Size of the frame that remains once every row containing a NaN is removed.
df.dropna(axis='index').shape

(41, 7)

Drop the columns that contain NaN

In [10]:
# Drop every column that contains at least one NaN
# (here: the Style column and the Top Ten column).
df.dropna(axis=1)

Unnamed: 0,Review #,Brand,Variety,Country,Stars
0,2580,New Touch,T's Restaurant Tantanmen,Japan,3.75
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Taiwan,1
2,2578,Nissin,Cup Noodles Chicken Vegetable,USA,2.25
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Taiwan,2.75
4,2576,Ching's Secret,Singapore Curry,India,3.75
...,...,...,...,...,...
2575,5,Vifon,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty...",Vietnam,3.5
2576,4,Wai Wai,Oriental Style Instant Noodles,Thailand,1
2577,3,Wai Wai,Tom Yum Shrimp,Thailand,2
2578,2,Wai Wai,Tom Yum Chili Flavor,Thailand,2


In [11]:
# Drop only the rows whose 'Style' value is NaN; NaN in other columns
# (e.g. 'Top Ten') does not cause a row to be dropped.
df.dropna(axis=0, subset=['Style'])

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1,
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,
4,2576,Ching's Secret,Singapore Curry,Pack,India,3.75,
...,...,...,...,...,...,...,...
2575,5,Vifon,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty...",Bowl,Vietnam,3.5,
2576,4,Wai Wai,Oriental Style Instant Noodles,Pack,Thailand,1,
2577,3,Wai Wai,Tom Yum Shrimp,Pack,Thailand,2,
2578,2,Wai Wai,Tom Yum Chili Flavor,Pack,Thailand,2,


## fillna()

In [12]:
# Replace every NaN, in every column, with 0. This also puts the integer 0
# into text columns such as 'Style'. df is not modified; the result is df1.
df1 = df.fillna(value=0)
df1

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,0
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1,0
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,0
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,0
4,2576,Ching's Secret,Singapore Curry,Pack,India,3.75,0
...,...,...,...,...,...,...,...
2575,5,Vifon,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty...",Bowl,Vietnam,3.5,0
2576,4,Wai Wai,Oriental Style Instant Noodles,Pack,Thailand,1,0
2577,3,Wai Wai,Tom Yum Shrimp,Pack,Thailand,2,0
2578,2,Wai Wai,Tom Yum Chili Flavor,Pack,Thailand,2,0


In [13]:
# Rows of the original frame where 'Style' is NaN.
df[df['Style'].isna()]

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
2152,428,Kamfen,E Menm Chicken,,China,3.75,
2442,138,Unif,100 Furong Shrimp,,Taiwan,3.0,


In [14]:
# After fillna(0) no NaN is left in 'Style', so this selection is empty.
df1[df1['Style'].isna()]

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten


In [15]:
# The rows that used to have NaN in 'Style' now hold the fill value 0.
df1[df1['Style'].eq(0)]

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
2152,428,Kamfen,E Menm Chicken,0,China,3.75,0
2442,138,Unif,100 Furong Shrimp,0,Taiwan,3.0,0


To replace NaN in the `Style` column with one value and NaN in the `Top Ten` column with a different value, pass `fillna` a dict that maps each column name to its fill value:

In [16]:
# Per-column fill values: a label for missing 'Style', a sentinel for
# missing 'Top Ten'. Columns not listed in the dict are left as they are.
df2 = df.fillna(
    value={
        'Style': "Not Available",
        'Top Ten': -1,
    }
)
df2

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,-1
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1,-1
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,-1
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,-1
4,2576,Ching's Secret,Singapore Curry,Pack,India,3.75,-1
...,...,...,...,...,...,...,...
2575,5,Vifon,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty...",Bowl,Vietnam,3.5,-1
2576,4,Wai Wai,Oriental Style Instant Noodles,Pack,Thailand,1,-1
2577,3,Wai Wai,Tom Yum Shrimp,Pack,Thailand,2,-1
2578,2,Wai Wai,Tom Yum Chili Flavor,Pack,Thailand,2,-1


In [17]:
# Rows whose 'Style' was filled with the "Not Available" label.
df2[df2['Style'].eq('Not Available')]

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
2152,428,Kamfen,E Menm Chicken,Not Available,China,3.75,-1
2442,138,Unif,100 Furong Shrimp,Not Available,Taiwan,3.0,-1


In [18]:
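# df.ffill() fills each NaN with the previous valid value in the same column
#   (older spelling: df.fillna(method='ffill'); the `method` argument is deprecated in recent pandas)
# df.bfill() fills each NaN with the next valid value in the same column
#   (older spelling: df.fillna(method='bfill'))
# These methods are mainly useful for ordered data such as time series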

In [19]:
# How many times each distinct 'Style' value occurs (NaN is excluded by default).
df['Style'].value_counts(dropna=True)

Pack    1531
Bowl     481
Cup      450
Tray     108
Box        6
Can        1
Bar        1
Name: Style, dtype: int64

In [45]:
# How many times each distinct 'Stars' value occurs. Separate entries such as
# 5, 5.0 and 5.00 in the output indicate the ratings are not yet numeric.
# NOTE(review): the saved output has no 'Unrated' entry, which suggests this
# cell was last run after the cleaning cells that follow it — confirm with
# Restart Kernel -> Run All.
df['Stars'].value_counts(dropna=True)

4        384
5        369
3.75     350
3.5      326
3        173
3.25     170
4.25     143
4.5      132
2.75      85
2         68
2.5       67
4.75      64
1.5       37
1.75      27
1         26
0         26
2.25      21
0.5       14
0.25      11
1.25      10
5.0       10
3.50       9
5.00       7
4.00       6
4.3        4
3.8        3
4.0        3
4.50       3
0          3
1.1        2
2.8        2
2.9        2
4.125      2
2.3        2
3.1        2
3.0        2
2.1        1
0.9        1
0.75       1
3.125      1
2.125      1
3.00       1
0.1        1
3.3        1
3.7        1
3.4        1
3.6        1
2.85       1
3.2        1
3.65       1
1.8        1
Name: Stars, dtype: int64

In [27]:
# Turn the 'Unrated' placeholder into a real missing value (NaN) so the
# missing-value tools can handle it. The column is overwritten in df.
df['Stars'] = df['Stars'].replace({'Unrated': np.nan})
df['Stars']

0       3.75
1          1
2       2.25
3       2.75
4       3.75
        ... 
2575     3.5
2576       1
2577       2
2578       2
2579     0.5
Name: Stars, Length: 2580, dtype: object

In [38]:
# Remove the rows whose 'Stars' value is NaN. This modifies df in place,
# so df now has fewer rows than when it was first loaded.
df.dropna(axis=0, subset=['Stars'], inplace=True)
df

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1,
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,
4,2576,Ching's Secret,Singapore Curry,Pack,India,3.75,
...,...,...,...,...,...,...,...
2575,5,Vifon,"Hu Tiu Nam Vang [""Phnom Penh"" style] Asian Sty...",Bowl,Vietnam,3.5,
2576,4,Wai Wai,Oriental Style Instant Noodles,Pack,Thailand,1,
2577,3,Wai Wai,Tom Yum Shrimp,Pack,Thailand,2,
2578,2,Wai Wai,Tom Yum Chili Flavor,Pack,Thailand,2,


In [46]:
# Cast the 'Stars' column to floating point and store it back in df.
df['Stars'] = df['Stars'].astype('float64')