In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last
# one, so a single cell below can display several results one after another.
InteractiveShell.ast_node_interactivity = 'all'

import os

import numpy as np
import pandas as pd

## Data Quality

1) Duplicate Values   
2) Missing Values   
3) Invalid Values   
4) Outliers   

#### Duplicates 

1. **duplicated** - returns a boolean Series marking each row of the DataFrame as a duplicate (True) or not (False)
2. **drop_duplicates** - return DataFrame with duplicate rows removed
3. **reset_index** - allows you to reset the index back to the default 0, 1, 2, ... integer index (pass `drop=True` to discard the old index instead of keeping it as a column)
4. **subset** - parameter of duplicated / drop_duplicates: the column label(s) to consider when identifying duplicate rows (by default all columns are used)
5. **keep**
- first : drop duplicates except for the first occurrence. 
- last : drop duplicates except for the last occurrence.
- False : drop all duplicates.

In [2]:
# Toy roster with deliberately repeated rows (positions 4, 5 and 6 repeat
# positions 1, 3 and 0) used by the duplicate-handling examples below.
student_columns = ('name', 'place', 'age', 'class')
student_rows = [
    ('pramodha', 'ongole', 23, 'cse'),
    ('deepthi', 'vijayawada', 22, 'cse'),
    ('chandana', 'tenali', 21, 'cse'),
    ('siri', 'guntur', 23, 'ece'),
    ('deepthi', 'vijayawada', 22, 'cse'),
    ('siri', 'guntur', 23, 'ece'),
    ('pramodha', 'ongole', 23, 'cse'),
]
# Transpose the row tuples into the column -> list-of-values mapping.
dic1 = {
    column: list(values)
    for column, values in zip(student_columns, zip(*student_rows))
}
df = pd.DataFrame(dic1)
df

Unnamed: 0,name,place,age,class
0,pramodha,ongole,23,cse
1,deepthi,vijayawada,22,cse
2,chandana,tenali,21,cse
3,siri,guntur,23,ece
4,deepthi,vijayawada,22,cse
5,siri,guntur,23,ece
6,pramodha,ongole,23,cse


In [3]:
# keep='first' is the default, so the first two calls flag the same rows:
# every repeat after the first occurrence.
df.duplicated()
df.duplicated(keep='first')
# keep='last' flags every occurrence except the final one instead.
df.duplicated(keep='last')

0    False
1    False
2    False
3    False
4     True
5     True
6     True
dtype: bool

0    False
1    False
2    False
3    False
4     True
5     True
6     True
dtype: bool

0     True
1     True
2    False
3     True
4    False
5    False
6    False
dtype: bool

In [4]:
# De-duplicated copies of the frame: `a` and `b` retain the first occurrence
# of each repeated row (the default), `c` retains the last occurrence.
a = df.drop_duplicates()
b = df.drop_duplicates(keep='first')
c = df.drop_duplicates(keep='last')
# Display all three; the surviving rows keep their original index labels.
a
b
c

Unnamed: 0,name,place,age,class
0,pramodha,ongole,23,cse
1,deepthi,vijayawada,22,cse
2,chandana,tenali,21,cse
3,siri,guntur,23,ece


Unnamed: 0,name,place,age,class
0,pramodha,ongole,23,cse
1,deepthi,vijayawada,22,cse
2,chandana,tenali,21,cse
3,siri,guntur,23,ece


Unnamed: 0,name,place,age,class
2,chandana,tenali,21,cse
4,deepthi,vijayawada,22,cse
5,siri,guntur,23,ece
6,pramodha,ongole,23,cse


In [5]:
# reset_index() renumbers the rows from 0 and keeps the previous labels in a
# new 'index' column (most visible for `c`, whose old labels were 2, 4, 5, 6).
a.reset_index()
b.reset_index()
c.reset_index()

Unnamed: 0,index,name,place,age,class
0,0,pramodha,ongole,23,cse
1,1,deepthi,vijayawada,22,cse
2,2,chandana,tenali,21,cse
3,3,siri,guntur,23,ece


Unnamed: 0,index,name,place,age,class
0,0,pramodha,ongole,23,cse
1,1,deepthi,vijayawada,22,cse
2,2,chandana,tenali,21,cse
3,3,siri,guntur,23,ece


Unnamed: 0,index,name,place,age,class
0,2,chandana,tenali,21,cse
1,4,deepthi,vijayawada,22,cse
2,5,siri,guntur,23,ece
3,6,pramodha,ongole,23,cse


In [6]:
# drop=True renumbers the rows from 0 and discards the old labels instead of
# storing them in an extra 'index' column.
a.reset_index(drop=True)
b.reset_index(drop=True)
c.reset_index(drop=True)

Unnamed: 0,name,place,age,class
0,pramodha,ongole,23,cse
1,deepthi,vijayawada,22,cse
2,chandana,tenali,21,cse
3,siri,guntur,23,ece


Unnamed: 0,name,place,age,class
0,pramodha,ongole,23,cse
1,deepthi,vijayawada,22,cse
2,chandana,tenali,21,cse
3,siri,guntur,23,ece


Unnamed: 0,name,place,age,class
0,chandana,tenali,21,cse
1,deepthi,vijayawada,22,cse
2,siri,guntur,23,ece
3,pramodha,ongole,23,cse


In [7]:
# Only the 'name' and 'place' columns are compared when deciding whether a
# row is a duplicate; the remaining columns are ignored for that decision.
df.drop_duplicates(subset=["name", "place"])

Unnamed: 0,name,place,age,class
0,pramodha,ongole,23,cse
1,deepthi,vijayawada,22,cse
2,chandana,tenali,21,cse
3,siri,guntur,23,ece


#### Missing Values

1. **isnull** - returns a DataFrame object where all the values are replaced with a Boolean value True for NULL values, and otherwise False
2. **notnull** - returns an object of the same shape with True where a value is NOT null and False where it is missing (the inverse of isnull)
3. **notna** - equivalent to notnull: returns a DataFrame object where all the values are replaced with a Boolean value True for non-missing (not NA / NaN) values, and otherwise False
4. **all** - returns one value for each column, True if ALL values in that column are True, otherwise False
5. **any** - returns True if ANY value in the caller object (DataFrame or Series) is True / non-zero, otherwise False; for a DataFrame it returns one value for each column by default

In [8]:
# Location of the sales dataset. The default is the original author's local
# path; set the SALES_DATA_CSV environment variable to run the notebook on
# another machine without editing this cell.
sales_csv_path = os.environ.get(
    "SALES_DATA_CSV",
    "C:\\Users\\Lenovo\\Downloads\\datasets\\Sales_Data.csv",
)
# NOTE: from here on `df` refers to the sales data, replacing the toy roster
# frame used in the duplicate-handling examples.
df = pd.read_csv(sales_csv_path)
df

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,Central America and the Caribbean,Antigua and Barbuda,Baby Food,Online,M,12/20/2013,957081544.0,01-11-2014,552,255.28,159.42,140914.56,87999.84,52914.72
1,Central America and the Caribbean,Panama,Snacks,Offline,C,07-05-2010,301644504.0,7/26/2010,2167,152.58,97.44,330640.86,211152.48,119488.38
2,Europe,Czech Republic,Beverages,Offline,C,09-12-2011,478051030.0,9/29/2011,4778,47.45,31.79,226716.10,151892.62,74823.48
3,Asia,North Korea,Cereal,Offline,L,5/13/2010,892599952.0,6/15/2010,9016,205.70,117.11,1854591.20,1055863.76,798727.44
4,Asia,Sri Lanka,Snacks,Offline,C,7/20/2015,571902596.0,7/27/2015,7542,152.58,97.44,1150758.36,734892.48,415865.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Australia and Oceania,New Zealand,Household,Offline,C,04-03-2015,217984473.0,4/20/2015,5305,668.27,502.54,3545172.35,2665974.70,879197.65
4996,Middle East and North Africa,Azerbaijan,Clothes,Offline,L,8/17/2014,169748055.0,10-02-2014,1077,109.28,35.84,117694.56,38599.68,79094.88
4997,Asia,Myanmar,Baby Food,Offline,H,11/23/2016,,12-10-2016,5204,255.28,159.42,1328477.12,829621.68,498855.44
4998,Europe,Finland,Clothes,Online,L,4/22/2014,,05-11-2014,9410,109.28,35.84,1028324.80,337254.40,691070.40


In [9]:
# Boolean mask of the whole frame: True wherever a value is missing.
null_mask = df.isnull()
null_mask
# Summing the mask counts the missing values per column.
null_mask.sum()
# The same checks restricted to a single column.
country_null = df['Country'].isnull()
country_null
country_null.sum()

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4996,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4997,False,False,False,False,False,False,True,False,False,False,False,False,False,False
4998,False,False,False,False,False,False,True,False,False,False,False,False,False,False


Region                0
Country               0
Item_Type            11
Sales_Channel         0
Order_Priority        0
Order_Date            0
Order_ID              3
Ship_Date             0
Units_Sold            0
Unit_SellingPrice     0
Unit_MakingCost       0
Total_Revenue         0
Total_Cost            0
Total_Profit          0
dtype: int64

0       False
1       False
2       False
3       False
4       False
        ...  
4995    False
4996    False
4997    False
4998    False
4999    False
Name: Country, Length: 5000, dtype: bool

0

In [10]:
# Boolean mask of the whole frame: True wherever a value is present.
present_mask = df.notnull()
present_mask
# Summing the mask counts the non-missing values per column.
present_mask.sum()
# The same checks restricted to a single column.
country_present = df['Country'].notnull()
country_present
country_present.sum()

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4996,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4997,True,True,True,True,True,True,False,True,True,True,True,True,True,True
4998,True,True,True,True,True,True,False,True,True,True,True,True,True,True


Region               5000
Country              5000
Item_Type            4989
Sales_Channel        5000
Order_Priority       5000
Order_Date           5000
Order_ID             4997
Ship_Date            5000
Units_Sold           5000
Unit_SellingPrice    5000
Unit_MakingCost      5000
Total_Revenue        5000
Total_Cost           5000
Total_Profit         5000
dtype: int64

0       True
1       True
2       True
3       True
4       True
        ... 
4995    True
4996    True
4997    True
4998    True
4999    True
Name: Country, Length: 5000, dtype: bool

5000

In [11]:
# notna() gives the same True-where-present mask as the notnull() call in the
# previous cell (compare the two outputs).
df.notna()

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4996,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4997,True,True,True,True,True,True,False,True,True,True,True,True,True,True
4998,True,True,True,True,True,True,False,True,True,True,True,True,True,True


In [12]:
null_mask = df.isnull()
country_null = df['Country'].isnull()
# Per column: is EVERY value in the column missing?
null_mask.all(axis=0)
# Is the missing-value count non-zero for every column?
null_mask.sum().all()
# Is every value in 'Country' missing?
country_null.all()
# Truthiness of the 'Country' missing-value count (0 -> False).
country_null.sum().all()

Region               False
Country              False
Item_Type            False
Sales_Channel        False
Order_Priority       False
Order_Date           False
Order_ID             False
Ship_Date            False
Units_Sold           False
Unit_SellingPrice    False
Unit_MakingCost      False
Total_Revenue        False
Total_Cost           False
Total_Profit         False
dtype: bool

False

False

False

In [13]:
null_mask = df.isnull()
country_null = df['Country'].isnull()
# Per column: is AT LEAST ONE value in the column missing?
null_mask.any(axis=0)
# Is the missing-value count non-zero for at least one column?
null_mask.sum().any()
# Is at least one value in 'Country' missing?
country_null.any()
# Truthiness of the 'Country' missing-value count (0 -> False).
country_null.sum().any()

Region               False
Country              False
Item_Type             True
Sales_Channel        False
Order_Priority       False
Order_Date           False
Order_ID              True
Ship_Date            False
Units_Sold           False
Unit_SellingPrice    False
Unit_MakingCost      False
Total_Revenue        False
Total_Cost           False
Total_Profit         False
dtype: bool

True

False

False

#### Treating Missing Values

1. **dropna** - used to remove missing values
2. **thresh** - parameter of dropna: an integer giving the minimum number of non-NA values a row (or column) must contain to be kept; rows with fewer non-NA values are dropped

In [14]:
# how='all' drops a row only when every inspected value in it is missing.
df.dropna(how='all')
# With subset, only 'Region' and 'Country' are inspected for that decision.
df.dropna(how='all', subset=['Region', 'Country'])

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,Central America and the Caribbean,Antigua and Barbuda,Baby Food,Online,M,12/20/2013,957081544.0,01-11-2014,552,255.28,159.42,140914.56,87999.84,52914.72
1,Central America and the Caribbean,Panama,Snacks,Offline,C,07-05-2010,301644504.0,7/26/2010,2167,152.58,97.44,330640.86,211152.48,119488.38
2,Europe,Czech Republic,Beverages,Offline,C,09-12-2011,478051030.0,9/29/2011,4778,47.45,31.79,226716.10,151892.62,74823.48
3,Asia,North Korea,Cereal,Offline,L,5/13/2010,892599952.0,6/15/2010,9016,205.70,117.11,1854591.20,1055863.76,798727.44
4,Asia,Sri Lanka,Snacks,Offline,C,7/20/2015,571902596.0,7/27/2015,7542,152.58,97.44,1150758.36,734892.48,415865.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Australia and Oceania,New Zealand,Household,Offline,C,04-03-2015,217984473.0,4/20/2015,5305,668.27,502.54,3545172.35,2665974.70,879197.65
4996,Middle East and North Africa,Azerbaijan,Clothes,Offline,L,8/17/2014,169748055.0,10-02-2014,1077,109.28,35.84,117694.56,38599.68,79094.88
4997,Asia,Myanmar,Baby Food,Offline,H,11/23/2016,,12-10-2016,5204,255.28,159.42,1328477.12,829621.68,498855.44
4998,Europe,Finland,Clothes,Online,L,4/22/2014,,05-11-2014,9410,109.28,35.84,1028324.80,337254.40,691070.40


Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,Central America and the Caribbean,Antigua and Barbuda,Baby Food,Online,M,12/20/2013,957081544.0,01-11-2014,552,255.28,159.42,140914.56,87999.84,52914.72
1,Central America and the Caribbean,Panama,Snacks,Offline,C,07-05-2010,301644504.0,7/26/2010,2167,152.58,97.44,330640.86,211152.48,119488.38
2,Europe,Czech Republic,Beverages,Offline,C,09-12-2011,478051030.0,9/29/2011,4778,47.45,31.79,226716.10,151892.62,74823.48
3,Asia,North Korea,Cereal,Offline,L,5/13/2010,892599952.0,6/15/2010,9016,205.70,117.11,1854591.20,1055863.76,798727.44
4,Asia,Sri Lanka,Snacks,Offline,C,7/20/2015,571902596.0,7/27/2015,7542,152.58,97.44,1150758.36,734892.48,415865.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Australia and Oceania,New Zealand,Household,Offline,C,04-03-2015,217984473.0,4/20/2015,5305,668.27,502.54,3545172.35,2665974.70,879197.65
4996,Middle East and North Africa,Azerbaijan,Clothes,Offline,L,8/17/2014,169748055.0,10-02-2014,1077,109.28,35.84,117694.56,38599.68,79094.88
4997,Asia,Myanmar,Baby Food,Offline,H,11/23/2016,,12-10-2016,5204,255.28,159.42,1328477.12,829621.68,498855.44
4998,Europe,Finland,Clothes,Online,L,4/22/2014,,05-11-2014,9410,109.28,35.84,1028324.80,337254.40,691070.40


In [15]:
# how='any' drops a row as soon as one inspected value in it is missing.
df.dropna(how='any')
# With subset, only 'Region' and 'Country' are inspected, so rows whose
# missing values sit in other columns are kept.
df.dropna(how='any', subset=['Region', 'Country'])

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,Central America and the Caribbean,Antigua and Barbuda,Baby Food,Online,M,12/20/2013,957081544.0,01-11-2014,552,255.28,159.42,140914.56,87999.84,52914.72
1,Central America and the Caribbean,Panama,Snacks,Offline,C,07-05-2010,301644504.0,7/26/2010,2167,152.58,97.44,330640.86,211152.48,119488.38
2,Europe,Czech Republic,Beverages,Offline,C,09-12-2011,478051030.0,9/29/2011,4778,47.45,31.79,226716.10,151892.62,74823.48
3,Asia,North Korea,Cereal,Offline,L,5/13/2010,892599952.0,6/15/2010,9016,205.70,117.11,1854591.20,1055863.76,798727.44
4,Asia,Sri Lanka,Snacks,Offline,C,7/20/2015,571902596.0,7/27/2015,7542,152.58,97.44,1150758.36,734892.48,415865.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4992,Middle East and North Africa,Iran,Clothes,Offline,L,07-11-2014,927583253.0,8/30/2014,1352,109.28,35.84,147746.56,48455.68,99290.88
4993,Europe,Denmark,Clothes,Offline,H,05-09-2012,713357150.0,06-03-2012,7088,109.28,35.84,774576.64,254033.92,520542.72
4994,Sub-Saharan Africa,Liberia,Cosmetics,Offline,M,6/17/2012,374724614.0,6/23/2012,8195,437.20,263.33,3582854.00,2157989.35,1424864.65
4995,Australia and Oceania,New Zealand,Household,Offline,C,04-03-2015,217984473.0,4/20/2015,5305,668.27,502.54,3545172.35,2665974.70,879197.65


Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,Central America and the Caribbean,Antigua and Barbuda,Baby Food,Online,M,12/20/2013,957081544.0,01-11-2014,552,255.28,159.42,140914.56,87999.84,52914.72
1,Central America and the Caribbean,Panama,Snacks,Offline,C,07-05-2010,301644504.0,7/26/2010,2167,152.58,97.44,330640.86,211152.48,119488.38
2,Europe,Czech Republic,Beverages,Offline,C,09-12-2011,478051030.0,9/29/2011,4778,47.45,31.79,226716.10,151892.62,74823.48
3,Asia,North Korea,Cereal,Offline,L,5/13/2010,892599952.0,6/15/2010,9016,205.70,117.11,1854591.20,1055863.76,798727.44
4,Asia,Sri Lanka,Snacks,Offline,C,7/20/2015,571902596.0,7/27/2015,7542,152.58,97.44,1150758.36,734892.48,415865.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Australia and Oceania,New Zealand,Household,Offline,C,04-03-2015,217984473.0,4/20/2015,5305,668.27,502.54,3545172.35,2665974.70,879197.65
4996,Middle East and North Africa,Azerbaijan,Clothes,Offline,L,8/17/2014,169748055.0,10-02-2014,1077,109.28,35.84,117694.56,38599.68,79094.88
4997,Asia,Myanmar,Baby Food,Offline,H,11/23/2016,,12-10-2016,5204,255.28,159.42,1328477.12,829621.68,498855.44
4998,Europe,Finland,Clothes,Online,L,4/22/2014,,05-11-2014,9410,109.28,35.84,1028324.80,337254.40,691070.40


In [16]:
# thresh=1 keeps every row that has at least one non-missing value.
df.dropna(thresh=1)

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,Central America and the Caribbean,Antigua and Barbuda,Baby Food,Online,M,12/20/2013,957081544.0,01-11-2014,552,255.28,159.42,140914.56,87999.84,52914.72
1,Central America and the Caribbean,Panama,Snacks,Offline,C,07-05-2010,301644504.0,7/26/2010,2167,152.58,97.44,330640.86,211152.48,119488.38
2,Europe,Czech Republic,Beverages,Offline,C,09-12-2011,478051030.0,9/29/2011,4778,47.45,31.79,226716.10,151892.62,74823.48
3,Asia,North Korea,Cereal,Offline,L,5/13/2010,892599952.0,6/15/2010,9016,205.70,117.11,1854591.20,1055863.76,798727.44
4,Asia,Sri Lanka,Snacks,Offline,C,7/20/2015,571902596.0,7/27/2015,7542,152.58,97.44,1150758.36,734892.48,415865.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Australia and Oceania,New Zealand,Household,Offline,C,04-03-2015,217984473.0,4/20/2015,5305,668.27,502.54,3545172.35,2665974.70,879197.65
4996,Middle East and North Africa,Azerbaijan,Clothes,Offline,L,8/17/2014,169748055.0,10-02-2014,1077,109.28,35.84,117694.56,38599.68,79094.88
4997,Asia,Myanmar,Baby Food,Offline,H,11/23/2016,,12-10-2016,5204,255.28,159.42,1328477.12,829621.68,498855.44
4998,Europe,Finland,Clothes,Online,L,4/22/2014,,05-11-2014,9410,109.28,35.84,1028324.80,337254.40,691070.40


#### Replacing Missing Values

1. **fillna** - replaces the NULL values with a specified value
- forward fill (ffill / pad) - replaces the NULL values with the value from the previous row (or previous column, if the axis parameter is set to 'columns')
- backfill (bfill) - replaces each NULL value with the next valid value from the following row (or following column, if the axis parameter is set to 'columns'); NaN values with no valid value after them stay NaN
- value - pass in a value into the value= parameter
2. **replace** - replaces the specified value with another specified value
3. **interpolate** - fills NA values in a DataFrame or Series with values estimated from the neighbouring data points (linear by default; other methods, such as polynomial, can be chosen with the method parameter)

In [17]:
# Forward fill: each missing value takes the last valid value above it in the
# same column. DataFrame.ffill() is the direct replacement for
# fillna(method='pad'), whose `method` keyword is deprecated in pandas 2.1+.
df.ffill()

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,Central America and the Caribbean,Antigua and Barbuda,Baby Food,Online,M,12/20/2013,957081544.0,01-11-2014,552,255.28,159.42,140914.56,87999.84,52914.72
1,Central America and the Caribbean,Panama,Snacks,Offline,C,07-05-2010,301644504.0,7/26/2010,2167,152.58,97.44,330640.86,211152.48,119488.38
2,Europe,Czech Republic,Beverages,Offline,C,09-12-2011,478051030.0,9/29/2011,4778,47.45,31.79,226716.10,151892.62,74823.48
3,Asia,North Korea,Cereal,Offline,L,5/13/2010,892599952.0,6/15/2010,9016,205.70,117.11,1854591.20,1055863.76,798727.44
4,Asia,Sri Lanka,Snacks,Offline,C,7/20/2015,571902596.0,7/27/2015,7542,152.58,97.44,1150758.36,734892.48,415865.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Australia and Oceania,New Zealand,Household,Offline,C,04-03-2015,217984473.0,4/20/2015,5305,668.27,502.54,3545172.35,2665974.70,879197.65
4996,Middle East and North Africa,Azerbaijan,Clothes,Offline,L,8/17/2014,169748055.0,10-02-2014,1077,109.28,35.84,117694.56,38599.68,79094.88
4997,Asia,Myanmar,Baby Food,Offline,H,11/23/2016,169748055.0,12-10-2016,5204,255.28,159.42,1328477.12,829621.68,498855.44
4998,Europe,Finland,Clothes,Online,L,4/22/2014,169748055.0,05-11-2014,9410,109.28,35.84,1028324.80,337254.40,691070.40


In [18]:
# Backward fill: each missing value takes the next valid value below it in
# the same column; missing values at the very end of a column have nothing
# below them and stay NaN. DataFrame.bfill() is the direct replacement for
# fillna(method='bfill'), whose `method` keyword is deprecated in pandas 2.1+.
df.bfill()

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,Central America and the Caribbean,Antigua and Barbuda,Baby Food,Online,M,12/20/2013,957081544.0,01-11-2014,552,255.28,159.42,140914.56,87999.84,52914.72
1,Central America and the Caribbean,Panama,Snacks,Offline,C,07-05-2010,301644504.0,7/26/2010,2167,152.58,97.44,330640.86,211152.48,119488.38
2,Europe,Czech Republic,Beverages,Offline,C,09-12-2011,478051030.0,9/29/2011,4778,47.45,31.79,226716.10,151892.62,74823.48
3,Asia,North Korea,Cereal,Offline,L,5/13/2010,892599952.0,6/15/2010,9016,205.70,117.11,1854591.20,1055863.76,798727.44
4,Asia,Sri Lanka,Snacks,Offline,C,7/20/2015,571902596.0,7/27/2015,7542,152.58,97.44,1150758.36,734892.48,415865.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Australia and Oceania,New Zealand,Household,Offline,C,04-03-2015,217984473.0,4/20/2015,5305,668.27,502.54,3545172.35,2665974.70,879197.65
4996,Middle East and North Africa,Azerbaijan,Clothes,Offline,L,8/17/2014,169748055.0,10-02-2014,1077,109.28,35.84,117694.56,38599.68,79094.88
4997,Asia,Myanmar,Baby Food,Offline,H,11/23/2016,,12-10-2016,5204,255.28,159.42,1328477.12,829621.68,498855.44
4998,Europe,Finland,Clothes,Online,L,4/22/2014,,05-11-2014,9410,109.28,35.84,1028324.80,337254.40,691070.40


In [19]:
# Replace every missing value, in every column, with the same placeholder
# (this includes the numeric Order_ID column, as the output shows).
df.fillna(value='not sure')

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,Central America and the Caribbean,Antigua and Barbuda,Baby Food,Online,M,12/20/2013,957081544.0,01-11-2014,552,255.28,159.42,140914.56,87999.84,52914.72
1,Central America and the Caribbean,Panama,Snacks,Offline,C,07-05-2010,301644504.0,7/26/2010,2167,152.58,97.44,330640.86,211152.48,119488.38
2,Europe,Czech Republic,Beverages,Offline,C,09-12-2011,478051030.0,9/29/2011,4778,47.45,31.79,226716.10,151892.62,74823.48
3,Asia,North Korea,Cereal,Offline,L,5/13/2010,892599952.0,6/15/2010,9016,205.70,117.11,1854591.20,1055863.76,798727.44
4,Asia,Sri Lanka,Snacks,Offline,C,7/20/2015,571902596.0,7/27/2015,7542,152.58,97.44,1150758.36,734892.48,415865.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Australia and Oceania,New Zealand,Household,Offline,C,04-03-2015,217984473.0,4/20/2015,5305,668.27,502.54,3545172.35,2665974.70,879197.65
4996,Middle East and North Africa,Azerbaijan,Clothes,Offline,L,8/17/2014,169748055.0,10-02-2014,1077,109.28,35.84,117694.56,38599.68,79094.88
4997,Asia,Myanmar,Baby Food,Offline,H,11/23/2016,not sure,12-10-2016,5204,255.28,159.42,1328477.12,829621.68,498855.44
4998,Europe,Finland,Clothes,Online,L,4/22/2014,not sure,05-11-2014,9410,109.28,35.84,1028324.80,337254.40,691070.40


In [20]:
# Fill the missing order IDs of a single column with a fixed placeholder.
# NOTE(review): the placeholder is a string while the column is numeric, so
# the result is an object-dtype Series; consider a numeric placeholder if the
# column should stay numeric.
df['Order_ID'].fillna(value='9515842735')

0       957081544.0
1       301644504.0
2       478051030.0
3       892599952.0
4       571902596.0
           ...     
4995    217984473.0
4996    169748055.0
4997     9515842735
4998     9515842735
4999     9515842735
Name: Order_ID, Length: 5000, dtype: object

In [21]:
# Fill the missing values with the mean of the values that are present
# (mean() skips NaN by default). The result is only displayed; df is unchanged.
order_id_mean = df['Order_ID'].mean()
df['Order_ID'].fillna(order_id_mean)

0       9.570815e+08
1       3.016445e+08
2       4.780510e+08
3       8.926000e+08
4       5.719026e+08
            ...     
4995    2.179845e+08
4996    1.697481e+08
4997    5.485477e+08
4998    5.485477e+08
4999    5.485477e+08
Name: Order_ID, Length: 5000, dtype: float64

In [22]:
# Same mean fill as the previous cell, expressed with replace().
# NOTE(review): unlike the earlier cells this one writes the result back into
# df, so every later cell sees an Order_ID column without NaN.
order_id_mean = df['Order_ID'].mean()
df['Order_ID'] = df['Order_ID'].replace(np.nan, order_id_mean)
df['Order_ID']

0       9.570815e+08
1       3.016445e+08
2       4.780510e+08
3       8.926000e+08
4       5.719026e+08
            ...     
4995    2.179845e+08
4996    1.697481e+08
4997    5.485477e+08
4998    5.485477e+08
4999    5.485477e+08
Name: Order_ID, Length: 5000, dtype: float64

In [23]:
# Fill missing values by interpolating between neighbouring valid values.
# NOTE(review): the preceding cell already overwrote the NaNs in Order_ID with
# the column mean, so presumably nothing is left to interpolate here; the
# displayed tail still shows the mean-filled values.
df['Order_ID'].interpolate()

0       9.570815e+08
1       3.016445e+08
2       4.780510e+08
3       8.926000e+08
4       5.719026e+08
            ...     
4995    2.179845e+08
4996    1.697481e+08
4997    5.485477e+08
4998    5.485477e+08
4999    5.485477e+08
Name: Order_ID, Length: 5000, dtype: float64

In [24]:
# Interpolate with a second-order polynomial instead of the default method.
# NOTE(review): Order_ID was already mean-filled by an earlier cell, so the
# displayed tail is identical to that cell's output; confirm this is intended.
df['Order_ID'].interpolate(method='polynomial', order=2)

0       9.570815e+08
1       3.016445e+08
2       4.780510e+08
3       8.926000e+08
4       5.719026e+08
            ...     
4995    2.179845e+08
4996    1.697481e+08
4997    5.485477e+08
4998    5.485477e+08
4999    5.485477e+08
Name: Order_ID, Length: 5000, dtype: float64