# Data Cleaning

## Different ways to clean the data

In [2]:
import pandas as pd 

In [3]:
# Load the tab-separated order data (read_table uses a tab delimiter by default).
order_details = pd.read_table("data.tsv.txt")

In [4]:
# Plain-text dump of the frame; pandas abbreviates the middle rows with "...".
print(order_details)

      order_id  quantity                              item_name  \
0            1         1           Chips and Fresh Tomato Salsa   
1            1         1                                   Izze   
2            1         1                       Nantucket Nectar   
3            1         1  Chips and Tomatillo-Green Chili Salsa   
4            2         2                           Chicken Bowl   
...        ...       ...                                    ...   
4617      1833         1                          Steak Burrito   
4618      1833         1                          Steak Burrito   
4619      1834         1                     Chicken Salad Bowl   
4620      1834         1                     Chicken Salad Bowl   
4621      1834         1                     Chicken Salad Bowl   

                                     choice_description item_price  
0                                                   NaN     $2.39   
1                                          [Clementine]  

In [5]:
# Check the column dtypes before cleaning; item_price is still object here.
order_details.dtypes

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

### Converting item_price from object dtype (strings) to float


In [11]:
# Use Series.str.replace rather than Series.replace: str.replace substitutes a substring inside each value, whereas replace (without regex=True) only matches a cell's entire value.

In [6]:
# Strip the "$" prefix and convert item_price to float.
# - regex=False states explicitly that "$" is a literal character (as a regex
#   it is the end-of-string anchor). The default of `regex` changed in
#   pandas 2.0, so being explicit gives the same result on every version.
# - astype(str) first keeps this cell re-runnable: once the column is already
#   float, the .str accessor would otherwise raise AttributeError.
order_details["item_price"] = (
    order_details["item_price"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .astype(float)
)

In [7]:
# Re-print the frame to confirm item_price no longer carries the "$" prefix.
print(order_details)

      order_id  quantity                              item_name  \
0            1         1           Chips and Fresh Tomato Salsa   
1            1         1                                   Izze   
2            1         1                       Nantucket Nectar   
3            1         1  Chips and Tomatillo-Green Chili Salsa   
4            2         2                           Chicken Bowl   
...        ...       ...                                    ...   
4617      1833         1                          Steak Burrito   
4618      1833         1                          Steak Burrito   
4619      1834         1                     Chicken Salad Bowl   
4620      1834         1                     Chicken Salad Bowl   
4621      1834         1                     Chicken Salad Bowl   

                                     choice_description  item_price  
0                                                   NaN        2.39  
1                                          [Clementine]

In [8]:
# Confirm the conversion: item_price should now report float64.
order_details.dtypes

order_id                int64
quantity                int64
item_name              object
choice_description     object
item_price            float64
dtype: object

In [9]:
order_details.describe()  # item_price is now included because its dtype is float64, no longer object

Unnamed: 0,order_id,quantity,item_price
count,4622.0,4622.0,4622.0
mean,927.254868,1.075725,7.464336
std,528.890796,0.410186,4.245557
min,1.0,1.0,1.09
25%,477.25,1.0,3.39
50%,926.0,1.0,8.75
75%,1393.0,1.0,9.25
max,1834.0,15.0,44.25


### Task 1: Top 5 selling items and their individual revenue

In [20]:
# Revenue per item (sum of item_price), highest first; head() keeps the top 5.
(
    order_details
    .groupby(by="item_name")["item_price"]
    .sum()
    .sort_values(ascending=False)
    .head()
)

item_name
Chicken Bowl           7342.73
Chicken Burrito        5575.82
Steak Burrito          3851.43
Steak Bowl             2260.19
Chips and Guacamole    2201.04
Name: item_price, dtype: float64

In [22]:
# Total item_price per item, then the five largest totals in descending order.
revenue_by_item = order_details.groupby(by="item_name")["item_price"].sum()
top5_individual = revenue_by_item.sort_values(ascending=False).head()

In [23]:
# Show the five highest-revenue items stored in top5_individual.
print(top5_individual)

item_name
Chicken Bowl           7342.73
Chicken Burrito        5575.82
Steak Burrito          3851.43
Steak Bowl             2260.19
Chips and Guacamole    2201.04
Name: item_price, dtype: float64


### Task 2: Top 5 selling items and their total revenue

In [21]:
# Combined revenue of the five highest-revenue items.
(
    order_details
    .groupby(by="item_name")["item_price"]
    .sum()
    .sort_values(ascending=False)
    .head()
    .sum()
)

np.float64(21231.21)

In [24]:
# Sum the per-item revenues already held in top5_individual instead of
# repeating the whole groupby/sort pipeline a second time.
top5_total = top5_individual.sum()

In [28]:
# Report the combined revenue of the five highest-revenue items.
print(
    f"The total revenue earned by the top 5 selling items are ${top5_total}"
)

The total revenue earned by the top 5 selling items are $21231.21


### Task 3: Create a Series and a DataFrame

In [6]:
# Series Creation 

In [5]:
# Mixing ints, a float and a string in one Series yields dtype object.
pd.Series([10, 24, 42, 121, 12.12, "Aas"])

0       10
1       24
2       42
3      121
4    12.12
5      Aas
dtype: object

In [7]:
# DataFrame Creation

In [13]:
# Build a small frame from a dict of equal-length columns (one key per column).
pd.DataFrame(
    {
        "Players": ["Messi", "Ronaldo", "Neymar", "Mbappe", "Haaland", "Kane"],
        "Club": ["Inter Miami", "Al Nassr", "Santos", "Real Madrid", "Man City", "Bayern"],
        "Country": ["Argentina", "Portugal", "Brazil", "France", "Norway", "England"],
        "Main Foot": ["Left", "Right", "Right", "Right", "Left", "Right"],
        "Positions": ["CAM,RW", "ST", "LW,CAM", "LW,ST", "ST", "ST"],
    }
)

Unnamed: 0,Players,Club,Country,Main Foot,Positions
0,Messi,Inter Miami,Argentina,Left,"CAM,RW"
1,Ronaldo,Al Nassr,Portugal,Right,ST
2,Neymar,Santos,Brazil,Right,"LW,CAM"
3,Mbappe,Real Madrid,France,Right,"LW,ST"
4,Haaland,Man City,Norway,Left,ST
5,Kane,Bayern,England,Right,ST
