# Import the necessary libraries

In [None]:
import numpy as np
import pandas as pd

# Import the dataset

In [None]:
# Load the tab-separated file (hence sep='\t'). The path is relative, so it is
# resolved against the current working directory.
Dataset = pd.read_csv('OneDrive/Desktop/Week 3 - Data Cleaning (Pandas) - 2/chipotle.tsv',sep='\t')

# Knowing our dataset

In [None]:
# Preview the first 10 rows of the loaded data
Dataset.head(10)

# Checking datatypes of each column

In [None]:
# Print a summary of the columns, their non-null counts and their dtypes
Dataset.info()

# Handling missing values
As we can see, there are around 1400 missing values in the choice_description column. choice_description is a categorical column, so we cannot replace those null values with the mean or median of the column. After studying the dataset, we can see that the null values in choice_description occur only for those items in item_name for which no choices are available. Therefore, we can replace the null values with the string value 'No choice'.

In [None]:
# Count the missing values in each column
Dataset.isnull().sum()

In [None]:
# Replace missing choice descriptions with the placeholder string 'No choice'
Dataset['choice_description'] = Dataset['choice_description'].fillna('No choice')

# Handling inconsistency in data
Now, if we check the dtype of item_price, we can see that it is object because there is a dollar sign attached to the price value. If we want to perform some statistical analysis, we have to change the dtype of the item_price column to 'float64'. We also have to remove the dollar sign. We can use the pandas `str.replace` method to do that. To change the dtype we use the 'astype' method.

In [None]:
# '$' is the end-of-string anchor in a regular expression, so ask for a literal
# (non-regex) replacement explicitly instead of relying on the pandas default
# for `regex`. The cleaned strings are then converted to floats.
Dataset['item_price'] = Dataset['item_price'].str.replace('$', '', regex=False).astype(float)

# Analyzing choice_description column
If we look at the choice_description column, we can clearly see that there are multiple choices for a single item which should be handled properly for accurate analysis. What we can do in this case is create a column for every unique item in choice_description column and mark their presence by 0 or 1 according to their corresponding value in choice_description column just like one-hot encoding.

In [None]:
def extract_items(choice_description):
    """Split a bracketed, comma-separated choice string into item names.

    Args:
        choice_description: The raw choice text, e.g. "[Salsa, [Rice, Beans]]",
            or a missing value (None / NaN).

    Returns:
        A list of item names with surrounding spaces and square brackets
        removed. Missing input yields an empty list, and tokens that are
        empty after stripping are left out.
    """
    if pd.isnull(choice_description):
        return []
    # Brackets may be nested; stripping them from every comma-separated token
    # flattens the description into a single list of names.
    tokens = (token.strip(" []") for token in choice_description.split(","))
    # Drop empty tokens (from "[]" or a trailing comma) so they do not turn
    # into an item with a blank name.
    return [token for token in tokens if token]
# Parse every choice description into a list of individual item names
Dataset['parsed_items'] = Dataset['choice_description'].apply(extract_items)
unique_items = set(item for sublist in Dataset['parsed_items'] for item in sublist)
# Iterate in sorted order so the indicator columns are created in the same
# order on every run; the iteration order of a set of strings is not stable
# across interpreter sessions.
for item in sorted(unique_items):
    # 1 when the row's parsed choices contain this item, otherwise 0
    Dataset[item] = Dataset['parsed_items'].apply(lambda x: 1 if item in x else 0)
# The raw text and the intermediate list are no longer needed once encoded
Dataset.drop(['choice_description', 'parsed_items'], axis=1, inplace=True)

# Data integrity check for order_id column
To cross-reference the Order ID column for integrity, we can check for irregularities or patterns by examining the unique values in the 'order_id' column. 

In [None]:
# Count the rows that have no order ID at all
missing_order_ids = Dataset['order_id'].isna().sum()

# Distinct values present in the 'order_id' column
unique_order_ids = Dataset['order_id'].unique()

# Print every ID with the length of its string form, so IDs that break the
# usual pattern or length are easy to spot
for order_id in unique_order_ids:
    print(f"Order ID: {order_id}, Length: {len(str(order_id))}")

# Report the missing-value count after the per-ID listing
print(f"Number of missing Order IDs: {missing_order_ids}")

# Item name standardization
To standardize the Item Name column and unify variations for better analysis, we can apply text processing techniques to clean and standardize the item names.
The code performs the following operations:
1. Converts all item names to lowercase for consistency.
2. Removes leading and trailing whitespace.
3. Replaces hyphens with spaces.

In [None]:
# Normalize item names in one pass: lowercase, trim surrounding whitespace,
# then turn hyphens into spaces.
Dataset['item_name'] = (
    Dataset['item_name']
    .str.lower()
    .str.strip()
    .str.replace('-', ' ')
)

# Relationship between Quantity and Item_price
To perform a data integrity check to ensure that quantities and prices align with the corresponding items and descriptions, we can examine the dataset to identify any discrepancies. This code checks for records where the product of quantity and item price does not match the total price, highlighting potential inconsistencies. This analysis helps to identify records where the calculated total price does not align with the given quantity and item price.

In [None]:
# Derive the total for each row from its quantity and item price
Dataset['total_price'] = Dataset['quantity'] * Dataset['item_price']

# Check for inconsistencies between quantities and prices.
# total_price is derived on the line above, so comparing it with itself could
# only ever match rows where it is NaN. Flag those rows explicitly, together
# with rows whose quantity is not positive or whose price is negative, since
# their totals cannot be trusted.
inconsistent_records = Dataset[
    Dataset['total_price'].isnull()
    | (Dataset['quantity'] <= 0)
    | (Dataset['item_price'] < 0)
]

# Display inconsistent records
print("Inconsistent Records:")
print(inconsistent_records)