# **Mini-Project-2:**

## Data Wrangling Challenges (Levels 1–10)
### Goal: Practice cleaning, transforming, and organizing data step-by-step.

In [113]:
import pandas as pd
import matplotlib.pyplot as plot

In [114]:
file_path = r"C:\Users\rohan\OneDrive\Desktop\Jupyter Projects\Notes\CSV Files\messy_data_wrangling_project.csv"
df = pd.read_csv(file_path)

In [115]:
df.head()

Unnamed: 0,Customer Name,Order Date,SALES,Quantity Ordered,Returned?,Extra Column
0,Alice,2023/01/15,1000.0,2,Yes,xyz
1,Bob,15-02-2023,850.0,3,no,xyz
2,alice,2023-03-10,,1,Y,xyz
3,Charlie,03.04.2023,550.0,1,YES,xyz
4,David,"April 5, 2023",900.0,4,No,xyz


## 1.Level 1: Rename all column headers to lowercase and replace spaces with underscores

In [116]:
df.columns  = df.columns.str.lower().str.replace(" ","_")
print("\n✅ [Level 1] Cleaned column names:")
print(df.columns.tolist())


✅ [Level 1] Cleaned column names:
['customer_name', 'order_date', 'sales', 'quantity_ordered', 'returned?', 'extra_column']


## 2. Level 2: Remove extra spaces from text values

In [117]:
str_cols = df.select_dtypes(include='object').columns
df[str_cols] = df[str_cols].apply(lambda x: x.str.strip())

print("\n✅ [Level 2] Removed leading/trailing spaces from text columns.")


✅ [Level 2] Removed leading/trailing spaces from text columns.


## 3.Level 3: Convert order_date column to proper date format

In [118]:
df['order_date'] = pd.to_datetime(df['order_date'], errors='coerce')
print("\n✅ [Level 3] Converted 'order_date' to datetime format.")


✅ [Level 3] Converted 'order_date' to datetime format.


In [119]:
df['order_date']

0    2023-01-15
1           NaT
2           NaT
3           NaT
4           NaT
5           NaT
6           NaT
7           NaT
8           NaT
9           NaT
10          NaT
Name: order_date, dtype: datetime64[ns]

## 4. Level 4: Remove duplicate rows

In [120]:
df = df.drop_duplicates()
print("\n✅ [Level 4] Removed duplicate rows.")


✅ [Level 4] Removed duplicate rows.


In [121]:
df

Unnamed: 0,customer_name,order_date,sales,quantity_ordered,returned?,extra_column
0,Alice,2023-01-15,1000.0,2,Yes,xyz
1,Bob,NaT,850.0,3,no,xyz
2,alice,NaT,,1,Y,xyz
3,Charlie,NaT,550.0,1,YES,xyz
4,David,NaT,900.0,4,No,xyz
5,Eve,NaT,,3,n,xyz
6,Frank,NaT,300.0,,yes,xyz
7,Grace,NaT,100.0,2,N,xyz
8,Heidi,NaT,,1,Y,xyz
9,Ivan,NaT,450.0,two,y,xyz


## 5. Level 5: Handle missing values (Numerical)

In [140]:
df['sales'] = pd.to_numeric(df['sales'], errors='coerce')
df['quantity_ordered'] = pd.to_numeric(df['quantity_ordered'], errors='coerce')

df['sales'] = df['sales'].fillna(df['sales'].mean())
df['quantity_ordered'] = df['quantity_ordered'].fillna(df['quantity_ordered'].mean())

print("\n✅ [Level 5] Missing numeric values filled using mean (no warnings now).")





In [133]:
df[['sales','quantity_ordered']]

Unnamed: 0,sales,quantity_ordered
0,1000.0,2.0
1,850.0,3.0
2,592.857143,1.0
3,550.0,1.0
4,900.0,4.0
5,592.857143,3.0
6,300.0,2.125
7,100.0,2.0
8,592.857143,1.0
9,450.0,2.125


## 6. Level 6: Drop irrelevant column

In [126]:
df = df.drop(columns=['extra_column'])
df.head(2)

Unnamed: 0,customer_name,order_date,sales,quantity_ordered,returned?
0,Alice,2023-01-15,1000.0,2.0,Yes
1,Bob,NaT,850.0,3.0,no


In [127]:
print("\n✅ [Level 6] Dropped irrelevant column: 'extra_column'")


✅ [Level 6] Dropped irrelevant column: 'extra_column'


## 7.Level 7: Fix inconsistent categories (like "Yes", "yes", "Y", "n", etc.)

In [130]:

df['returned?'] = df['returned?'].str.lower().map({
    'yes': 'Yes', 'y': 'Yes',
    'no': 'No', 'n': 'No'
})

print("\n✅ [Level 7] Standardized 'returned?' column values:")
print(df['returned?'].value_counts())



✅ [Level 7] Standardized 'returned?' column values:
returned?
Yes    6
No     4
Name: count, dtype: int64


## 8.Create a new column "high_value_order" if sales > 500

In [132]:
df['high_value_order'] = df['sales'] > 500
print("\n✅ [Level 8] Created 'high_value_order' column:")
print(df[['sales', 'high_value_order']].head())


✅ [Level 8] Created 'high_value_order' column:
         sales  high_value_order
0  1000.000000              True
1   850.000000              True
2   592.857143              True
3   550.000000              True
4   900.000000              True


## 9.Level 9: Sort data by order date (newest first)

In [135]:
df = df.sort_values(by='order_date', ascending=False)
print("\n✅ [Level 9] Sorted data by latest order date.")


✅ [Level 9] Sorted data by latest order date.


In [136]:
df['order_date']

0   2023-01-15
1          NaT
2          NaT
3          NaT
4          NaT
5          NaT
6          NaT
7          NaT
8          NaT
9          NaT
Name: order_date, dtype: datetime64[ns]

## 10.Level 10: Save the cleaned dataset to a new CSV

In [138]:
df.to_csv("cleaned_data_wrangling_project.csv", index=False)
print("\n✅ [Level 10] Cleaned dataset saved as 'cleaned_data_wrangling_project.csv'")



✅ [Level 10] Cleaned dataset saved as 'cleaned_data_wrangling_project.csv'
