In [1]:
# 1.Part 1: Handle Missing Values & Duplicates

    # Step-by-Step Guidelines:
# 1. Load the Data: First, ensure you have pandas installed and import it.
# 2. Handling Missing Values
#     1. Identify Missing Values:
#     2. Fill Missing Values:
# 3. Handling Duplicates
#     1. Identify Duplicates:
#     2. Remove Duplicates:
# 4. Combined Practice on a New Dataset
#     1. New Sample Data:
#     2. Handling Missing Values:
#     3. Remove Duplicates:
        
import pandas as pd
import numpy as np
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'David', None],
    'Age': [25, None, 30, 25, 28, 28],
    'Salary': [50000, 60000, None, 50000, 70000, 70000]
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print("\nMissing Values Count:")
print(df.isnull().sum())
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Salary'].fillna(df['Salary'].median(), inplace=True)
df['Name'].fillna(df['Name'].mode()[0], inplace=True)
print("\nAfter Filling Missing Values:")
print(df)
print("\nDuplicate Rows:")
print(df[df.duplicated()])
df.drop_duplicates(inplace=True)
print("\nAfter Removing Duplicates:")
print(df)
print("\nFinal Cleaned Dataset:")
print(df)       

Original DataFrame:
      Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob   NaN  60000.0
2  Charlie  30.0      NaN
3    Alice  25.0  50000.0
4    David  28.0  70000.0
5     None  28.0  70000.0

Missing Values Count:
Name      1
Age       1
Salary    1
dtype: int64

After Filling Missing Values:
      Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob  28.0  60000.0
2  Charlie  30.0  60000.0
3    Alice  25.0  50000.0
4    David  28.0  70000.0
5    Alice  28.0  70000.0

Duplicate Rows:
    Name   Age   Salary
3  Alice  25.0  50000.0

After Removing Duplicates:
      Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob  28.0  60000.0
2  Charlie  30.0  60000.0
4    David  28.0  70000.0
5    Alice  28.0  70000.0

Final Cleaned Dataset:
      Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob  28.0  60000.0
2  Charlie  30.0  60000.0
4    David  28.0  70000.0
5    Alice  28.0  70000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 

In [2]:
# Part 2: Apply Standardization & Formatting Rules

#     Step-by-Step Guidelines:
# 1. Standardize Text Data
#     1. Convert All Names to Lowercase:
# 2. Format Numerical Data
#     1. Round Age Column to the Nearest Integer:
# 3. Combined Practice on Another Dataset
#     1. New Sample Data:
#     2. Standardize Product Names:
#     3. Format Prices to Two Decimal Places:
import pandas as pd
data1 = {
    'Name': ['Alice', 'BOB', 'CharLie'],
    'Age': [25.7, 30.2, 22.9]
}
df1 = pd.DataFrame(data1)
df1['Name'] = df1['Name'].str.lower()
df1['Age'] = df1['Age'].round()
print("Standardized People Dataset:")
print(df1)
data2 = {
    'Product': ['  Phone ', 'Laptop', 'TABLET '],
    'Price': [199.999, 1299.5, 350.459]
}
df2 = pd.DataFrame(data2)
df2['Product'] = df2['Product'].str.strip().str.lower()
df2['Price'] = df2['Price'].round(2)
print("\nFormatted Product Dataset:")
print(df2)
      
        
        
        
        

Standardized People Dataset:
      Name   Age
0    alice  26.0
1      bob  30.0
2  charlie  23.0

Formatted Product Dataset:
  Product    Price
0   phone   200.00
1  laptop  1299.50
2  tablet   350.46
