# **Data Cleaning in Pandas**

# **1. Sample Data with Issues**

In [18]:
import pandas as pd
import numpy as np

# Sample Data with Missing values, Duplicates, and Incorrect Data Types
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', np.nan],
    'Age': [25, 30, 35, 25, 40],
    'City': ['New York', 'Los Angeles', 'New York', 'New York', 'Chicago'],
    'Salary': [50000, 60000, np.nan, 50000, 70000]
}

df = pd.DataFrame(data)
print(df)


      Name  Age         City   Salary
0    Alice   25     New York  50000.0
1      Bob   30  Los Angeles  60000.0
2  Charlie   35     New York      NaN
3    Alice   25     New York  50000.0
4      NaN   40      Chicago  70000.0


# **2. Handling Missing value**

In [19]:
## Fill missing values:
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,50000.0
1,Bob,30,Los Angeles,60000.0
2,Charlie,35,New York,57500.0
3,Alice,25,New York,50000.0
4,,40,Chicago,70000.0


In [20]:
## Drop rows with missing values:
df_cleaned = df.dropna()
df_cleaned

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,50000.0
1,Bob,30,Los Angeles,60000.0
2,Charlie,35,New York,57500.0
3,Alice,25,New York,50000.0


In [21]:
## Remove duplicate rows:
df_no_duplicates = df.drop_duplicates()
df_no_duplicates

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,50000.0
1,Bob,30,Los Angeles,60000.0
2,Charlie,35,New York,57500.0
4,,40,Chicago,70000.0


In [22]:
## Convert 'Age' to integer (if it's a float)
df['Age'] = df['Age'].astype(int)
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,50000.0
1,Bob,30,Los Angeles,60000.0
2,Charlie,35,New York,57500.0
3,Alice,25,New York,50000.0
4,,40,Chicago,70000.0


In [23]:
## Fix inconsistent text (e.g., lowercase city names):
df['City'] = df['City'].str.upper()
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,NEW YORK,50000.0
1,Bob,30,LOS ANGELES,60000.0
2,Charlie,35,NEW YORK,57500.0
3,Alice,25,NEW YORK,50000.0
4,,40,CHICAGO,70000.0


In [24]:
## Remove leading and trailing whitespaces from string columns:

df['Name'] =df['Name'].str.strip()
df['City'] = df['City'].str.strip()
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,NEW YORK,50000.0
1,Bob,30,LOS ANGELES,60000.0
2,Charlie,35,NEW YORK,57500.0
3,Alice,25,NEW YORK,50000.0
4,,40,CHICAGO,70000.0


In [25]:
## Final Cleaned DataFrame
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,NEW YORK,50000.0
1,Bob,30,LOS ANGELES,60000.0
2,Charlie,35,NEW YORK,57500.0
3,Alice,25,NEW YORK,50000.0
4,,40,CHICAGO,70000.0


In [26]:
df.to_csv('cleaned_data.csv', index=False)