## Data Cleaning & Preprocessing


### Handling Missing Values


In [69]:
import pandas as pd
df = pd.read_csv("1745501008647-data_cleaning_sample.csv")

df.isnull()              # True for NaNs/null values
df.isnull().sum()        # Count missing per column

df.dropna()              # Drop rows(Not shows) with *any* missing values. Shows only completed rows
df.dropna(axis=1)        # Drop columns with missing values

df.fillna(0)                     # Replace NaN with 0
df["Age"].fillna(df["Age"].mean())  # Replace with mean
df.ffill()      # Forward fill with previous values
df.bfill()      # Backward fill with next value



Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,30.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,22.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,Alice,28.0,Delhi,F,eve@domain.com,01-05-2021
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


### Detecting & Removing Duplicates



In [33]:
df.duplicated()          # True for duplicates
# df.drop_duplicates()     # Remove duplicate rows
# df.duplicated(subset=["Name", "Age"]) #Check based on specific columns:

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7     True
8     True
dtype: bool

## String Operations with .str

Works like vectorized string methods and returns a pandas Series:



In [70]:
df["Name"].str.lower() # Converts all names to lowercase.
df["City"].str.contains("delhi", case=False) # Checks if 'delhi' is in the city name, case-insensitive.
df["Email"].str.split("@") # Outputs a pandas Series where each element is a list of strings (the split parts). This is where a Python list comes into play, but the outer object is still a pandas Series.
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


## Type Conversions with .astype()

In [71]:
df2=df.dropna().copy()
df2["Age"] = df2["Age"].astype(int)
# df2["Join Date"] = pd.to_datetime(df2["Join Date"])
# df2["City"] = df2["City"].astype("category")
df2.info()
df


<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 0 to 7
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Name       5 non-null      object
 1   Age        5 non-null      int64 
 2   City       5 non-null      object
 3   Gender     5 non-null      object
 4   Email      5 non-null      object
 5   Join Date  5 non-null      object
dtypes: int64(1), object(5)
memory usage: 280.0+ bytes


Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


### Applying Functions

In [75]:
df["Age Group"] = df["Age"].apply(lambda x: "Adult" if x >= 18 else "Minor") #Apply any function to rows or columns
gender_map = {"M": "Male", "F": "Female"}
df["Gender"] = df["Gender"].map(gender_map)  #Element-wise mapping for Series
df["City"].replace({"Delhi": "New Delhi", "Mum": "Mumbai"})  #Replace specific values
df


Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Alice,25.0,New York,Female,alice@example.com,01-05-2021,Adult
1,Charlie,,Delhi,Male,charlie@example,20-07-2021,Minor
2,Bob,30.0,Los Angeles,Male,bob@example.com,15-06-2020,Adult
3,Charlie,,Delhi,Male,charlie@example,20-07-2021,Minor
4,David,22.0,Mumbai,Male,david@example.com,12-11-2019,Adult
5,,28.0,Delhi,Female,eve@domain.com,,Adult
6,Alice,25.0,New York,Female,alice@example.com,01-05-2021,Adult
7,Alice,25.0,New York,Female,alice@example.com,01-05-2021,Adult
8,Charlie,,Delhi,Male,charlie@example,20-07-2021,Minor
