In [27]:
import pandas as pd 

In [28]:
df=pd.read_csv("data_cleaning.csv")

In [29]:
df.head()

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019


In [30]:
# check for missing data
df.isnull()              # True for NaNs

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,False,False,False,False,False,False
1,False,True,False,False,False,False
2,False,False,False,False,False,False
3,False,True,False,False,False,False
4,False,False,False,False,False,False
5,True,False,False,False,False,True
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,True,False,False,False,False


In [31]:
df.isnull().sum()        # Count missing per column

Name         1
Age          3
City         0
Gender       0
Email        0
Join Date    1
dtype: int64

In [32]:
# drop missing data
df.dropna() # Drop rows with *any* missing values

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021


In [33]:
df.dropna(axis=1)        # Drop columns with missing values

Unnamed: 0,City,Gender,Email
0,New York,F,alice@example.com
1,Delhi,M,charlie@example
2,Los Angeles,M,bob@example.com
3,Delhi,M,charlie@example
4,Mumbai,M,david@example.com
5,Delhi,F,eve@domain.com
6,New York,F,alice@example.com
7,New York,F,alice@example.com
8,Delhi,M,charlie@example


In [34]:
# fill missing data using fillna
df.fillna(0)                     # Replace NaN with 0

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,0.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,0.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,0,28.0,Delhi,F,eve@domain.com,0
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,0.0,Delhi,M,charlie@example,20-07-2021


In [35]:
df["Age"].fillna(df["Age"].mean())  # Replace with mean

0    25.000000
1    25.833333
2    30.000000
3    25.833333
4    22.000000
5    28.000000
6    25.000000
7    25.000000
8    25.833333
Name: Age, dtype: float64

In [36]:
df.ffill()      # Forward fill

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,25.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,30.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,David,28.0,Delhi,F,eve@domain.com,12-11-2019
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,25.0,Delhi,M,charlie@example,20-07-2021


In [37]:
df.bfill()      # Backward fill

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,30.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,22.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,Alice,28.0,Delhi,F,eve@domain.com,01-05-2021
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


In [38]:
# detecting and removing duplicates 
df.duplicated()          # True for duplicates

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7     True
8     True
dtype: bool

In [39]:
df.drop_duplicates()     # Remove duplicate rows

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,


In [40]:
# check based on specific columns 
df.duplicated(subset=["Name", "Age"])

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7     True
8     True
dtype: bool

In [41]:
# string operations with str
df["Name"].str.lower() # Converts all names to lowercase.
df["City"].str.contains("delhi", case=False) # Checks if 'delhi' is in the city name, case-insensitive.
df["Email"].str.split("@") # Outputs a pandas Series where each element is a list of strings (the split parts). This is where a Python list comes into play, but the outer object is still a pandas Series

0    [alice, example.com]
1      [charlie, example]
2      [bob, example.com]
3      [charlie, example]
4    [david, example.com]
5       [eve, domain.com]
6    [alice, example.com]
7    [alice, example.com]
8      [charlie, example]
Name: Email, dtype: object

In [42]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


In [43]:
#  type conversions with as type 
df.dropna()
df.dropna(axis=1)
df["Age"] = df["Age"].astype(int)
df["Date"] = pd.to_datetime(df["Date"])
df["Category"] = df["Category"].astype("category")

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [44]:
df.describe()

Unnamed: 0,Age
count,6.0
mean,25.833333
std,2.786874
min,22.0
25%,25.0
50%,25.0
75%,27.25
max,30.0


In [45]:
df.dtypes

Name          object
Age          float64
City          object
Gender        object
Email         object
Join Date     object
dtype: object

In [46]:
# apply any functions to rows or columns 
df["Age Group"] = df["Age"].apply(lambda x: "Adult" if x >= 18 else "Minor")

In [47]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Alice,25.0,New York,F,alice@example.com,01-05-2021,Adult
1,Charlie,,Delhi,M,charlie@example,20-07-2021,Minor
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020,Adult
3,Charlie,,Delhi,M,charlie@example,20-07-2021,Minor
4,David,22.0,Mumbai,M,david@example.com,12-11-2019,Adult
5,,28.0,Delhi,F,eve@domain.com,,Adult
6,Alice,25.0,New York,F,alice@example.com,01-05-2021,Adult
7,Alice,25.0,New York,F,alice@example.com,01-05-2021,Adult
8,Charlie,,Delhi,M,charlie@example,20-07-2021,Minor


In [48]:
# element wise mapping for series 
gender_map = {"M": "Male", "F": "Female"}
df["Gender"] = df["Gender"].map(gender_map)

In [49]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Alice,25.0,New York,Female,alice@example.com,01-05-2021,Adult
1,Charlie,,Delhi,Male,charlie@example,20-07-2021,Minor
2,Bob,30.0,Los Angeles,Male,bob@example.com,15-06-2020,Adult
3,Charlie,,Delhi,Male,charlie@example,20-07-2021,Minor
4,David,22.0,Mumbai,Male,david@example.com,12-11-2019,Adult
5,,28.0,Delhi,Female,eve@domain.com,,Adult
6,Alice,25.0,New York,Female,alice@example.com,01-05-2021,Adult
7,Alice,25.0,New York,Female,alice@example.com,01-05-2021,Adult
8,Charlie,,Delhi,Male,charlie@example,20-07-2021,Minor


In [50]:
# replace specific values 
df["City"].replace({"Del": "Delhi", "Mum": "Mumbai"})

0       New York
1          Delhi
2    Los Angeles
3          Delhi
4         Mumbai
5          Delhi
6       New York
7       New York
8          Delhi
Name: City, dtype: object

In [51]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Alice,25.0,New York,Female,alice@example.com,01-05-2021,Adult
1,Charlie,,Delhi,Male,charlie@example,20-07-2021,Minor
2,Bob,30.0,Los Angeles,Male,bob@example.com,15-06-2020,Adult
3,Charlie,,Delhi,Male,charlie@example,20-07-2021,Minor
4,David,22.0,Mumbai,Male,david@example.com,12-11-2019,Adult
5,,28.0,Delhi,Female,eve@domain.com,,Adult
6,Alice,25.0,New York,Female,alice@example.com,01-05-2021,Adult
7,Alice,25.0,New York,Female,alice@example.com,01-05-2021,Adult
8,Charlie,,Delhi,Male,charlie@example,20-07-2021,Minor


In [52]:
import pandas as pd
import numpy as np

data = {'A': [1, 2, np.nan, 4, 5, np.nan], 
        'B': [np.nan, 2, 3, np.nan, 5, 6]}

df = pd.DataFrame(data)

df_ffill = df.ffill()

print("Original DataFrame:")
print(df)
print("\nDataFrame after ffill:")
print(df_ffill)

Original DataFrame:
     A    B
0  1.0  NaN
1  2.0  2.0
2  NaN  3.0
3  4.0  NaN
4  5.0  5.0
5  NaN  6.0

DataFrame after ffill:
     A    B
0  1.0  NaN
1  2.0  2.0
2  2.0  3.0
3  4.0  3.0
4  5.0  5.0
5  5.0  6.0


In [53]:
import pandas as pd
import numpy as np

data = {'A': [1, np.nan, np.nan, 4], 
        'B': [np.nan, 5, np.nan, 7]}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

df_bfill = df.bfill()

print("\nDataFrame after bfill:")
print(df_bfill)

Original DataFrame:
     A    B
0  1.0  NaN
1  NaN  5.0
2  NaN  NaN
3  4.0  7.0

DataFrame after bfill:
     A    B
0  1.0  5.0
1  4.0  5.0
2  4.0  7.0
3  4.0  7.0
