# Data Cleaning and Processing

In [1]:
import pandas as pd

In [2]:
# Load the sample data set into a DataFrame and display it.
csv_path = 'data_cleaning_sample.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Joe,25.0,New York,F,alice@example.com,1/5/2021
1,Smith,30.0,Los Angeles,M,bob@example.com,15-06-2020
2,Alex,,Delhi,M,charlie@example,20-07-2021
3,Charlie,22.0,Mumbai,M,david@example.com,12/11/2019
4,Jane,28.0,Delhi,F,eve@domain.com,


In [3]:
# Boolean mask of the frame: True wherever a cell is missing.
df.isna()

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,True,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,True


In [4]:
# Number of missing values in each column.
df.isna().sum()

Name         0
Age          1
City         0
Gender       0
Email        0
Join Date    1
dtype: int64

In [5]:
# Drop every row that has at least one missing value (df itself is not modified).
df.dropna(axis=0, how='any')

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Joe,25.0,New York,F,alice@example.com,1/5/2021
1,Smith,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,22.0,Mumbai,M,david@example.com,12/11/2019


In [6]:
# Drop every column that has at least one missing value (df itself is not modified).
df.dropna(axis='columns')

Unnamed: 0,Name,City,Gender,Email
0,Joe,New York,F,alice@example.com
1,Smith,Los Angeles,M,bob@example.com
2,Alex,Delhi,M,charlie@example
3,Charlie,Mumbai,M,david@example.com
4,Jane,Delhi,F,eve@domain.com


In [7]:
# Replace every missing value with 0. This applies to all columns, so the
# missing Join Date becomes 0 as well as the missing Age.
df.fillna(value=0)

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Joe,25.0,New York,F,alice@example.com,1/5/2021
1,Smith,30.0,Los Angeles,M,bob@example.com,15-06-2020
2,Alex,0.0,Delhi,M,charlie@example,20-07-2021
3,Charlie,22.0,Mumbai,M,david@example.com,12/11/2019
4,Jane,28.0,Delhi,F,eve@domain.com,0


In [8]:
# Fill missing ages with the mean of the known ages.
# The result is only displayed; df['Age'] is not overwritten.
mean_age = df['Age'].mean()
df['Age'].fillna(mean_age)

0    25.00
1    30.00
2    26.25
3    22.00
4    28.00
Name: Age, dtype: float64

In [9]:
# Flag rows that repeat an earlier row; the first occurrence is not flagged.
df.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [10]:
# Remove repeated rows, keeping the first occurrence of each.
df.drop_duplicates(keep='first')

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Joe,25.0,New York,F,alice@example.com,1/5/2021
1,Smith,30.0,Los Angeles,M,bob@example.com,15-06-2020
2,Alex,,Delhi,M,charlie@example,20-07-2021
3,Charlie,22.0,Mumbai,M,david@example.com,12/11/2019
4,Jane,28.0,Delhi,F,eve@domain.com,


In [11]:
# Lower-case every name (displayed only; the Name column is unchanged).
names = df['Name']
names.str.lower()

0        joe
1      smith
2       alex
3    charlie
4       jane
Name: Name, dtype: object

In [12]:
# Case-insensitive check for "delhi" in each city name.
cities = df['City']
cities.str.contains("delhi", case=False)

0    False
1    False
2     True
3    False
4     True
Name: City, dtype: bool

In [13]:
# Split each address at "@" into a [local part, domain] list.
emails = df['Email']
emails.str.split(pat="@")

0    [alice, example.com]
1      [bob, example.com]
2      [charlie, example]
3    [david, example.com]
4       [eve, domain.com]
Name: Email, dtype: object

In [14]:
# Keep only the rows without missing values, then take an explicit copy so the
# column assignments made later on df2 act on a frame of its own.
complete_rows = df.dropna()
df2 = complete_rows.copy()

In [15]:
# Display the cleaned frame (the rows of df that had no missing values).
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Joe,25.0,New York,F,alice@example.com,1/5/2021
1,Smith,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,22.0,Mumbai,M,david@example.com,12/11/2019


In [16]:

# Selecting a single column yields a pandas Series.
type(df2['Age'])

pandas.core.series.Series

In [17]:
# Summarise df2: index range, per-column non-null counts and dtypes, memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 0 to 3
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Name       3 non-null      object 
 1   Age        3 non-null      float64
 2   City       3 non-null      object 
 3   Gender     3 non-null      object 
 4   Email      3 non-null      object 
 5   Join Date  3 non-null      object 
dtypes: float64(1), object(5)
memory usage: 168.0+ bytes


In [18]:
# Show Age converted to 32-bit integers.
# The result is only displayed; df2['Age'] keeps its float dtype.
df2['Age'].astype(dtype='int32')

0    25
1    30
3    22
Name: Age, dtype: int32

In [19]:
# Label each row "Adult" when Age is at least 25, otherwise "Minor".
is_adult = df2['Age'].ge(25)
df2['Age Group'] = is_adult.map({True: "Adult", False: "Minor"})

In [20]:
# Display df2 with the newly added Age Group column.
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Joe,25.0,New York,F,alice@example.com,1/5/2021,Adult
1,Smith,30.0,Los Angeles,M,bob@example.com,15-06-2020,Adult
3,Charlie,22.0,Mumbai,M,david@example.com,12/11/2019,Minor


In [21]:
def isminor(age, threshold=25):
    """Return the age-group label for ``age``.

    Despite the name, this returns a label string, not a boolean.

    Parameters
    ----------
    age : number
        Age to classify.
    threshold : number, default 25
        Ages strictly below this value are labelled "Minor". The default
        keeps the 25-year cut-off this function was originally written with.

    Returns
    -------
    str
        "Minor" if ``age < threshold``, otherwise "Adult". NaN compares
        False against any threshold and is therefore labelled "Adult".
    """
    return "Minor" if age < threshold else "Adult"

# Recompute Age Group by running isminor on every age, then display the frame.
age_labels = df2['Age'].map(isminor)
df2['Age Group'] = age_labels
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Joe,25.0,New York,F,alice@example.com,1/5/2021,Adult
1,Smith,30.0,Los Angeles,M,bob@example.com,15-06-2020,Adult
3,Charlie,22.0,Mumbai,M,david@example.com,12/11/2019,Minor


In [22]:
# Expand the single-letter gender codes into readable labels.
gender_mapping = {"M": "Male", "F": "Female", "O": "Other"}
# Use replace() rather than map(): map() turns every value that is not a key of
# the mapping into NaN, so re-running this cell (when the column already holds
# "Male"/"Female") or meeting an unexpected code would blank the column.
# replace() leaves such values untouched, which makes the cell safe to re-run.
df2['Gender'] = df2['Gender'].replace(gender_mapping)
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Joe,25.0,New York,Female,alice@example.com,1/5/2021,Adult
1,Smith,30.0,Los Angeles,Male,bob@example.com,15-06-2020,Adult
3,Charlie,22.0,Mumbai,Male,david@example.com,12/11/2019,Minor


In [23]:
# Shorten the long city names; cities not listed in the mapping are kept as they are.
city_short_names = {"New York": "NYC", "Los Angeles": "LA"}
df2['City'] = df2['City'].replace(city_short_names)
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Joe,25.0,NYC,Female,alice@example.com,1/5/2021,Adult
1,Smith,30.0,LA,Male,bob@example.com,15-06-2020,Adult
3,Charlie,22.0,Mumbai,Male,david@example.com,12/11/2019,Minor
