# Data Cleaning Practice

In [57]:
import pandas as pd
import numpy as np

data = {
    'EmpID': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    'Name': ['Ali', 'Sara', None, 'Zara', 'Usman', 'Hina', 'Bilal', 'Nida', 'Omer', 'Ayesha'],
    'Dept': ['IT', 'hr', 'IT ', 'Finance', 'Sales', 'it', 'HR', 'Marketing', 'Finance', 'IT'],
    'Salary': ['45,000', '52000', '48k', '60,000', None, '47,000', '55,000', '62,000', '58000', '50k'],
    'Experience': [1, 5, np.nan, 7, 3, 1, 4, np.nan, 8, 2],
    'JoinDate': ['2021-01-10', '2018/05/20', None, '2016-03-15', '2019-07-01', 
                 '2022-02-11', '2017/08/30', '2015-06-05', '2014-11-25', '2020-09-01']
}

df = pd.DataFrame(data)


In [58]:
df

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
0,101,Ali,IT,45000,1.0,2021-01-10
1,102,Sara,hr,52000,5.0,2018/05/20
2,103,,IT,48k,,
3,104,Zara,Finance,60000,7.0,2016-03-15
4,105,Usman,Sales,,3.0,2019-07-01
5,106,Hina,it,47000,1.0,2022-02-11
6,107,Bilal,HR,55000,4.0,2017/08/30
7,108,Nida,Marketing,62000,,2015-06-05
8,109,Omer,Finance,58000,8.0,2014-11-25
9,110,Ayesha,IT,50k,2.0,2020-09-01


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   EmpID       10 non-null     int64  
 1   Name        9 non-null      object 
 2   Dept        10 non-null     object 
 3   Salary      9 non-null      object 
 4   Experience  8 non-null      float64
 5   JoinDate    9 non-null      object 
dtypes: float64(1), int64(1), object(4)
memory usage: 612.0+ bytes


### __üßπ PART A ‚Äî DATA CLEANING (HARD)__

#### Q1Ô∏è‚É£ Department Standardization
```
* Remove extra spaces
* Convert to uppercase
* Validate departments as only:
* IT, HR, FINANCE, MARKETING
```


In [60]:
df['Dept'] = df['Dept'].str.upper().str.strip()
# Validation using .isin()
valid_Depts = ['IT', 'HR', 'FINANCE', 'MARKETING']
df.loc[~df['Dept'].isin(valid_Depts),'Dept']=None
df

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
0,101,Ali,IT,45000,1.0,2021-01-10
1,102,Sara,HR,52000,5.0,2018/05/20
2,103,,IT,48k,,
3,104,Zara,FINANCE,60000,7.0,2016-03-15
4,105,Usman,,,3.0,2019-07-01
5,106,Hina,IT,47000,1.0,2022-02-11
6,107,Bilal,HR,55000,4.0,2017/08/30
7,108,Nida,MARKETING,62000,,2015-06-05
8,109,Omer,FINANCE,58000,8.0,2014-11-25
9,110,Ayesha,IT,50k,2.0,2020-09-01


#### Q2Ô∏è‚É£ Salary Conversion
```
Convert salary to numeric
Handle:
* commas
* 'k' (thousands)
* missing values
Final dtype must be numeric (int or float)
```

In [61]:
# Using Replace Function
df=df.replace({'Salary':'k'},'000',regex=True)
df=df.replace({'Salary':','},'',regex=True)

# Using str.replace
df['Salary'] = df['Salary'].str.replace('k','000').str.replace(',','')
df['Salary']=pd.to_numeric(df['Salary'])
df['Salary']=df['Salary'].fillna(df['Salary'].median())
df['Salary']=df['Salary'].astype(int)

df['Salary']

0    45000
1    52000
2    48000
3    60000
4    52000
5    47000
6    55000
7    62000
8    58000
9    50000
Name: Salary, dtype: int64

#### Q3Ô∏è‚É£ Experience Imputation
```
Fill missing Experience using:
* department-wise mean
* If department mean not available:
* use global mean
```

In [62]:
globalExpMean=df['Experience'].mean()

s=df.groupby('Dept')['Experience'].transform('mean')
df['Experience']=df['Experience'].fillna(s)
df['Experience']=df['Experience'].fillna(globalExpMean)
df['Experience']=df['Experience'].round(2)
df

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
0,101,Ali,IT,45000,1.0,2021-01-10
1,102,Sara,HR,52000,5.0,2018/05/20
2,103,,IT,48000,1.33,
3,104,Zara,FINANCE,60000,7.0,2016-03-15
4,105,Usman,,52000,3.0,2019-07-01
5,106,Hina,IT,47000,1.0,2022-02-11
6,107,Bilal,HR,55000,4.0,2017/08/30
7,108,Nida,MARKETING,62000,3.88,2015-06-05
8,109,Omer,FINANCE,58000,8.0,2014-11-25
9,110,Ayesha,IT,50000,2.0,2020-09-01


#### Q4Ô∏è‚É£ Join Date Cleaning
```
Convert JoinDate to datetime
Handle mixed formats
Invalid/missing ‚Üí NaT
```

In [63]:
df['JoinDate']=df['JoinDate'].str.replace('/','-')
df['JoinDate']=pd.to_datetime(df['JoinDate'])


#### Q5Ô∏è‚É£ Name Cleaning
```
Drop rows where Name is missing
Ensure index is reset cleanly
```

In [64]:
df
df.dropna(subset=['Name'])

df.reset_index(drop=True)


Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
0,101,Ali,IT,45000,1.0,2021-01-10
1,102,Sara,HR,52000,5.0,2018-05-20
2,103,,IT,48000,1.33,NaT
3,104,Zara,FINANCE,60000,7.0,2016-03-15
4,105,Usman,,52000,3.0,2019-07-01
5,106,Hina,IT,47000,1.0,2022-02-11
6,107,Bilal,HR,55000,4.0,2017-08-30
7,108,Nida,MARKETING,62000,3.88,2015-06-05
8,109,Omer,FINANCE,58000,8.0,2014-11-25
9,110,Ayesha,IT,50000,2.0,2020-09-01


### __PART B ‚Äî DATA RETRIEVAL (QUERY-STYLE)__
#### Q6Ô∏è‚É£ Retrieve employees who:
```
Are in IT
Have Experience ‚â• 2
Have Salary > 50000
```

In [65]:
df.loc[(df["Dept"]=='IT')&(df['Experience']>=2)&(df['Salary']>5000)]

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
9,110,Ayesha,IT,50000,2.0,2020-09-01


#### Q7Ô∏è‚É£ Retrieve employees who:

```
Joined before 2019
Belong to HR or Finance
```

In [66]:
df.loc[(df['JoinDate']<'2019-01-01') & (df['Dept'].isin(['HR','FINANCE']))]

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
1,102,Sara,HR,52000,5.0,2018-05-20
3,104,Zara,FINANCE,60000,7.0,2016-03-15
6,107,Bilal,HR,55000,4.0,2017-08-30
8,109,Omer,FINANCE,58000,8.0,2014-11-25


#### Q8Ô∏è‚É£ Find:

```
Average salary per department
Sort result by salary descending
```

In [67]:
df.groupby('Dept')['Salary'].mean().sort_index(ascending=False)

Dept
MARKETING    62000.0
IT           47500.0
HR           53500.0
FINANCE      59000.0
Name: Salary, dtype: float64

#### Q9Ô∏è‚É£ Retrieve top 3 highest paid employees
    (Return Name, Dept, Salary only)

In [68]:
df[['Name','Dept','Salary']].sort_values('Salary',ascending=False).head(3)

Unnamed: 0,Name,Dept,Salary
7,Nida,MARKETING,62000
3,Zara,FINANCE,60000
8,Omer,FINANCE,58000


#### Qüîü Find employees whose:
```
Salary is above department average
(Hint: no loops)
```

In [69]:
dept_Avg=df.groupby('Dept')['Salary'].transform('mean')
df.loc[df['Salary']>dept_Avg]

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
2,103,,IT,48000,1.33,NaT
3,104,Zara,FINANCE,60000,7.0,2016-03-15
6,107,Bilal,HR,55000,4.0,2017-08-30
9,110,Ayesha,IT,50000,2.0,2020-09-01


### üõ†Ô∏è PART C ‚Äî SAFE MODIFICATIONS (NO BUGS)
#### Q1Ô∏è‚É£1Ô∏è‚É£
```
Give a 10% salary raise to:
Employees with Experience ‚â• 5
```

In [72]:
df.loc[df['Experience']>=5,'Salary']*=1.10
df

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
0,101,Ali,IT,45000.0,1.0,2021-01-10
1,102,Sara,HR,62920.0,5.0,2018-05-20
2,103,,IT,48000.0,1.33,NaT
3,104,Zara,FINANCE,72600.0,7.0,2016-03-15
4,105,Usman,,52000.0,3.0,2019-07-01
5,106,Hina,IT,47000.0,1.0,2022-02-11
6,107,Bilal,HR,55000.0,4.0,2017-08-30
7,108,Nida,MARKETING,62000.0,3.88,2015-06-05
8,109,Omer,FINANCE,70180.0,8.0,2014-11-25
9,110,Ayesha,IT,50000.0,2.0,2020-09-01


#### Q1Ô∏è‚É£2Ô∏è‚É£
```
Set Experience = 0 for:
Employees who joined after 2021
```

In [75]:
df.loc[df['JoinDate']>'2021-01-01','Experience']=0
df

Unnamed: 0,EmpID,Name,Dept,Salary,Experience,JoinDate
0,101,Ali,IT,45000.0,0.0,2021-01-10
1,102,Sara,HR,62920.0,5.0,2018-05-20
2,103,,IT,48000.0,1.33,NaT
3,104,Zara,FINANCE,72600.0,7.0,2016-03-15
4,105,Usman,,52000.0,3.0,2019-07-01
5,106,Hina,IT,47000.0,0.0,2022-02-11
6,107,Bilal,HR,55000.0,4.0,2017-08-30
7,108,Nida,MARKETING,62000.0,3.88,2015-06-05
8,109,Omer,FINANCE,70180.0,8.0,2014-11-25
9,110,Ayesha,IT,50000.0,2.0,2020-09-01
