In [70]:
import numpy as np
import pandas as pd

In [71]:
df = pd.read_csv(r"C:\Users\sanja\Downloads\Data-cleaning-for-beginners-using-pandas.csv")

In [72]:
df.head()

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,,$77k-$89k,-1.0,"New York,Ny",-1,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1


**Missing Value**

In [73]:
df.isnull().sum()

Index          0
Age            7
Salary         0
Rating         1
Location       0
Established    0
Easy Apply     0
dtype: int64

As we can see, Age and Rating columns contain missing value. The total size of the dataset is 29x7, also the missing value is low in number, we can perform either of the following steps  1. Remove the entire row that contains the missing value n
2. R Replacing the missing values by median or mode value of the colum
      we can observe from the above table information rating columns as only one missing values we quiet go with removing the row from the dataset, whereas if we do the same action the on Age column we could lose some row in the small dataset, so we can opt for the 2 action to deal missing values.

In [74]:
missing_values_index = df[df['Rating'].isnull()].index
df = df.drop(missing_values_index)

In [75]:
df['Age'].fillna(df['Age'].mode()[0], inplace=True)

In [76]:
df.isnull().sum()

Index          0
Age            0
Salary         0
Rating         0
Location       0
Established    0
Easy Apply     0
dtype: int64

In [77]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 0 to 28
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Index        28 non-null     int64  
 1   Age          28 non-null     float64
 2   Salary       28 non-null     object 
 3   Rating       28 non-null     float64
 4   Location     28 non-null     object 
 5   Established  28 non-null     int64  
 6   Easy Apply   28 non-null     object 
dtypes: float64(2), int64(2), object(3)
memory usage: 1.8+ KB
None



 **Salary Formatting**

In [78]:
def avg_salary(salary):
    if '-' in salary:
        low, high = salary.split('-')
        return (float(low)+float(high))/2
    else:
        return float(salary)

In [79]:
# Conversion Salary Formatting:

df['Salary'] = df['Salary'].apply(lambda x:x.replace("$",''))
df['Salary'] = df['Salary'].apply(lambda x:x.replace("k",'000'))

df['Salary'] = df['Salary'].apply(avg_salary)
df

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,71500.0,5.4,"India,In",1999,TRUE
1,1,66.0,60500.0,3.5,"New York,Ny",2002,TRUE
2,2,44.0,83000.0,-1.0,"New York,Ny",-1,-1
3,3,64.0,71500.0,4.4,India In,1988,-1
4,4,25.0,71500.0,6.4,Australia Aus,2002,-1
5,5,44.0,83000.0,1.4,"India,In",1999,TRUE
6,6,21.0,71500.0,0.0,"New York,Ny",-1,-1
7,7,44.0,71500.0,-1.0,Australia Aus,-1,-1
8,8,35.0,71500.0,5.4,"New York,Ny",-1,-1
9,9,22.0,71500.0,7.7,"India,In",-1,TRUE


**Easy Apply Indicator**

In [80]:
#converting -1 value to False 
df['Easy Apply'] = df['Easy Apply'].apply(lambda x:x.replace("-1",'False'))

Converting the - 1 value into False, and I'm just considering it as an object only

**Outlier Finding in Numerical Columns**

In [81]:
Q1_age = df['Age'].quantile(0.25)
print(Q1_age)
Q3_age = df['Age'].quantile(0.75)
print(Q3_age)
IQR_age = Q3_age - Q1_age
print(IQR_age)
outliers_age = df[(df['Age'] < (Q1_age - 1.5 * IQR_age)) | (df['Age'] > (Q3_age + 1.5 * IQR_age))]

outliers_age

30.25
44.0
13.75


Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
1,1,66.0,60500.0,3.5,"New York,Ny",2002,True
14,14,66.0,71500.0,4.0,Australia Aus,2020,True


**Age Distribution**
As most person age are from 22 to 44. The outlier age are 13 and 66.

In [82]:
Q1_salary = df['Salary'].quantile(0.25)
Q3_salary = df['Salary'].quantile(0.75)
IQR_salary = Q3_salary - Q1_salary

outliers_salary = df[(df['Salary'] < (Q1_salary - 1.5 * IQR_salary)) | (df['Salary'] > (Q3_salary + 1.5 * IQR_salary))]
print(Q1_salary - 1.5 * IQR_salary)

71500.0


In [83]:
Q1_rating = df['Rating'].quantile(0.25)
Q3_rating = df['Rating'].quantile(0.75)
IQR_rating = Q3_rating - Q1_rating
outliers_rating = df[(df['Rating'] < (Q1_rating - 1.5 * IQR_rating)) | (df['Rating'] > (Q3_rating + 1.5 * IQR_rating))]
print(Q1_rating - 1.5 * IQR_rating)
print(Q3_rating + 1.5 * IQR_rating)
outliers_rating

-5.4750000000000005
11.925


Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply


Adjusting Outlier for Salary since total 10 rows comes under salary, we can’t remove every row, or we can’t leave as it is, so let’s adjust the value by using sip() function.

In [84]:
df['Salary'] = df['Salary'].clip(lower=(Q1_salary - 1.5 * IQR_salary), upper=(Q3_salary + 1.5 * IQR_salary))

**Handling Special Characters,Location Standardization and Location Accuracy**

In [85]:
df['Location'].value_counts()

Location
New York,Ny      12
India,In          8
Australia Aus     7
India In          1
Name: count, dtype: int64

In [86]:
df['Location']=df['Location'].replace('Australia Aus', 'Australia,Aus')
df['Location']=df['Location'].replace('India In', 'India,In')
df['Location'].value_counts()

Location
New York,Ny      12
India,In          9
Australia,Aus     7
Name: count, dtype: int64

In [87]:
df['Country'] = df['Location'].str.split(",",expand=False).str[0]

**Rating Conversion and Consistent Rating Scale**

In [88]:
expected_min = 1
expected_max = 10

df['Rating'] = df['Rating'].apply(lambda x: x if expected_min < x < expected_max else pd.NA)

**Established Column:**


In [89]:
df['Established'] = df['Established'].apply(lambda x: x if x > -1 else pd.NA)

In [90]:
''' From the above table we can see the Age of people list. when we are calculating outlier people at age 66 years seem to be outliers whereas Age 13 is quit unusual so we can remove it.'''

' From the above table we can see the Age of people list. when we are calculating outlier people at age 66 years seem to be outliers whereas Age 13 is quit unusual so we can remove it.'

In [91]:
df = df.drop(df[df['Age']==13.0].index)

In [92]:
df['Easy Apply'].dtype

dtype('O')

**Conversion Easy Apply column into Boolean type**

In [93]:
df['Easy Apply']=df['Easy Apply'].astype(bool)

**Handling Categorical Data**

As there only one column of catergorical data is Location/Country so we convert them in encode value using one-hot encounter 

In [68]:
from sklearn.preprocessing import LabelEncoder

In [69]:
Label = LabelEncoder()

In [94]:
df['Country'] = Label.fit_transform(df['Country'].values)

In [95]:
df

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply,Country
0,0,44.0,71500.0,5.4,"India,In",1999.0,True,1
1,1,66.0,71500.0,3.5,"New York,Ny",2002.0,True,2
2,2,44.0,71500.0,,"New York,Ny",,True,2
3,3,64.0,71500.0,4.4,"India,In",1988.0,True,1
4,4,25.0,71500.0,6.4,"Australia,Aus",2002.0,True,0
5,5,44.0,71500.0,1.4,"India,In",1999.0,True,1
6,6,21.0,71500.0,,"New York,Ny",,True,2
7,7,44.0,71500.0,,"Australia,Aus",,True,0
8,8,35.0,71500.0,5.4,"New York,Ny",,True,2
9,9,22.0,71500.0,7.7,"India,In",,True,1


Convert the country into numerical
1. India - 1
2. New york - 2
3. Austrilia -3
