# Data Manipulation and Preparation with Pandas
_Week 3_



In [None]:
# Load the raw IT salary data from the CSV file into the working DataFrame `df`.
import pandas

csv_path = 'netherlands_IT_salaries.csv'
df = pandas.read_csv(csv_path)


## Sorting Data

- Sort the dataset by Salary in descending order
- Sort by City and then by Role, both in ascending order

In [None]:
# The alias `pd` is used below, so it has to be imported in this cell.
import pandas as pd

# NOTE(review): this reloads the data from the Excel workbook and rebinds `df`,
# replacing the frame read from the CSV file in the first cell - confirm which
# source file is the intended one.
df = pd.read_excel('netherlands_IT_salaries.xlsx')

# Highest salaries first.
descending = df.sort_values(by='Salary', ascending=False)

# Independent single-key sorts, kept so that the names `city` and `role`
# remain available to any later cell that uses them.
city = df.sort_values(by='City', ascending=True)
role = df.sort_values(by='Role', ascending=True)

# Sort by City first and then by Role within each city, both ascending.
# Two separate sorts cannot express this; one call with a list of keys does.
city_then_role = df.sort_values(by=['City', 'Role'], ascending=[True, True])

## Handling Missing Data


- Check if there are any missing values in each column.

- Display only the rows with missing values.

- Fill missing Education values with 'Unknown'.

- Replace missing Salary values (if any) with the average salary.

- Drop rows where City is missing.

- Decide what to do with missing values in Experience.

In [None]:
# Overview of the frame: column dtypes and non-null counts.
df.info()

# Explicit number of missing values per column; any non-zero entry means
# that column contains missing data.
print(df.isna().sum())

In [None]:
# Boolean mask: True for every row that has at least one missing cell.
has_missing = df.isna().any(axis=1)

# Keep only those rows and show them.
df_missing = df.loc[has_missing]
print(df_missing)

In [None]:
# Replace missing Education values with the placeholder 'Unknown'.
# The filled column is assigned back to df: calling fillna(inplace=True) on
# df['Education'] is chained assignment, which pandas warns about and which
# does not update df when Copy-on-Write is enabled.
df['Education'] = df['Education'].fillna('Unknown')
df

In [None]:
# Replace missing Salary values with the mean of the known salaries.
# The filled column is assigned back to df: calling fillna(inplace=True) on
# df['Salary'] is chained assignment, which pandas warns about and which
# does not update df when Copy-on-Write is enabled.
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

df

In [None]:
# Remove the rows whose City is missing.
# dropna returns a new frame by default, so without inplace=True (or an
# assignment) the result is thrown away and df keeps the incomplete rows.
df.dropna(subset=['City'], inplace=True)
df

In [None]:
# Chosen strategy for Experience: fill the gaps with the column mean.
# The filled column is assigned back to df: calling fillna(inplace=True) on
# df['Experience'] is chained assignment, which pandas warns about and which
# does not update df when Copy-on-Write is enabled.
df['Experience'] = df['Experience'].fillna(df['Experience'].mean())

df

## Removing Duplicates


- Check if there are any duplicate rows.

- If duplicates exist, remove them while keeping only the first occurrence.

In [None]:
# Mark every row that repeats an earlier row, then reduce the mask to a
# single flag telling whether any duplicate exists.
duplicate_mask = df.duplicated()
is_duplicated = duplicate_mask.any()
print(is_duplicated)



In [None]:
# Remove repeated rows from df itself, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)


## Detecting Outliers

- Use the Interquartile Range (IQR) method to identify potential outliers in Salary.

- Display the rows containing salary outliers.

- Remove detected outliers from the dataset.


In [None]:
# IQR rule on Salary: a row is an outlier when its salary lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
salary = df["Salary"]
Q1, Q3 = salary.quantile(0.25), salary.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

is_too_low = salary < lower_fence
is_too_high = salary > upper_fence
Outliers = df[is_too_low | is_too_high]


In [None]:
# Display the rows that were selected into `Outliers`.
print(Outliers)

In [None]:
# Remove all outlier rows from df in a single call, using their index labels.
# One vectorised drop replaces the per-label loop, which rebuilt the frame on
# every iteration and would fail on a repeated index label.
df.drop(index=Outliers.index, inplace=True)



## GroupBy Operations

- Group the data by City and calculate:

  - The average salary in each city.

  - The number of roles in each city.

- Group the data by Education and find the maximum salary for each education level.

- Group the data by Seniority and find the median salary for each seniority level.

In [None]:
# Group once by City and derive both per-city statistics from that grouping.
by_city = df.groupby("City")

# Mean salary per city.
avg_salary = by_city["Salary"].mean()

# Number of non-missing Role entries per city.
role_count = by_city["Role"].count()

print("Avg salary: \n", avg_salary)
print("Role count: \n", role_count)

In [None]:
# Highest salary observed within each education level.
salary_by_education = df.groupby("Education")["Salary"]
max_salary = salary_by_education.max()
print(max_salary)

In [None]:
# Median salary for each distinct value of the Experience column.
# NOTE(review): the exercise text asks for grouping by Seniority, while this
# cell groups by Experience - confirm whether a Seniority column is available
# at this point in the notebook.
salary_by_experience = df.groupby("Experience")["Salary"]
median_salary = salary_by_experience.median()
print(median_salary)

## Apply and Map Functions

- Add a new column Annual Salary by applying a function to calculate Salary × 12.

- Use the .map() function to categorize education levels into 'Low', 'Medium', and 'High':

  - MBO, Mavo, and Unknown → 'Low'

  - HBO → 'Medium'

  - WO → 'High'

  - Store the new values in a column called Education Level.


In [None]:
# Annual salary is twelve times the value in the Salary column.
annual_salary = df["Salary"].mul(12)
df["Annual Salary"] = annual_salary
print(df["Annual Salary"])

In [None]:
# Lookup table from education name to coarse level.
low_levels = ("MBO", "Mavo", "Unknown")
education_map = dict.fromkeys(low_levels, "Low")
education_map.update({"HBO": "Medium", "WO": "High"})

# Translate each Education value through the table; values that are not keys
# of the table become missing in the new column.
education_levels = df["Education"].map(education_map)
df["Education Level"] = education_levels
print(df["Education Level"])

## Feature Engineering

- Create a new column Year of Birth using the formula: Year - Age.

- Create a new column Seniority based on experience:

  - Less than 5 years → 'Junior'

  - Between 5 and 10 years (inclusive) → 'Mid-level'

  - More than 10 years → 'Senior'

- Create a new column City Category:

  - If the city is 'Randstad', 'Amsterdam', or 'Utrecht', categorize it as 'Urban'.

  - Otherwise, categorize it as 'Non-Urban'.

- Create a new column Salary Bracket based on salary:

  - Less than €3000 → 'Low'

  - Between €3000 and €5000 (inclusive) → 'Medium'

  - More than €5000 → 'High'

In [None]:
# Year of birth is the value in Year minus the value in Age, row by row.
birth_year = df['Year'].sub(df['Age'])
df['Year of Birth'] = birth_year

In [None]:
# Seniority band derived from years of experience:
#   less than 5        -> 'Junior'
#   5 to 10 inclusive  -> 'Mid-level'
#   more than 10       -> 'Senior'
# DataFrame has no `cut` method, so the previous `df.cut(...)` call raised
# AttributeError. Explicit comparisons are used instead of binning because
# both 5 and 10 belong to the middle band, which right-closed bins
# (-1, 5], (5, 10] do not give: they would label exactly 5 years as 'Junior'.
experience = df['Experience']
df['Seniority'] = 'Mid-level'
df.loc[experience < 5, 'Seniority'] = 'Junior'
df.loc[experience > 10, 'Seniority'] = 'Senior'
# Rows without a recorded experience get no seniority label.
df.loc[experience.isna(), 'Seniority'] = None

In [None]:
# Cities counted as urban; everything else is labelled 'Non-Urban'.
urban_cities = ('Randstad', 'Amsterdam', 'Utrecht')


def _city_category(city_name):
    """Return 'Urban' for a listed city name and 'Non-Urban' otherwise."""
    if city_name in urban_cities:
        return 'Urban'
    return 'Non-Urban'


df['City Category'] = df['City'].apply(_city_category)

In [None]:
# Salary bracket derived from Salary:
#   less than 3000         -> 'Low'
#   3000 to 5000 inclusive -> 'Medium'
#   more than 5000         -> 'High'
# DataFrame has no `cut` method, so the previous `df.cut(...)` call raised
# AttributeError. Explicit comparisons are used instead of binning because
# both 3000 and 5000 belong to the middle bracket, which right-closed bins
# do not give: they would label a salary of exactly 3000 as 'Low'.
salary = df['Salary']
df['Salary Bracket'] = 'Medium'
df.loc[salary < 3000, 'Salary Bracket'] = 'Low'
df.loc[salary > 5000, 'Salary Bracket'] = 'High'
# Rows without a salary get no bracket.
df.loc[salary.isna(), 'Salary Bracket'] = None
print(df)

## Advanced Analysis


- Find the top 5 highest-paying roles.

- Find the most common role in the dataset.

In [None]:
# Rank roles by their mean salary, highest first, and keep the first five.
mean_salary_by_role = df.groupby('Role')['Salary'].mean()
ranked_roles = mean_salary_by_role.sort_values(ascending=False)
top_5 = ranked_roles.head(5)
print(top_5)

In [None]:
# mode() returns every value tied for the highest frequency; take the first.
role_modes = df['Role'].mode()
common_role = role_modes[0]
print(common_role)

## Save the Cleaned Data


- Save the final cleaned dataset to a CSV file named cleaned_IT_salaries.csv.

In [None]:
# Write the cleaned frame to disk without the index column.
output_path = 'cleaned_IT_salaries.csv'
df.to_csv(output_path, index=False)