## Pandas Mini Project

* This dataset contains the names, job titles, and compensation of San Francisco city employees, recorded annually from 2011 to 2014.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the salary records from the CSV file into a DataFrame.
data = pd.read_csv(
    'Salaries.csv',
    low_memory=False,
)

In [None]:
# Preview the first 10 rows. head() returns only 5 rows by default, so the
# row count is passed explicitly.
data.head(10)

In [None]:
# Preview the last 10 rows. tail() returns only 5 rows by default, so the
# row count is passed explicitly.
data.tail(10)

In [None]:
# Print a summary of the DataFrame: columns, non-null counts and dtypes.
data.info()

In [None]:
# Shape of the data as a (rows, columns) tuple.
data.shape

In [None]:
# Count the missing values in every column.
data.isna().sum()

In [None]:
# Benefits, Status and Notes have a significant number of null values,
# so these columns are removed.
data = data.drop(columns=['Notes', 'Status', 'Benefits'])

In [None]:
# Count the distinct values in the Agency column.
data['Agency'].nunique()

In [None]:
# The Agency column has the same entry in every row, and the data is already
# known to be about San Francisco employees, so the column is removed.
data = data.drop(columns='Agency')

In [None]:
# Preview the first rows to confirm the remaining columns.
data.head()

In [None]:
# Use the Id column as the row index.
data = data.set_index('Id')

In [None]:
# Flag every row that repeats an earlier row in full (the first occurrence
# is not flagged).
data.duplicated(keep='first')

In [None]:
# removing duplicates
# NOTE(review): this keeps only the first row for each EmployeeName, so a row
# is discarded whenever the name repeats, even if every other column differs.
# The data spans several years, so the same name may legitimately recur -
# confirm this is intended rather than data.drop_duplicates() on whole rows.
# NOTE(review): the hard-coded row position 110809 used further down depends
# on the row order left by this step.
data.drop_duplicates('EmployeeName',inplace=True)

In [None]:
# Normalise the name and job-title text to title case.
for _col in ('EmployeeName', 'JobTitle'):
    data[_col] = data[_col].str.title()

In [None]:
# Summary statistics for the columns that pandas currently treats as numeric.
data.describe()

In [None]:
# BasePay, OvertimePay and OtherPay do not appear in the describe() table, so they need to be converted to a numeric format.
# Running pd.to_numeric(data['BasePay']) reported a string at position 110809.

In [None]:
# fixing error
# Inspect the row at position 110809, the position at which
# pd.to_numeric(data['BasePay']) reported a string. iloc is positional, so
# this is the 110810th row, not the row whose Id is 110809.
data.iloc[110809]

In [None]:
# Delete rows whose pay fields hold text that cannot be parsed as a number,
# since no pay information is provided for them. Selecting the rows by content
# instead of by the hard-coded position 110809 keeps this step correct if
# earlier filtering changes the row order. Cells that are already missing
# (NaN) are not treated as unparseable and are kept.
_pay = data[['BasePay', 'OvertimePay', 'OtherPay']]
_unparseable = _pay.apply(pd.to_numeric, errors='coerce').isna() & _pay.notna()
data.drop(data.index[_unparseable.any(axis=1).to_numpy()], axis=0, inplace=True)

In [None]:
# Convert the three pay columns from text to numeric dtype.
for _col in ('BasePay', 'OvertimePay', 'OtherPay'):
    data[_col] = pd.to_numeric(data[_col])

In [None]:
# Statistical summary, re-run after the pay columns were converted to numeric.
data.describe()

In [None]:
# Mean BasePay for each Year, kept as a one-column DataFrame.
year_group = data.groupby('Year')
u = year_group['BasePay'].mean().to_frame()
u

In [None]:
# Bar chart of the yearly average base pay.
u.plot.bar()

In [None]:
# Mean BasePay for each JobTitle, rounded to two decimal places.
data.groupby('JobTitle')['BasePay'].mean().round(2)

In [None]:
# Complete information about the person with the highest TotalPay.
# idxmax() returns the index *label* (the Id) of the maximum, so the row must
# be fetched with label-based .loc; positional .iloc would return whichever
# row happens to sit at that integer offset.
data.loc[data['TotalPay'].idxmax()]

In [None]:
# Number of distinct job titles (missing values are not counted).
data['JobTitle'].nunique(dropna=True)

In [None]:
# The five most common job titles; value_counts() sorts by descending count.
data['JobTitle'].value_counts().iloc[:5]

In [None]:
# Job title(s) of the row(s) whose OvertimePay equals the column maximum.
data.loc[data['OvertimePay'] == data['OvertimePay'].max(), 'JobTitle']

In [None]:
# Number of distinct employee names per year.
# Select EmployeeName before calling nunique() so that only that column is
# counted, instead of computing nunique() for every column and discarding
# all but one.
num_employee=data.groupby('Year')
num_employ_per_year=pd.DataFrame(num_employee['EmployeeName'].nunique())
num_employ_per_year

In [None]:
# Bar chart of the per-year employee counts.
num_employ_per_year.plot.bar()

In [None]:
# Histogram showing the distribution of the TotalPay column.
sns.histplot(data['TotalPay'])

In [None]:
# Employees whose job title contains the text 'Police'.
# na=False makes rows with a missing JobTitle count as non-matches; without it
# str.contains returns NaN for them and the boolean selection fails.
# regex=False matches 'Police' as plain text rather than as a regex pattern.
police_employees=data[data['JobTitle'].str.contains('Police', na=False, regex=False)]
police_employees