In [1]:
import math
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns  
import matplotlib.pyplot as plt
from scipy import stats
from datetime import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
import sklearn.preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression, RFE

## Acquiring Our Dataframe

In [2]:
df = pd.read_csv('texas.csv', index_col=0) ## reading our csv into a pandas dataframe

df.head() ## previewing our data

Unnamed: 0_level_0,NAME,LASTNAME,FIRSTNAME,MI,JOBCLASS,JC TITLE,RACE,SEX,EMPTYPE,HIREDT,RATE,HRSWKD,MONTHLY,ANNUAL,STATENUM,duplicated,multiple_full_time_jobs,combined_multiple_jobs,summed_annual_salary,hide_from_search
AGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
101,SENATE ...,GILLIAM,STACEY,L,7101,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,07/01/13,0.0,20.0,8100.0,97200.0,339371,True,,,181200.0,
104,LEGISLATIVE BUDGET BOARD ...,GILLIAM,STACEY,L,C160,COMMITTEE DIRECTOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,11/01/17,0.0,20.0,7000.0,84000.0,339371,True,,,,True
101,SENATE ...,NELSON,DAVID,,7101,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,12/01/98,0.0,20.0,9500.0,114000.0,193187,True,,,210000.0,
104,LEGISLATIVE BUDGET BOARD ...,NELSON,DAVID,,P080,SENIOR BUDGET ADVISOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,11/01/17,0.0,20.0,8000.0,96000.0,193187,True,,,,True
101,SENATE ...,ROCHA,MARIE,S,7103,LEG. SERVICE/MAINTENANCE ...,HISPANIC,FEMALE,URF - UNCLASSIFIED REGULAR FULL-TIME,05/01/03,0.0,41.0,3365.4,40384.8,152257,True,,True,,


In [3]:
df.shape ## checking the shape of our dataframe

(144738, 20)

In [4]:
## checking our column data types

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144738 entries, 101 to 809
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   NAME                     144738 non-null  object 
 1   LASTNAME                 144738 non-null  object 
 2   FIRSTNAME                144738 non-null  object 
 3   MI                       144738 non-null  object 
 4   JOBCLASS                 144738 non-null  object 
 5   JC TITLE                 144738 non-null  object 
 6   RACE                     144738 non-null  object 
 7   SEX                      144738 non-null  object 
 8   EMPTYPE                  144738 non-null  object 
 9   HIREDT                   144738 non-null  object 
 10  RATE                     144738 non-null  float64
 11  HRSWKD                   144738 non-null  float64
 12  MONTHLY                  144738 non-null  float64
 13  ANNUAL                   144738 non-null  float64
 14  STATE

## Preparing Our Dataframe

### Prepare: Clean & Prep

In [5]:
type(df)

pandas.core.frame.DataFrame

In [6]:
## first let's rename all the columns to lowercase for easeier workflow

df.rename(str.lower, axis='columns', inplace=True)

In [7]:
## now let's remove any potential leading whitesapce

df.columns = df.columns.str.strip()

In [8]:
df.head() ## previewing our changes

Unnamed: 0_level_0,name,lastname,firstname,mi,jobclass,jc title,race,sex,emptype,hiredt,rate,hrswkd,monthly,annual,statenum,duplicated,multiple_full_time_jobs,combined_multiple_jobs,summed_annual_salary,hide_from_search
AGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
101,SENATE ...,GILLIAM,STACEY,L,7101,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,07/01/13,0.0,20.0,8100.0,97200.0,339371,True,,,181200.0,
104,LEGISLATIVE BUDGET BOARD ...,GILLIAM,STACEY,L,C160,COMMITTEE DIRECTOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,11/01/17,0.0,20.0,7000.0,84000.0,339371,True,,,,True
101,SENATE ...,NELSON,DAVID,,7101,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,12/01/98,0.0,20.0,9500.0,114000.0,193187,True,,,210000.0,
104,LEGISLATIVE BUDGET BOARD ...,NELSON,DAVID,,P080,SENIOR BUDGET ADVISOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,11/01/17,0.0,20.0,8000.0,96000.0,193187,True,,,,True
101,SENATE ...,ROCHA,MARIE,S,7103,LEG. SERVICE/MAINTENANCE ...,HISPANIC,FEMALE,URF - UNCLASSIFIED REGULAR FULL-TIME,05/01/03,0.0,41.0,3365.4,40384.8,152257,True,,True,,


In [9]:
df.columns ## previewing our columns

Index(['name', 'lastname', 'firstname', 'mi', 'jobclass', 'jc title', 'race',
       'sex', 'emptype', 'hiredt', 'rate', 'hrswkd', 'monthly', 'annual',
       'statenum', 'duplicated', 'multiple_full_time_jobs',
       'combined_multiple_jobs', 'summed_annual_salary', 'hide_from_search'],
      dtype='object')

In [10]:
## Let's drop unneccasry columns that won't be any help with predicting our
## target variable because they are either incomplete or unsignificant information

df = df.drop(columns = ['jobclass','mi', 'rate', 'statenum', 'duplicated', 'multiple_full_time_jobs',
       'combined_multiple_jobs', 'summed_annual_salary', 'hide_from_search'])

In [11]:
df.head() ## previewing our changes

Unnamed: 0_level_0,name,lastname,firstname,jc title,race,sex,emptype,hiredt,hrswkd,monthly,annual
AGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
101,SENATE ...,GILLIAM,STACEY,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,07/01/13,20.0,8100.0,97200.0
104,LEGISLATIVE BUDGET BOARD ...,GILLIAM,STACEY,COMMITTEE DIRECTOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,11/01/17,20.0,7000.0,84000.0
101,SENATE ...,NELSON,DAVID,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,12/01/98,20.0,9500.0,114000.0
104,LEGISLATIVE BUDGET BOARD ...,NELSON,DAVID,SENIOR BUDGET ADVISOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,11/01/17,20.0,8000.0,96000.0
101,SENATE ...,ROCHA,MARIE,LEG. SERVICE/MAINTENANCE ...,HISPANIC,FEMALE,URF - UNCLASSIFIED REGULAR FULL-TIME,05/01/03,41.0,3365.4,40384.8


In [12]:
## renaming columns for ease of workflow

df = df.rename(columns = {'jc title': 'title', 
                          'hiredt': 'hire_date', 
                          'hrswkd': 'hours_worked',
                          'name': 'agency',
                          'agy': 'agency_id',
                          'monthly': 'monthly_salary',
                          'annual': 'annual_salary'})

df.head() ## previewing our changes

Unnamed: 0_level_0,agency,lastname,firstname,title,race,sex,emptype,hire_date,hours_worked,monthly_salary,annual_salary
AGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
101,SENATE ...,GILLIAM,STACEY,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,07/01/13,20.0,8100.0,97200.0
104,LEGISLATIVE BUDGET BOARD ...,GILLIAM,STACEY,COMMITTEE DIRECTOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,11/01/17,20.0,7000.0,84000.0
101,SENATE ...,NELSON,DAVID,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,12/01/98,20.0,9500.0,114000.0
104,LEGISLATIVE BUDGET BOARD ...,NELSON,DAVID,SENIOR BUDGET ADVISOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,11/01/17,20.0,8000.0,96000.0
101,SENATE ...,ROCHA,MARIE,LEG. SERVICE/MAINTENANCE ...,HISPANIC,FEMALE,URF - UNCLASSIFIED REGULAR FULL-TIME,05/01/03,41.0,3365.4,40384.8


In [13]:
## changing hire date to date time format

df.hire_date = pd.to_datetime(df.hire_date)

df.info() ## previewing data type changes

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144738 entries, 101 to 809
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   agency          144738 non-null  object        
 1   lastname        144738 non-null  object        
 2   firstname       144738 non-null  object        
 3   title           144738 non-null  object        
 4   race            144738 non-null  object        
 5   sex             144738 non-null  object        
 6   emptype         144738 non-null  object        
 7   hire_date       144738 non-null  datetime64[ns]
 8   hours_worked    144738 non-null  float64       
 9   monthly_salary  144738 non-null  float64       
 10  annual_salary   144738 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(7)
memory usage: 13.3+ MB


In [14]:
df.isnull().sum() ## checking for nulls

## BEAUTIFUL RESULTS :D

agency            0
lastname          0
firstname         0
title             0
race              0
sex               0
emptype           0
hire_date         0
hours_worked      0
monthly_salary    0
annual_salary     0
dtype: int64

In [15]:
df.duplicated().sum() ## checking for duplicate rows.

5

In [16]:
## dropping the duplciates rows
df = df.drop_duplicates() 

In [17]:
df.duplicated().sum() ## rechecking for duplicate rows.

0

In [18]:
## checking our dataframe shape after cleaning
df.shape

(144733, 11)

### Prepare Takeaways

* Many of the columns were capitalized and were changed to all lowercase characters. Some of the column names had spaces that were also taken removed. 
* After dropping unneccessary columns we checked for nulls and found zero, the data was already complete.  
* We now have a clean workable dataframe and can move on to feature engineering


### Prepare: Create Features

In [19]:
## previewing our dataframe
df.head(3)

Unnamed: 0_level_0,agency,lastname,firstname,title,race,sex,emptype,hire_date,hours_worked,monthly_salary,annual_salary
AGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
101,SENATE ...,GILLIAM,STACEY,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,2013-07-01,20.0,8100.0,97200.0
104,LEGISLATIVE BUDGET BOARD ...,GILLIAM,STACEY,COMMITTEE DIRECTOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,2017-11-01,20.0,7000.0,84000.0
101,SENATE ...,NELSON,DAVID,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,1998-12-01,20.0,9500.0,114000.0


In [20]:
## one hot encoding column for gender

df['is_female'] = np.where(df.sex == 'FEMALE', 1, 0)

In [21]:
df.is_female.value_counts()

0    144733
Name: is_female, dtype: int64

In [22]:
## looking at the different races in the dataframe
df.race.value_counts()

WHITE              64650
HISPANIC           39554
BLACK              33963
ASIAN               4419
OTHER               1453
AM INDIAN            694
Name: race, dtype: int64

In [23]:
## one hot encoding the top three races in the dataframe

df['is_white'] = np.where(df.race == 'WHITE', 1, 0)
df['is_hispanic'] = np.where(df.race == 'HISPANIC', 1, 0)
df['is_black'] = np.where(df.race == 'BLACK', 1, 0)

In [24]:
df.is_hispanic.value_counts()

0    144733
Name: is_hispanic, dtype: int64

In [25]:
df.head() ## previewing changes

Unnamed: 0_level_0,agency,lastname,firstname,title,race,sex,emptype,hire_date,hours_worked,monthly_salary,annual_salary,is_female,is_white,is_hispanic,is_black
AGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
101,SENATE ...,GILLIAM,STACEY,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,2013-07-01,20.0,8100.0,97200.0,0,0,0,0
104,LEGISLATIVE BUDGET BOARD ...,GILLIAM,STACEY,COMMITTEE DIRECTOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,2017-11-01,20.0,7000.0,84000.0,0,0,0,0
101,SENATE ...,NELSON,DAVID,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,1998-12-01,20.0,9500.0,114000.0,0,0,0,0
104,LEGISLATIVE BUDGET BOARD ...,NELSON,DAVID,SENIOR BUDGET ADVISOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,2017-11-01,20.0,8000.0,96000.0,0,0,0,0
101,SENATE ...,ROCHA,MARIE,LEG. SERVICE/MAINTENANCE ...,HISPANIC,FEMALE,URF - UNCLASSIFIED REGULAR FULL-TIME,2003-05-01,41.0,3365.4,40384.8,0,0,0,0


In [26]:
label_encoder = LabelEncoder() ## creating my label encoder

## creating a race column that is incoded for machine readable formate
df['race_encoded'] = label_encoder.fit_transform(df['race'])

In [27]:
df.race_encoded.value_counts() ## previewing changes

5    64650
3    39554
2    33963
1     4419
4     1453
0      694
Name: race_encoded, dtype: int64

In [28]:
df.race.value_counts() ## comparing changes

WHITE              64650
HISPANIC           39554
BLACK              33963
ASIAN               4419
OTHER               1453
AM INDIAN            694
Name: race, dtype: int64

In [29]:
df.head() ## previewing changes

Unnamed: 0_level_0,agency,lastname,firstname,title,race,sex,emptype,hire_date,hours_worked,monthly_salary,annual_salary,is_female,is_white,is_hispanic,is_black,race_encoded
AGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
101,SENATE ...,GILLIAM,STACEY,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,2013-07-01,20.0,8100.0,97200.0,0,0,0,0,5
104,LEGISLATIVE BUDGET BOARD ...,GILLIAM,STACEY,COMMITTEE DIRECTOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,2017-11-01,20.0,7000.0,84000.0,0,0,0,0,5
101,SENATE ...,NELSON,DAVID,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,1998-12-01,20.0,9500.0,114000.0,0,0,0,0,5
104,LEGISLATIVE BUDGET BOARD ...,NELSON,DAVID,SENIOR BUDGET ADVISOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,2017-11-01,20.0,8000.0,96000.0,0,0,0,0,5
101,SENATE ...,ROCHA,MARIE,LEG. SERVICE/MAINTENANCE ...,HISPANIC,FEMALE,URF - UNCLASSIFIED REGULAR FULL-TIME,2003-05-01,41.0,3365.4,40384.8,0,0,0,0,3


In [30]:
## creating a tenure in months column by subrtracting the hire date from the last updated 
## date of the dataframe (7/1/21) and dividing it by a time delta 

df['tenure_months'] = np.round((pd.to_datetime('2021-07-01') - df['hire_date'])/np.timedelta64(1,'M'))

## casting tenure in months as an ints
df['tenure_months'] = df['tenure_months'].astype(int)

In [31]:
df['tenure_years'] = np.round(df['tenure_months'] / 12, 1)

In [32]:
df.head() ## previewing changes

Unnamed: 0_level_0,agency,lastname,firstname,title,race,sex,emptype,hire_date,hours_worked,monthly_salary,annual_salary,is_female,is_white,is_hispanic,is_black,race_encoded,tenure_months,tenure_years
AGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
101,SENATE ...,GILLIAM,STACEY,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,2013-07-01,20.0,8100.0,97200.0,0,0,0,0,5,96,8.0
104,LEGISLATIVE BUDGET BOARD ...,GILLIAM,STACEY,COMMITTEE DIRECTOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,2017-11-01,20.0,7000.0,84000.0,0,0,0,0,5,44,3.7
101,SENATE ...,NELSON,DAVID,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,1998-12-01,20.0,9500.0,114000.0,0,0,0,0,5,271,22.6
104,LEGISLATIVE BUDGET BOARD ...,NELSON,DAVID,SENIOR BUDGET ADVISOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,2017-11-01,20.0,8000.0,96000.0,0,0,0,0,5,44,3.7
101,SENATE ...,ROCHA,MARIE,LEG. SERVICE/MAINTENANCE ...,HISPANIC,FEMALE,URF - UNCLASSIFIED REGULAR FULL-TIME,2003-05-01,41.0,3365.4,40384.8,0,0,0,0,3,218,18.2


In [33]:
df['tenure_months'].max(), df['tenure_months'].min()

(597, -576)

In [34]:
df[df.tenure_months < 0]

Unnamed: 0_level_0,agency,lastname,firstname,title,race,sex,emptype,hire_date,hours_worked,monthly_salary,annual_salary,is_female,is_white,is_hispanic,is_black,race_encoded,tenure_months,tenure_years
AGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
102,HOUSE OF REPRESENTATIVES ...,CRADDICK,TOM,ELECTED OFFICIAL ...,WHITE,MALE,URF - UNCLASSIFIED REGULAR FULL-TIME,2069-01-14,40.0,600.0,7200.0,0,0,0,0,5,-570,-47.5
696,TEXAS DEPARTMENT OF CRIMINAL JUSTICE ...,JUDD,ARTHUR,AGRICULTURE SPEC V ...,WHITE,MALE,CRF - CLASSIFIED REGULAR FULL-TIME,2069-07-14,40.0,4033.74,48404.88,0,0,0,0,5,-576,-48.0
592,SOIL AND WATER CONSERVATION BOARD ...,BRANDENBERGER,DONALD,PROGRAM SPECIALIST VII ...,WHITE,MALE,CRF - CLASSIFIED REGULAR FULL-TIME,2068-12-01,40.0,7658.41,91900.92,0,0,0,0,5,-569,-47.4


In [35]:
## getting rid of clerical errors

df = df.drop(index=[794, 118710, 144495])

KeyError: '[794 118710 144495] not found in axis'

In [None]:
df[df.tenure_months == 0]

In [None]:
df['tenure_months'].max(), df['tenure_months'].min()

In [None]:
df.head()

In [None]:
df.is_white.value_counts()