In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

# custom module imports
# import acquire as aq
# import prepare as pr
# import explore as ex

# feature selection imports
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import RFE

# import scaling methods
from sklearn.preprocessing import RobustScaler, StandardScaler
from scipy import stats
from sklearn.model_selection import train_test_split

# import modeling methods
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score
from scipy import stats

# import to remove warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('salaries.csv')

In [3]:
df.head()

Unnamed: 0,AGY,NAME,LASTNAME,FIRSTNAME,MI,JOBCLASS,JC TITLE,RACE,SEX,EMPTYPE,...,RATE,HRSWKD,MONTHLY,ANNUAL,STATENUM,duplicated,multiple_full_time_jobs,combined_multiple_jobs,summed_annual_salary,hide_from_search
0,101,SENATE ...,GILLIAM,STACEY,L,7101,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,...,0.0,20.0,8100.0,97200.0,339371,True,,,181200.0,
1,104,LEGISLATIVE BUDGET BOARD ...,GILLIAM,STACEY,L,C160,COMMITTEE DIRECTOR ...,WHITE,FEMALE,URP - UNCLASSIFIED REGULAR PART-TIME,...,0.0,20.0,7000.0,84000.0,339371,True,,,,True
2,101,SENATE ...,NELSON,DAVID,,7101,LEG. OFFICIAL/ADMINISTRATOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,...,0.0,20.0,9500.0,114000.0,193187,True,,,210000.0,
3,104,LEGISLATIVE BUDGET BOARD ...,NELSON,DAVID,,P080,SENIOR BUDGET ADVISOR ...,WHITE,MALE,URP - UNCLASSIFIED REGULAR PART-TIME,...,0.0,20.0,8000.0,96000.0,193187,True,,,,True
4,101,SENATE ...,ROCHA,MARIE,S,7103,LEG. SERVICE/MAINTENANCE ...,HISPANIC,FEMALE,URF - UNCLASSIFIED REGULAR FULL-TIME,...,0.0,41.0,3365.4,40384.8,152257,True,,True,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144738 entries, 0 to 144737
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   AGY                      144738 non-null  int64  
 1   NAME                     144738 non-null  object 
 2   LASTNAME                 144738 non-null  object 
 3   FIRSTNAME                144738 non-null  object 
 4   MI                       144738 non-null  object 
 5   JOBCLASS                 144738 non-null  object 
 6   JC TITLE                 144738 non-null  object 
 7   RACE                     144738 non-null  object 
 8   SEX                      144738 non-null  object 
 9   EMPTYPE                  144738 non-null  object 
 10  HIREDT                   144738 non-null  object 
 11  RATE                     144738 non-null  float64
 12  HRSWKD                   144738 non-null  float64
 13  MONTHLY                  144738 non-null  float64
 14  ANNU

In [5]:
df.RACE.value_counts()

WHITE              64651
HISPANIC           39557
BLACK              33964
ASIAN               4419
OTHER               1453
AM INDIAN            694
Name: RACE, dtype: int64

In [6]:
df['JC TITLE'].value_counts()

CORREC  OFFICER IV                                    8753
CORREC  OFFICER V                                     7350
TEXAS WORKS ADVISOR II                                3812
CORREC OFFCR III                                      3652
DIRECT SUPPORT PROFESSIONAL I                         2484
                                                      ... 
INVESTIGATIVE ANALYST                                    1
EXECUTIVE DIRECTOR FOR ERS                               1
MILITARY SPECIALIST V                                    1
EXECUTIVE DIRECTOR/SECRETARY                             1
ADMINISTRATIVE ASSISTANT IV                              1
Name: JC TITLE, Length: 1406, dtype: int64

In [7]:
plt.df.ANNUAL(kind='bar')

AttributeError: module 'matplotlib.pyplot' has no attribute 'df'

In [10]:
train, test = train_test_split(df, train_size=0.8, random_state=123)
train, validate = train_test_split(train, train_size=0.7, random_state=123)

In [12]:
train.shape, validate.shape, test.shape

((81053, 21), (34737, 21), (28948, 21))

In [14]:
train.columns

Index(['AGY', 'NAME', 'LASTNAME', 'FIRSTNAME', 'MI', 'JOBCLASS', 'JC TITLE',
       'RACE', 'SEX', 'EMPTYPE', 'HIREDT', 'RATE', 'HRSWKD', 'MONTHLY',
       'ANNUAL', 'STATENUM', 'duplicated', 'multiple_full_time_jobs',
       'combined_multiple_jobs', 'summed_annual_salary', 'hide_from_search'],
      dtype='object')

In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81053 entries, 122903 to 13742
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   AGY                      81053 non-null  int64  
 1   NAME                     81053 non-null  object 
 2   LASTNAME                 81053 non-null  object 
 3   FIRSTNAME                81053 non-null  object 
 4   MI                       81053 non-null  object 
 5   JOBCLASS                 81053 non-null  object 
 6   JC TITLE                 81053 non-null  object 
 7   RACE                     81053 non-null  object 
 8   SEX                      81053 non-null  object 
 9   EMPTYPE                  81053 non-null  object 
 10  HIREDT                   81053 non-null  object 
 11  RATE                     81053 non-null  float64
 12  HRSWKD                   81053 non-null  float64
 13  MONTHLY                  81053 non-null  float64
 14  ANNUAL           

In [None]:
# assign X and y variables in all our split dfs
X_train = train[['bathroomcnt', 'bedroomcnt','buildingqualitytypeid', 'calculatedfinishedsquarefeet', 
                'latitude', 'longitude', 'lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
                'landtaxvaluedollarcnt', 'taxamount', 'LA_county', 'orange_county', 'ventura_county', 'age_of_home']]
y_train = train[['ANNUAL']]

X_validate = validate[['bathroomcnt', 'bedroomcnt','buildingqualitytypeid', 'calculatedfinishedsquarefeet', 
                'latitude', 'longitude', 'lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
                'landtaxvaluedollarcnt', 'taxamount', 'LA_county', 'orange_county', 'ventura_county', 'age_of_home']]
y_validate = validate[['logerror']]

X_test = test[['bathroomcnt', 'bedroomcnt','buildingqualitytypeid', 'calculatedfinishedsquarefeet', 
                'latitude', 'longitude', 'lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
                'landtaxvaluedollarcnt', 'taxamount', 'LA_county', 'orange_county', 'ventura_county', 'age_of_home']]
y_test = test[['logerror']]