In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from starter.ml.data import process_data
from sklearn.ensemble import RandomForestClassifier

In [2]:
!ls

[31m08-final-machine-learning-pipeline.ipynb[m[m
EDA.ipynb
README.md
[34m__pycache__[m[m
[34mdata[m[m
dvc_on_heroku_instructions.md
main.py
[34mmodel[m[m
model_card_template.md
requirements.txt
sanitycheck.py
[34mscreenshots[m[m
setup.py
[34mstarter[m[m


In [3]:
df = pd.read_csv('data/census.csv')
df.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
#remove spaces from col names
df.columns = [colname.replace(' ', '') for colname in df.columns]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlgt           32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
#from ydata_profiling import ProfileReport
#profile = ProfileReport(df, title="Profiling Report")
#profile.to_notebook_iframe()

In [7]:
train, test = train_test_split(df, test_size=0.20, random_state = 0)

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26048 entries, 15282 to 2732
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             26048 non-null  int64 
 1   workclass       26048 non-null  object
 2   fnlgt           26048 non-null  int64 
 3   education       26048 non-null  object
 4   education-num   26048 non-null  int64 
 5   marital-status  26048 non-null  object
 6   occupation      26048 non-null  object
 7   relationship    26048 non-null  object
 8   race            26048 non-null  object
 9   sex             26048 non-null  object
 10  capital-gain    26048 non-null  int64 
 11  capital-loss    26048 non-null  int64 
 12  hours-per-week  26048 non-null  int64 
 13  native-country  26048 non-null  object
 14  salary          26048 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.2+ MB


In [9]:
categorical_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]
label="salary"
training=True

In [10]:
y = train[label]
X = train.drop([label], axis=1)

In [11]:
X.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
15282,36,Private,174308,11th,7,Divorced,Transport-moving,Not-in-family,White,Male,0,0,40,United-States
24870,35,Private,198202,HS-grad,9,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,54,United-States
18822,38,Private,52963,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,50,United-States
26404,50,Private,138270,HS-grad,9,Married-civ-spouse,Sales,Wife,Black,Female,0,0,40,United-States
7842,68,Self-emp-not-inc,116903,Assoc-voc,11,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,2149,40,United-States


In [12]:
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=categorical_features, label="salary", training=True)




In [13]:
print(X_train.shape)
print(y_train.shape)

X_train[:1]

(26048, 108)
(26048,)


array([[3.60000e+01, 1.74308e+05, 7.00000e+00, 0.00000e+00, 0.00000e+00,
        4.00000e+01, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00

In [14]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
#rf = RandomForestClassifier()
lr.fit(X_train, y_train)

In [17]:
pred = lr.predict(X_train)

In [18]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_train, pred))
accuracy_score(y_train, pred)

              precision    recall  f1-score   support

           0       0.81      0.97      0.88     19802
           1       0.72      0.26      0.38      6246

    accuracy                           0.80     26048
   macro avg       0.76      0.61      0.63     26048
weighted avg       0.78      0.80      0.76     26048



0.7981035012285013