In [42]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import joblib

In [43]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,annual-income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [44]:
df.shape

(32561, 15)

In [45]:
df = df.replace('?',pd.NA)

In [46]:
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
annual-income        0
dtype: int64

In [47]:
df = df.dropna()

In [48]:
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
annual-income     0
dtype: int64

In [49]:
df.shape

(30162, 15)

In [50]:
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,annual-income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [51]:
columns_to_drop = ["fnlwgt", "education", "native-country"]
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,annual-income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,<=50K


In [52]:
le = LabelEncoder()
df.sex = le.fit_transform(df.sex)
df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,annual-income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,<=50K


In [53]:
print(df["workclass"].unique())
print()
print(df["marital-status"].unique())
print()
print(df["occupation"].unique())
print()
print(df["relationship"].unique())
print()
print(df["race"].unique())

['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov'
 'Self-emp-inc' 'Without-pay']

['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']

['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Transport-moving' 'Farming-fishing'
 'Machine-op-inspct' 'Tech-support' 'Craft-repair' 'Protective-serv'
 'Armed-Forces' 'Priv-house-serv']

['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']

['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']


In [54]:
workclass = pd.get_dummies(df["workclass"])
maritalstatus = pd.get_dummies(df["marital-status"])
occupation = pd.get_dummies(df["occupation"])
relationship = pd.get_dummies(df["relationship"])
race = pd.get_dummies(df["race"])

In [55]:
df = df.drop(
    ["workclass", "marital-status", "occupation", "relationship", "race"],
    axis="columns",
)
df.head()

Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,annual-income
0,39,13,1,2174,0,40,<=50K
1,50,13,1,0,0,13,<=50K
2,38,9,1,0,0,40,<=50K
3,53,7,1,0,0,40,<=50K
4,28,13,0,0,0,40,<=50K


In [56]:
df = pd.concat(
    [df, workclass, maritalstatus, occupation, relationship, race], axis="columns"
)
df.head()

Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,annual-income,Federal-gov,Local-gov,Private,...,Not-in-family,Other-relative,Own-child,Unmarried,Wife,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
0,39,13,1,2174,0,40,<=50K,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,50,13,1,0,0,13,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,38,9,1,0,0,40,<=50K,0,0,1,...,1,0,0,0,0,0,0,0,0,1
3,53,7,1,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,28,13,0,0,0,40,<=50K,0,0,1,...,0,0,0,0,1,0,0,1,0,0


In [57]:
df['annual-income']  = df['annual-income'].apply(lambda x : 0 if x == '<=50K' else 1)
df.head()

Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,annual-income,Federal-gov,Local-gov,Private,...,Not-in-family,Other-relative,Own-child,Unmarried,Wife,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
0,39,13,1,2174,0,40,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,50,13,1,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,38,9,1,0,0,40,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
3,53,7,1,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,28,13,0,0,0,40,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0


In [58]:
data = df.drop("annual-income", axis="columns")
target = df["annual-income"]

In [59]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [60]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
lr.score(X_test, y_test)

0.8367313111221615

In [62]:
joblib.dump(lr, "Incomes-Classification.joblib")

['Incomes-Classification.joblib']

<h1>PCA</h1>

In [63]:
data.head()

Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,Federal-gov,Local-gov,Private,Self-emp-inc,...,Not-in-family,Other-relative,Own-child,Unmarried,Wife,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
0,39,13,1,2174,0,40,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,50,13,1,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,38,9,1,0,0,40,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,53,7,1,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,13,0,0,0,40,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0


In [64]:
pca = PCA(0.99)
X_pca = pca.fit_transform(data)
X_pca

array([[ 1082.14645266],
       [-1091.85204692],
       [-1091.85043223],
       ...,
       [-1091.84759224],
       [-1091.85531318],
       [13932.12788736]])

In [65]:
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, target, test_size=0.2,random_state=42)

In [66]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [67]:
lr.score(X_test, y_test)

0.8367313111221615