In [1]:
import joblib
import json
import pandas

import sklearn.linear_model

# Instructions

- Read **the train data** from the CSV file and properly set the index


In [2]:
data_train = pandas.read_csv('./data/features.train.csv').set_index('id')
data_train

Unnamed: 0_level_0,fnlwgt,age-group,education-num,capitalgain,capitalloss,hoursperweek,workclass_self-emp-not-inc,workclass_private,workclass_self-emp-inc,workclass_local-gov,...,native-country_honduras,native-country_thailand,native-country_laos,native-country_peru,native-country_holand-netherlands,native-country_haiti,native-country_scotland,sex_male,sex_female,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2103,1.395470,2.0,14.0,0.0,4.0,2.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
14649,-0.873898,0.0,8.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7379,-0.592186,2.0,8.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
24479,-1.349595,2.0,9.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19532,2.041210,3.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8695,-1.405521,0.0,9.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2192,-0.821560,2.0,9.0,2.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8250,1.210247,0.0,8.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
18511,2.796123,2.0,13.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


# Instructions

- Instantiate a `sklearn.linear_model.LogisticRegression` model with the following configurations
```
    penalty       = None,
    fit_intercept = False,
    random_state  = 0,
    solver        = 'lbfgs',
```
- Fit the model with data columns properly selected
- Use `joblib.dump` to save the model to `./model/model.joblib`
- Also, use `.get_params`, `json.dump` to save the model configuration to  `./model/config.json`
- Print out all model parameters (not to be confused with model configuration)


In [3]:
model = sklearn.linear_model.LogisticRegression(
    penalty       = 'l2',
    fit_intercept = False,
    random_state  = 0,
    solver        = 'lbfgs',
    class_weight= 'balanced',
    C= 2
)

In [4]:
model.fit(
    data_train.drop(['label'], axis = 'columns'),
    data_train['label']
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
joblib.dump(model, './model/logistic_regression_model.joblib')

['./model/logistic_regression_model.joblib']

In [6]:
model.get_params()

{'C': 2,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': False,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [7]:
with open('./model/logistic_regression_model_config.json', 'w') as f:
    json.dump(model.get_params(), f)

# Instructions

- Type the the model equation $p(y=1|x_1, x_2)$ using the Markdown language 
- Print out thoses model parameters (not to be confused with model configuration)

$\mathrm{P}(y=1|x_1, x_2) = \dfrac{1}{1+e^{-(w_0 + w_1 x_1 + w_2 x_2)}}$

In [8]:
model.coef_

array([[ 0.07569563,  0.33205144,  0.2887415 ,  0.93161242,  0.44701635,
         0.53391401, -1.16403685, -0.76887547, -0.3165519 , -0.93519211,
        -0.34031625, -1.19758424, -0.01214421, -0.39918955, -1.56202251,
         0.59962488, -1.46064913, -1.2418957 ,  0.40488856, -1.06060426,
        -0.81323241,  0.246004  , -0.08440539,  0.14702832, -0.20749391,
        -0.62079294, -1.22232181, -0.94098389,  0.1557455 , -0.28772184,
        -1.47479423, -0.56788352, -0.27904911,  0.13995583, -0.13717761,
        -0.74385033, -0.74944248, -1.38135365,  0.37056167, -1.70506056,
        -0.92474523, -0.7368657 , -1.51028817, -1.11550085, -0.62235919,
        -1.14887666,  0.14993426, -1.02405036,  0.29831295,  0.01804766,
        -0.51109902,  0.12370594, -0.08242298, -0.02921964, -0.47887144,
         0.14792632, -0.21119462,  0.15415408,  0.54676026,  0.02847652,
        -0.2321472 , -0.15329633,  0.06461573,  0.21701881,  0.27236287,
        -0.05076195, -0.23122303, -0.13317917, -0.0

In [9]:
model.intercept_

array([0.])