# model trainig

First, we import pandas and some objects and functions from scikit-learn.

In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

Next, we read the prepared data and save it in a pandas dataframe named 
<code>df</code>.

In [43]:
df = pd.read_csv('adult_income_prepared.csv')
df

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_?,workclass_Federal-gov,workclass_Local-gov,...,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male
0,25,226802,7,0,0,40,True,False,False,False,...,False,True,False,False,False,False,True,False,False,True
1,38,89814,9,0,0,50,True,False,False,False,...,False,False,False,False,False,False,False,False,True,True
2,28,336951,12,0,0,40,True,False,False,True,...,False,False,False,False,False,False,False,False,True,True
3,44,160323,10,7688,0,40,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True
4,18,103497,10,0,0,30,True,True,False,False,...,False,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48702,27,257302,12,0,0,38,True,False,False,False,...,False,False,False,True,False,False,False,False,True,False
48703,40,154374,9,0,0,40,True,False,False,False,...,False,False,False,False,False,False,False,False,True,True
48704,58,151910,9,0,0,40,True,False,False,False,...,False,False,True,False,False,False,False,False,True,False
48705,22,201490,9,0,0,20,True,False,False,False,...,False,True,False,False,False,False,False,False,True,True


After defining X and y we need three sets of data:
<ol>
    <li>A trainig set to fit our model</li>
    <li>A validation set to find best hyper parameters for our model</li>
    <li>A test set to evaluate our model performance</li>
</ol>

In [32]:
#defining X and y
#splitting train set, validation set and test set

X = df.drop('income', axis=1)
y = df['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test ,test_size=0.5, random_state=42)

We should make sure there is no disparity in percentage of ones between the three y sets.
$$
    \mathrm{percentage\ of\ ones} = 
    \frac{\mathrm{number\ of\ ones}}{\mathrm{total\ number\ of\ samples}}
$$
$$
    \mathrm{average} =
    \frac{1 \times \mathrm{number\ of\ ones} + 0 \times \mathrm{number\ of\ zeroes}}
    {\mathrm{total\ number\ of\ samples}}
$$

As we can see percantage of ones in a binary array is equal to average value of the array.

In [33]:
#checking to see if distribution in sets are fair

print(y_val.mean())
print(y_train.mean())
print(y_test.mean())

0.2427194421657096
0.2365065800717826
0.24405250205086138


We have to scale our X sets. Here we use standard scaler. We could also use min max scaler.

In [34]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

If we want a high performance model we have to find the best hyper parameters. 
To do that we use train set to fit models with different hyper parameters and validation set to 
evaluate performance of these models. Finally with comparing the results we can find best hyper parameters.

As our data is unbalanced (number of ones in y is a third of number of zeroes), accuracy score is not a good metric for 
evaluating our model. We shall use recall score, precision score and f1 score for evaluation. Of course depending on 
our objective the importance of these metrics can vary. For example if identifing the most ones is of greater importance to us we shall put greater importance to recall score.

In [35]:
#making our output a scroolable box with limited height
# just something about visualisation

from IPython.display import HTML
HTML('''
<style>
.jp-Cell-outputArea {
    overflow-y: auto;
    max-height: 400px;
}
</style>
''')

### Logistic Regression


In the cell below we use logistic regresion models and evaluate their performance

In [None]:
from sklearn.linear_model import LogisticRegression

Cs = [0.01, 0.1, 1, 10]
max_iters = [100, 1000, 10000]
solvers = ['liblinear', 'lbfgs']
penalties = ['l1', 'l2']
class_weights = ['balanced', None]
counter = 1

for class_weight in class_weights:
    for penalty in penalties:
        for solver in solvers:
            for C in Cs:
                for max_iter in max_iters:

                    if (penalty=='l1' and solver=='lbfgs'):
                        continue
                        
                    model = LogisticRegression(
                        penalty=penalty,
                        C=C, 
                        solver=solver,
                        max_iter=max_iter,
                        class_weight=class_weight,
                        random_state=42
                    )
        
                    model.fit(X_train, y_train)
                    logisticreg_pred = model.predict(X_val)
                    f1 = f1_score(y_val,  logisticreg_pred)

                    print(f"{counter}   -----------------")
                    print(
                        f"C: {C}, "
                        f"max_iter: {max_iter}, "
                        f"solver: {solver}, "
                        f"penalty: {penalty}, "
                        f"class_weight: {class_weight}"
                    )
                    print(f"f1_score:{f1:.3f}")
                    counter += 1

1   -----------------
C: 0.01, max_iter: 100, solver: liblinear, penalty: l1, class_weight: balanced
precision: 0.562, recall: 0.851, f1:0.677
2   -----------------
C: 0.01, max_iter: 1000, solver: liblinear, penalty: l1, class_weight: balanced
precision: 0.562, recall: 0.851, f1:0.677
3   -----------------
C: 0.01, max_iter: 10000, solver: liblinear, penalty: l1, class_weight: balanced
precision: 0.562, recall: 0.851, f1:0.677
4   -----------------
C: 0.1, max_iter: 100, solver: liblinear, penalty: l1, class_weight: balanced
precision: 0.569, recall: 0.840, f1:0.678
5   -----------------
C: 0.1, max_iter: 1000, solver: liblinear, penalty: l1, class_weight: balanced
precision: 0.569, recall: 0.840, f1:0.678
6   -----------------
C: 0.1, max_iter: 10000, solver: liblinear, penalty: l1, class_weight: balanced
precision: 0.569, recall: 0.840, f1:0.678
7   -----------------
C: 1, max_iter: 100, solver: liblinear, penalty: l1, class_weight: balanced
precision: 0.571, recall: 0.840, f1:0.680

### Support Vector Machine


In the cell below we use SVM models and evaluate their performance

In [None]:
from sklearn.svm import SVC

kernels = ['linear', 'rbf']
class_weights = ['balanced', None]
counter = 1

for class_weight in class_weights:
    for kernel in kernels:                       
        model = SVC(
            kernel=kernel,
            C=0.1,
            class_weight=class_weight,
        )

        model.fit(X_train, y_train)
        svm_pred = model.predict(X_val)
        f1 = f1_score(y_val,  svm_pred)

        print(f"{counter}   -----------------")
        print(
            f"kernel: {kernel}, "
            f"class_weight: {class_weight}"
        )
        print(f"f1_score:{f1:.3f}")
        counter += 1

1   -----------------
kernel: linear, class_weight: balanced
precision: 0.546, recall: 0.854, f1:0.666
2   -----------------
kernel: rbf, class_weight: balanced
precision: 0.542, recall: 0.829, f1:0.655
3   -----------------
kernel: linear, class_weight: None
precision: 0.741, recall: 0.585, f1:0.654
4   -----------------
kernel: rbf, class_weight: None
precision: 0.744, recall: 0.549, f1:0.632


### XGBoost

In [36]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective='binary:logistic',
                               scale_pos_weight=3, #number of zeroes/number of ones
                               random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_val)
f1 = f1_score(y_val, xgb_pred)
print(f"f1-score:{f1:.3f}")

f1-score:0.714


In this problem our goal is to maximize f1 score. The highest f1 score belongs to XGboost model and the difference is substantial. Therefore it will bo our choice.

In [None]:
# rejoining trainig set and validation set now that validation set has served it's purpose

X_train = np.concatenate([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# making a xgboost model(final choice)
model = xgb_model = xgb.XGBClassifier(objective='binary:logistic',
                               scale_pos_weight=3, #number of zeroes/number of ones
                               random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print(f"final model f1_score: {f1:.3f}")

final model f1_score: 0.717


In [39]:
from sklearn.decomposition import PCA

pca = PCA(n_components=30)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)