In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_boston

In [2]:
dataset = load_boston()
X = pd.DataFrame(dataset.data)
X.columns = dataset.feature_names
y = dataset.target

print(f'X - {X.shape}\n'\
      f'y - {y.shape}')

X - (506, 13)
y - (506,)


### 1. Разделите выборку на обучающую и тестовую в отношении 80%/20%

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=RANDOM_STATE)

print(f'X_train - {X_train.shape}\n'\
      f'y_train - {y_train.shape}\n'\
      f'X_test - {X_test.shape}\n'\
      f'y_test - {y_test.shape}')

X_train - (404, 13)
y_train - (404,)
X_test - (102, 13)
y_test - (102,)


### 2. Обучите стандартную регрессию, а также Ridge и  Lasso с параметрами по умолчанию и выведите их R2 на тестовой выборке

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html

In [5]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
def get_r2_score(model, X, y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=RANDOM_STATE)
    
    model.fit(X_train, y_train)
    
    y_pred_train = model.predict(X_train)
    r2_train = r2_score(y_train, y_pred_train)
    
    y_pred_test = model.predict(X_test)
    r2_test = r2_score(y_test, y_pred_test)
    
    return r2_train, r2_test

In [7]:
models = [
    {'name':'LinearRegression', 'model':LinearRegression()}, 
    {'name':'Ridge', 'model':Ridge()}, 
    {'name':'Lasso', 'model':Lasso()}
]

for model in models:
    model_name = model['name']
    model = model['model']
    r2_score_train, r2_score_test = get_r2_score(model, X, y)
    print(f'{model_name}\n'\
          #f'r2 (train) - {r2_score_train}\n'\
          f'r2 (test) - {r2_score_test:.3f}\n')

LinearRegression
r2 (test) - 0.669

Ridge
r2 (test) - 0.666

Lasso
r2 (test) - 0.667



### 3. Для Ridge и Lasso подберите коэффициент регуляризации(используйте GridSearchCV, RidgeCV, LassoCV) в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по лучшим моделям и сравните с предыдущими результатами. Напишите как изменился результат

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV, LassoCV

In [9]:
alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e+1, 1e+2, 1e+3, 1e+4, 1e+5]

In [10]:
def get_search_cv(model, X, y, grid_search, alphas=alphas, scoring='r2'):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=RANDOM_STATE)
    
    if grid_search:
        params = [{'alpha': alphas}]
        search = GridSearchCV(model(), params, scoring=scoring).fit(X_train, y_train)
        best_alpha = search.best_params_['alpha']
    else:
        search = model(alphas=alphas).fit(X_train, y_train)
        best_alpha = search.alpha_
        
    score_train = search.score(X_train, y_train)
    score_test = search.score(X_test, y_test)
    
    return score_train, score_test, best_alpha

In [11]:
models = [
    {'name':'Ridge', 'clf':Ridge, 'grid_search':True}, 
    {'name':'RidgeCV', 'clf':RidgeCV, 'grid_search':False},
    {'name':'Lasso', 'clf':Lasso, 'grid_search':True},
    {'name':'LassoCV', 'clf':LassoCV, 'grid_search':False}
]

for model in models:
    model_name = model['name']
    clf = model['clf']
    grid_search = model['grid_search']
    
    r2_score_train, r2_score_test, best_alpha = get_search_cv(clf, X, y, grid_search)
    
    print(f'{model_name}\n'\
          #f'r2 (train) - {r2_score_train}\n'\
          f'r2 (test) - {r2_score_test:.6f}\n'\
          f'best alpha - {best_alpha}\n')

Ridge
r2 (test) - 0.668759
best alpha - 1e-05

RidgeCV
r2 (test) - 0.668751
best alpha - 0.01

Lasso
r2 (test) - 0.668760
best alpha - 1e-05

LassoCV
r2 (test) - 0.668760
best alpha - 1e-05



Результат незначительно улучшился. RidgeCV и GridSearchCV(Ridge) выявили разные лучшие коэффициенты регуляризации.

### 4. Проведите масштабирование выборки(используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [12]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [13]:
models = [
    {'name':'Ridge after StandardScaler', 'clf':Ridge(), 'scaler':StandardScaler()},
    {'name':'Ridge after MinMaxScaler', 'clf':Ridge(), 'scaler':MinMaxScaler()},
    {'name':'Lasso after StandardScaler', 'clf':Lasso(), 'scaler':StandardScaler()},
    {'name':'Lasso after MinMaxScaler', 'clf':Lasso(), 'scaler':MinMaxScaler()}
]


for model in models:
    model_name = model['name']
    clf = model['clf']
    scaler = model['scaler']
    
    pipe = Pipeline(steps=[('scaler', scaler), ('clf', clf)]).fit(X_train, y_train)
    score = pipe.score(X_test, y_test)
    
    print(f'{model_name}\n'\
          f'{score}\n')

Ridge after StandardScaler
0.6684624359643564

Ridge after MinMaxScaler
0.6764100365423611

Lasso after StandardScaler
0.6239428734251422

Lasso after MinMaxScaler
0.2573921442545194



Лучший результат у Ridge with MinMaxScaler. Lasso with StandardScaler ухудшился. Lasso with MinMaxScaler сильно ухудшился.

### 5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [14]:
models = [
    {'name':'Ridge after StandardScaler', 'clf':Ridge, 'scaler':StandardScaler()},
    {'name':'Ridge after MinMaxScaler', 'clf':Ridge, 'scaler':MinMaxScaler()},
    {'name':'Lasso after StandardScaler', 'clf':Lasso, 'scaler':StandardScaler()},
    {'name':'Lasso after MinMaxScaler', 'clf':Lasso, 'scaler':MinMaxScaler()}
]

best_model = {}

for model in models:
    
    model_name = model['name']
    clf = model['clf']
    scaler = model['scaler']
    
    for alpha in alphas:
    
        pipe = Pipeline(steps=[('scaler', scaler), ('clf', clf(alpha=alpha))]).fit(X_train, y_train)
        score = pipe.score(X_test, y_test)
        
        if len(best_model) == 0 or score > best_model['score']:
            best_model['name'] = model_name
            best_model['score'] = score
            best_model['alpha'] = alpha
    
print(f'best model - {best_model["name"]}\n'\
      f'score - {best_model["score"]}\n'\
      f'alpha - {best_model["alpha"]}')

best model - Ridge after MinMaxScaler
score - 0.6764100365423611
alpha - 1


Лучший результат остался преждним, т.к. в Ridge(alpha = 1) по умолчанию.

### 6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

In [15]:
from sklearn.preprocessing import PolynomialFeatures

In [21]:
%%time

models = [
    {'name':'Ridge after StandardScaler', 'clf':Ridge, 'scaler':StandardScaler(), 'poly':PolynomialFeatures(2)},
    {'name':'Ridge after MinMaxScaler', 'clf':Ridge, 'scaler':MinMaxScaler(),'poly':PolynomialFeatures(2)},
    {'name':'Ridge after StandardScaler', 'clf':Ridge, 'scaler':StandardScaler(), 'poly':PolynomialFeatures(interaction_only=True)},
    {'name':'Ridge after MinMaxScaler', 'clf':Ridge, 'scaler':MinMaxScaler(),'poly':PolynomialFeatures(interaction_only=True)},
    
    {'name':'Lasso after StandardScaler', 'clf':Lasso, 'scaler':StandardScaler(), 'poly':PolynomialFeatures(2)},
    {'name':'Lasso after MinMaxScaler', 'clf':Lasso, 'scaler':MinMaxScaler(), 'poly':PolynomialFeatures(2)},
    {'name':'Lasso after StandardScaler', 'clf':Lasso, 'scaler':StandardScaler(), 'poly':PolynomialFeatures(interaction_only=True)},
    {'name':'Lasso after MinMaxScaler', 'clf':Lasso, 'scaler':MinMaxScaler(), 'poly':PolynomialFeatures(interaction_only=True)}
]

best_model = {}

for model in models:
    
    model_name = model['name']
    clf = model['clf']
    scaler = model['scaler']
    poly = model['poly']
    
    pipe = Pipeline(steps=[('scaler', scaler), ('poly', poly), ('clf', clf())]).fit(X_train, y_train)
    score = pipe.score(X_test, y_test)
    
    if len(best_model) == 0 or score > best_model['score']:
        best_model['name'] = model_name
        best_model['score'] = score
        best_model['poly'] = poly
    
print(f'best model - {best_model["name"]}\n'\
      f'score - {best_model["score"]}\n'\
      f'poly - {best_model["poly"]}')

best model - Ridge after StandardScaler
score - 0.847900286007188
poly - PolynomialFeatures(interaction_only=True)
Wall time: 85 ms


Результат значительно улучшился.

### 7. Подберите наилучшую модель (используйте Pipeline, GridSearchCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2. Напишите как изменился R2 по сравнению с предыдущими экспериментами

In [22]:
import warnings
warnings.filterwarnings('ignore')

In [23]:
%%time

pipe = Pipeline(steps=[
    ('scaler', 'passthrough'),
    ('poly', 'passthrough'),
    ('clf', 'passthrough')
])

params = [{
    'scaler': [None, StandardScaler(), MinMaxScaler()],
    'poly': [None, PolynomialFeatures(2), PolynomialFeatures(interaction_only=True)],
    'clf': [Ridge(), Lasso()],
    'clf__alpha': alphas
}]

search = GridSearchCV(pipe, params, scoring='r2').fit(X_train, y_train)

score = search.score(X_test, y_test)

print(f'best model - {search.best_params_["clf"]}\n'\
      f'score - {score}\n'\
      f'scaler - {search.best_params_["scaler"]}\n'\
      f'poly - {search.best_params_["poly"]}')

best model - Ridge(alpha=10.0)
score - 0.8496468217328315
scaler - StandardScaler()
poly - PolynomialFeatures(interaction_only=True)
Wall time: 15.9 s


Результат незначительно улучшился, от предыдущего результата итоговая модель отличается только alpha=10. По времени гораздо более затратный.

### 8. Разделите выборку на признаки и целевую переменную(колонка со зачениями {<=50K,>50K}). Замените целевую переменную на числовые значения.

http://archive.ics.uci.edu/ml/datasets/Adult

In [24]:
link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [25]:
df = data.copy()

vals = {}
for i, val in enumerate(df.iloc[:,-1].unique()):
    vals[val] = i

df.iloc[:,-1] = df.iloc[:,-1].apply(lambda x: vals[x])
X, y = df.iloc[:,:-1], df.iloc[:,-1]

print(f'df size - {df.shape}\n'\
      f'X size - {X.shape}\n'\
      f'y size - {y.shape}\n'\
      f'y values - {y.unique()}')

df size - (48842, 15)
X size - (48842, 14)
y size - (48842,)
y values - [0 1]


### 9. Выясните, присутствуют ли в данных пропуски. Заполните их самыми частыми значениями (испольуйте SimpleImputer)

https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [26]:
from sklearn.impute import SimpleImputer

In [27]:
print(f'df size before clear - {df.shape}\n'\
      f'df size after clear - {df.dropna().shape}')

df size before clear - (48842, 15)
df size after clear - (48842, 15)


Пропуски None / NaN / ""  - отсутствуют. 

### 10. Выберите колонки с числовыми и категориальными переменными.

Не уверен, но помоему все колонки кроме 2 могут быть категориальными. Но для избежания получения огромного признакового пространства, методом OneHotEncoding, все признаки значения которых являются числами - отнес к числовым.

In [31]:
num_cols = [0,2,4,10,11,12]
cat_cols = [1,3,5,6,7,8,9,13]

In [32]:
X_num = df[num_cols]
X_num

Unnamed: 0,0,2,4,10,11,12
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40
...,...,...,...,...,...,...
48837,39,215419,13,0,0,36
48838,64,321403,9,0,0,40
48839,38,374983,13,0,0,50
48840,44,83891,13,5455,0,40


In [33]:
X_cat = df[cat_cols]
X_cat

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
48837,Private,Bachelors,Divorced,Prof-specialty,Not-in-family,White,Female,United-States
48838,?,HS-grad,Widowed,?,Other-relative,Black,Male,United-States
48839,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States
48840,Private,Bachelors,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,United-States


In [34]:
y

0        0
1        0
2        0
3        0
4        0
        ..
48837    0
48838    0
48839    0
48840    0
48841    1
Name: 14, Length: 48842, dtype: int64

### 11. Создайте пайплайн по обработке колонок(используйте OneHotEncoder, MinMaxScaler).

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [35]:
from sklearn.preprocessing import OneHotEncoder

In [36]:
def df_ohe_preprocessing(df_for_preprocess, scaler=MinMaxScaler()):
    features_preprocess = [
        {'type':'cat', 'preprocessing':OneHotEncoder(), 'data':df_for_preprocess[cat_cols]},
        {'type':'num', 'preprocessing':scaler, 'data':df_for_preprocess[num_cols]}
    ]

    dfs_pipes = []
    for feature in features_preprocess:
    
        ftype = feature['type']
        preprocessing = feature['preprocessing']
        fdata = feature['data']
        
        pipe = Pipeline([('preprocessing', preprocessing)]).fit_transform(fdata)
        
        if ftype == 'cat':
            pipe = pipe.toarray()
        
        pipe_df = pd.DataFrame(pipe)
        dfs_pipes.append(pipe_df)
    
    y = df_for_preprocess.iloc[:,-1:].astype(int)
    y.index = [i for i in range(y.shape[0])]
    dfs_pipes.append(y)    
    
    
    df_result = pd.concat([pipe_df for pipe_df in dfs_pipes], axis=1)
    new_cols_names = [col_name for col_name in range(df_result.columns.size)]
    df_result.columns = new_cols_names
    
    return df_result.copy()

In [37]:
df1 = df_ohe_preprocessing(df)
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99,100,101,102,103,104,105,106,107,108
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.301370,0.044131,0.800000,0.021740,0.0,0.397959,0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.452055,0.048052,0.800000,0.000000,0.0,0.122449,0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.287671,0.137581,0.533333,0.000000,0.0,0.397959,0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.493151,0.150486,0.400000,0.000000,0.0,0.397959,0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.150685,0.220635,0.800000,0.000000,0.0,0.397959,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.301370,0.137428,0.800000,0.000000,0.0,0.357143,0
48838,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.643836,0.209130,0.533333,0.000000,0.0,0.397959,0
48839,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.287671,0.245379,0.800000,0.000000,0.0,0.500000,0
48840,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.369863,0.048444,0.800000,0.054551,0.0,0.397959,0


### 12. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

In [38]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [46]:
def split_df(df, size=1):
    df_res = df.sample(int(df.shape[0]*size)).copy() if size != 1 else df.copy()
    X_res, y_res = df_res.iloc[:,:-1], df_res.iloc[:,-1]
    X_res_train, X_res_test, y_res_train, y_res_test = train_test_split(X_res, y_res, test_size=.2, random_state=RANDOM_STATE)
    return df_res, X_res, y_res, X_res_train, X_res_test, y_res_train, y_res_test

In [50]:
df1, X1, y1, X1_train, X1_test, y1_train, y1_test = split_df(df1)

In [52]:
y1.value_counts()

0    37155
1    11687
Name: 108, dtype: int64

In [53]:
df1_most_freq = df1[y1==0].copy()
df1_most_freq.shape

(37155, 109)

In [54]:
X1_most_freq, y1_most_freq = df1_most_freq.iloc[:,:-1], df1_most_freq.iloc[:,-1]

In [56]:
from sklearn.linear_model import LogisticRegression

In [58]:
%%time

logreg = LogisticRegression().fit(X1_train, y1_train)

y1_pred = logreg.predict(X1_most_freq)

acc_logreg = accuracy_score(y1_most_freq, y1_pred)
f1_logreg_micro = f1_score(y1_most_freq, y1_pred, average='micro')
f1_logreg_weighted = f1_score(y1_most_freq, y1_pred, average='weighted')
f1_logreg = f1_score(y1_most_freq, y1_pred, average='binary')

print(f'Accuracy (LogisticRegression) : {acc_logreg}\n'\
      f'F1_micro (LogisticRegression) : {f1_logreg_micro}\n'\
      f'F1_weighted (LogisticRegression) : {f1_logreg_weighted}\n'\
      f'F1_binary (LogisticRegression) : {f1_logreg}')

Accuracy (LogisticRegression) : 0.9335486475575293
F1_micro (LogisticRegression) : 0.9335486475575293
F1_weighted (LogisticRegression) : 0.9656324383012485
F1_binary (LogisticRegression) : 0.0
Wall time: 1.01 s


### 13. Посчитайте cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score.
Напишите удалось ли превзойти предыдущий результат.

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

In [59]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC, LinearSVC

In [60]:
X1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.301370,0.044131,0.800000,0.021740,0.0,0.397959
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.452055,0.048052,0.800000,0.000000,0.0,0.122449
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.137581,0.533333,0.000000,0.0,0.397959
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.493151,0.150486,0.400000,0.000000,0.0,0.397959
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.150685,0.220635,0.800000,0.000000,0.0,0.397959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.301370,0.137428,0.800000,0.000000,0.0,0.357143
48838,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.643836,0.209130,0.533333,0.000000,0.0,0.397959
48839,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.245379,0.800000,0.000000,0.0,0.500000
48840,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.369863,0.048444,0.800000,0.054551,0.0,0.397959


SVC не рекомендовано использовать на больших данных, поэтому для данного алгоритма оставлю только последние 6 вещественных признаков.

In [62]:
X1.iloc[:,-6:]

Unnamed: 0,102,103,104,105,106,107
0,0.301370,0.044131,0.800000,0.021740,0.0,0.397959
1,0.452055,0.048052,0.800000,0.000000,0.0,0.122449
2,0.287671,0.137581,0.533333,0.000000,0.0,0.397959
3,0.493151,0.150486,0.400000,0.000000,0.0,0.397959
4,0.150685,0.220635,0.800000,0.000000,0.0,0.397959
...,...,...,...,...,...,...
48837,0.301370,0.137428,0.800000,0.000000,0.0,0.357143
48838,0.643836,0.209130,0.533333,0.000000,0.0,0.397959
48839,0.287671,0.245379,0.800000,0.000000,0.0,0.500000
48840,0.369863,0.048444,0.800000,0.054551,0.0,0.397959


In [63]:
def get_cvs(model, X, y, cv=3):
    
    metrics = ['accuracy', 'f1', 'f1_weighted']
    scores = {}
    
    for metric in metrics:
        score = cross_val_score(model, X, y, cv=cv, scoring=metric).mean()
        scores[metric] = score
        
    return scores

In [64]:
# работает очень долго, даже при сильно уменьшенном кол-ве признаков
models = [SVC()]
X_cvs, y_cvs = X1.iloc[:,-6:], y1

for model in models:
    %time scores = get_cvs(model, X_cvs, y_cvs)
    print(model, scores)

Wall time: 3min 36s
SVC() {'accuracy': 0.8215060266055292, 'f1': 0.48166049265646144, 'f1_weighted': 0.7939577349155948}


In [66]:
models = [LogisticRegression(), LinearSVC()]
X_cvs, y_cvs = X1, y1

for model in models:
    %time scores = get_cvs(model, X_cvs, y_cvs)
    print(model, scores, '\n')

Wall time: 8.75 s
LogisticRegression() {'accuracy': 0.8510503178690022, 'f1': 0.6562999424184716, 'f1_weighted': 0.8454323317271303} 

Wall time: 10.2 s
LinearSVC() {'accuracy': 0.8528725280383656, 'f1': 0.6581704182911926, 'f1_weighted': 0.8468993124618663} 



### 14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями (испольуйте SimpleImputer)

In [67]:
def clear_df(df, val='?', clear='drop'):
    df_cleared = df.copy()
    if clear == 'drop':
        df_cleared = df_cleared.replace(val, np.nan).dropna()
    if clear == 'most_frequent':
        imputer = SimpleImputer(missing_values=val, strategy='most_frequent')
        imputer = imputer.fit_transform(df_cleared) 
        df_cleared = pd.DataFrame(imputer)
    return df_cleared.copy()

In [69]:
df_cleared = clear_df(df, clear='most_frequent')
df_cleared

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,0
48838,64,Private,321403,HS-grad,9,Widowed,Prof-specialty,Other-relative,Black,Male,0,0,40,United-States,0
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,0
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,0


### 15. Посчитайте cross_val_score на новых данных. Напишите удалось ли улучшить результат.

In [70]:
df2 = df_ohe_preprocessing(df_cleared)
X2, y2 = df2.iloc[:,:-1], df2.iloc[:,-1]
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.301370,0.044131,0.800000,0.021740,0.0,0.397959,0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.452055,0.048052,0.800000,0.000000,0.0,0.122449,0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.287671,0.137581,0.533333,0.000000,0.0,0.397959,0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.493151,0.150486,0.400000,0.000000,0.0,0.397959,0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.150685,0.220635,0.800000,0.000000,0.0,0.397959,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.301370,0.137428,0.800000,0.000000,0.0,0.357143,0
48838,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.643836,0.209130,0.533333,0.000000,0.0,0.397959,0
48839,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.287671,0.245379,0.800000,0.000000,0.0,0.500000,0
48840,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.369863,0.048444,0.800000,0.054551,0.0,0.397959,0


In [71]:
models = [LogisticRegression(), LinearSVC()]
X_cvs, y_cvs = X2, y2

for model in models:
    %time scores = get_cvs(model, X_cvs, y_cvs)
    print(model, scores)

Wall time: 8.38 s
LogisticRegression() {'accuracy': 0.8499856545323654, 'f1': 0.651984000093424, 'f1_weighted': 0.843989807900473}
Wall time: 10.9 s
LinearSVC() {'accuracy': 0.8513779005901725, 'f1': 0.6521782249023217, 'f1_weighted': 0.8448837235591471}


Результаты по всем метрикам незначительно ухудшились.

### 16. Посчитайте cross_val_score, если просто удалить значения '?'. Напишите как изменился результат

In [72]:
df_cleared_drop = clear_df(df, clear='drop')
df_cleared_drop

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,0
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,0
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,0
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,0


In [73]:
df3 = df_ohe_preprocessing(df_cleared_drop)
X3, y3 = df3.iloc[:,:-1], df3.iloc[:,-1]
df3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.301370,0.043350,0.800000,0.021740,0.0,0.397959,0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.452055,0.047274,0.800000,0.000000,0.0,0.122449,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.287671,0.136877,0.533333,0.000000,0.0,0.397959,0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.493151,0.149792,0.400000,0.000000,0.0,0.397959,0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.150685,0.219998,0.800000,0.000000,0.0,0.397959,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.219178,0.156895,0.800000,0.000000,0.0,0.397959,0
45218,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.301370,0.136723,0.800000,0.000000,0.0,0.357143,0
45219,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.287671,0.244762,0.800000,0.000000,0.0,0.500000,0
45220,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.369863,0.047666,0.800000,0.054551,0.0,0.397959,0


In [75]:
models = [LogisticRegression(), LinearSVC()]
X_cvs, y_cvs = X3, y3

for model in models:
    %time scores = get_cvs(model, X_cvs, y_cvs)
    print(model, scores, '\n')

Wall time: 8.29 s
LogisticRegression() {'accuracy': 0.8464464198841273, 'f1': 0.6590041974934969, 'f1_weighted': 0.8409575945947138} 

Wall time: 9.67 s
LinearSVC() {'accuracy': 0.8484366016540621, 'f1': 0.6615318478123332, 'f1_weighted': 0.8426693505081153} 



Результаты незначительно меняются в пределах 0.01.

 ### 17. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier. Напишите как изменился результат и какой вывод можно из этого сделать.
 
 https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
 
 https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [76]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [77]:
models = [RandomForestClassifier(), GradientBoostingClassifier()]
X_cvs, y_cvs = X3, y3

for model in models:
    %time scores = get_cvs(model, X_cvs, y_cvs)
    print(model, scores, '\n')

Wall time: 59.7 s
RandomForestClassifier() {'accuracy': 0.8485250541771704, 'f1': 0.6678039494198877, 'f1_weighted': 0.8448536625201571} 

Wall time: 1min 21s
GradientBoostingClassifier() {'accuracy': 0.8640263588518863, 'f1': 0.6907478068172582, 'f1_weighted': 0.8580058338918821} 



Значения по метрикам у RandomForestClassifier почти не изменились, зато длительность вычислений выросла в разы.
В то время как GradientBoostingClassifier показал по всем метрикам лучшие результаты - значения выросли на ~ 0.01-0.025, что в некоторых задачах важно, но по времени показал худший результат (бельше чем у LogisticRegression и LinearSVC примерно в 8 раз).

### 18. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV

In [78]:
df_size = .05 # для итоговой оценки взял только 5% от общего кол-ва наблюдений
df_f, X_f, y_f, X_f_train, X_f_test, y_f_train, y_f_test = split_df(df, size=df_size)

In [81]:
clears = ['most_frequent', 'drop']
scalers = [StandardScaler(), MinMaxScaler()]
metrics = ['accuracy', 'f1', 'f1_weighted']

clfs = [LogisticRegression(), LinearSVC(), RandomForestClassifier(), GradientBoostingClassifier()]

best_model = {}

df_step0 = df_f
for clear in clears:
    df_step1_cleared = clear_df(df_step0, clear=clear)
    for scaler in scalers:
        df_step2_preprocessed = df_ohe_preprocessing(df_step1_cleared, scaler=scaler)
        df_step2, X_step2, y_step2, X_step2_train, X_step2_test, y_step2_train, y_step2_test = split_df(df_step2_preprocessed)
        for metric in metrics:
            pipe = Pipeline(steps=[('clf', 'passthrough')])
            params = [{'clf': clfs}]
            %time search = GridSearchCV(pipe, params, scoring='f1_weighted').fit(X_step2_train, y_step2_train)
            score = search.score(X_step2_test, y_step2_test)
            
            print(search.best_params_['clf'], score, clear, scaler, metric, '\n')
            if len(best_model) == 0 or best_model['score'] < score:
                best_model['clf'] = search.best_params_['clf']
                best_model['score'] = score
                best_model['clear'] = clear
                best_model['scaler'] = scaler
                best_model['metric'] = metric
                
                
print('best model : ')
for key in best_model:
    print(key, '-', best_model[key])

Wall time: 4.91 s
LogisticRegression() 0.7943788171627268 most_frequent StandardScaler() accuracy 

Wall time: 5.53 s
RandomForestClassifier() 0.8258382210836199 most_frequent StandardScaler() f1 

Wall time: 4.8 s
LogisticRegression() 0.7943788171627268 most_frequent StandardScaler() f1_weighted 

Wall time: 4.84 s
GradientBoostingClassifier() 0.8294137610251634 most_frequent MinMaxScaler() accuracy 

Wall time: 4.91 s
GradientBoostingClassifier() 0.8294137610251634 most_frequent MinMaxScaler() f1 

Wall time: 4.6 s
RandomForestClassifier() 0.8187310848016429 most_frequent MinMaxScaler() f1_weighted 

Wall time: 4.45 s
GradientBoostingClassifier() 0.8306374747232058 drop StandardScaler() accuracy 

Wall time: 5.23 s
GradientBoostingClassifier() 0.8306374747232058 drop StandardScaler() f1 

Wall time: 4.49 s
GradientBoostingClassifier() 0.8306374747232058 drop StandardScaler() f1_weighted 

Wall time: 4.8 s
GradientBoostingClassifier() 0.8306374747232058 drop MinMaxScaler() accuracy 



GradientBoostingClassifier показывает лучший результат на большинстве указанных вариаций предобработанных данных, но как уже было видно на примере предыдущих вычислений - работает в разы медленее.