In [None]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

In [None]:
# Notebook-wide seed for reproducibility; the train/test splits below use the same value (42).
RANDOM_STATE = 42

In [None]:
# Load the regression dataset: features as a labelled DataFrame, target as `y`.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the pinned version.
dataset = load_boston()
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = dataset.target

# 1. Разделите выборку на обучающую и тестовую в отношении 80%/20%

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 hold-out split. The seed comes from the notebook-level constant instead
# of a repeated literal, so every split in the notebook is controlled in one place.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# 2. Обучите стандартную регрессию, а также Ridge и Lasso с параметрами по умолчанию и выведите их R2 на тестовой выборке

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

In [None]:
# Baseline linear models, all left at their default hyper-parameters.
lr = LinearRegression()  # no regularisation
Rr = Ridge()  # L2 penalty
Lsr = Lasso()  # L1 penalty

In [None]:
# Fit every baseline on the training split only; the last call's repr is the cell output.
lr.fit(X_train, y_train)
Rr.fit(X_train, y_train)
Lsr.fit(X_train, y_train)

Lasso()

In [None]:
# Test-set predictions of the three baselines, in the order linear / Ridge / Lasso.
Ypred_lr, Ypred_Rr, Ypred_Lsr = (
    fitted.predict(X_test) for fitted in (lr, Rr, Lsr)
)

In [None]:
# Report the hold-out R2 of each baseline.
for label, predicted in (
    ('R2 linear regression: ', Ypred_lr),
    ('R2 Ridge regression: ', Ypred_Rr),
    ('R2 Lasso regression: ', Ypred_Lsr),
):
    print(label, r2_score(y_test, predicted))

R2 linear regression:  0.6687594935356307
R2 Ridge regression:  0.666222167016852
R2 Lasso regression:  0.6671453631686304


**Вывод:** Линейная регрессия без регуляризации показывает лучший результат по метрике R2

# 3. Для Ridge и Lasso подберите коэффициент регуляризации(используйте GridSearchCV, RidgeCV, LassoCV) в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по лучшим моделям и сравните с предыдущими результатами. Напишите как изменился результат

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV, LassoCV

In [None]:
# Candidate regularisation strengths: powers of ten from 1e-5 up to 1e5.
alfas = [10**exponent for exponent in range(-5, 6)]

**GridSearchCV**

In [None]:
# One shared alpha grid for both searches; CV and scoring stay at GridSearchCV defaults.
alpha_grid = {'alpha': alfas}
GS_Rr = GridSearchCV(Rr, alpha_grid)
GS_Lsr = GridSearchCV(Lsr, alpha_grid)

In [None]:
# Run both grid searches on the training split and show the selected alpha for each model.
GS_Rr.fit(X_train, y_train)
GS_Lsr.fit(X_train, y_train)
print('Best Alpha for Ridge: ', GS_Rr.best_params_)
print('Best Alpha for Lasso: ', GS_Lsr.best_params_)

Best Alpha for Ridge:  {'alpha': 1e-05}
Best Alpha for Lasso:  {'alpha': 1e-05}


In [None]:
# Hold-out R2 of the models selected by the grid searches (Ridge first, then Lasso).
Ypred_GS_Rr = GS_Rr.predict(X_test)
print('R2 Ridge regression: ', r2_score(y_test, Ypred_GS_Rr))

Ypred_GS_Lsr = GS_Lsr.predict(X_test)
print('R2 Lasso regression: ', r2_score(y_test, Ypred_GS_Lsr))


R2 Ridge regression:  0.6687594856409733
R2 Lasso regression:  0.6687598638315153


**RidgeCV** **&** **LassoCV**

In [None]:
# Built-in CV estimators searching the same alpha grid with 5 folds.
R_CV = RidgeCV(alphas=alfas, cv=5)
L_CV = LassoCV(alphas=alfas, cv=5)

In [None]:
# Fit on the training split; `alpha_` holds the regularisation strength each estimator selected.
R_CV.fit(X_train, y_train)
L_CV.fit(X_train, y_train)
print('Best Alpha for Ridge: ', R_CV.alpha_)
print('Best Alpha for Lasso: ', L_CV.alpha_)

Best Alpha for Ridge:  1e-05
Best Alpha for Lasso:  1e-05


In [None]:
# Hold-out R2 of the RidgeCV / LassoCV models.
ridge_cv_pred = R_CV.predict(X_test)
lasso_cv_pred = L_CV.predict(X_test)
print('R2 Ridge regression: ', r2_score(y_test, ridge_cv_pred))
print('R2 Lasso regression: ', r2_score(y_test, lasso_cv_pred))

R2 Ridge regression:  0.6687594856409733
R2 Lasso regression:  0.6687598638315153


Вывод:


*   Результаты полученные с использованием GridSearchCV и RidgeCV & LassoCV
идентичны
*   Лучший коэффициент регуляризации - наименьший из рассматриваемых. При этом метрика R2 несколько увеличилась. Из этого можно сделать заключение, что модель без регуляризации показывает лучшее качество.



# 4. Проведите масштабирование выборки(используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

**StandardScaler**

In [None]:
# One StandardScaler -> model pipeline per estimator (linear, Ridge, Lasso),
# each with its own scaler instance.
pipe_lr, pipe_Rr, pipe_Lsr = (
    Pipeline([('scaler', StandardScaler()), ('model', estimator)])
    for estimator in (LinearRegression(), Ridge(), Lasso())
)

In [None]:
# Fit the scaler + model pipelines on the training split, so scaling statistics come from training rows only.
pipe_lr.fit(X_train, y_train)
pipe_Rr.fit(X_train, y_train)
pipe_Lsr.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('model', Lasso())])

In [None]:
# Hold-out R2 for each StandardScaler pipeline.
for label, fitted_pipe in (
    ('R2 Linear regression: ', pipe_lr),
    ('R2 Ridge regression: ', pipe_Rr),
    ('R2 Lasso regression: ', pipe_Lsr),
):
    print(label, r2_score(y_test, fitted_pipe.predict(X_test)))

R2 Linear regression:  0.6687594935356321
R2 Ridge regression:  0.6684624359643558
R2 Lasso regression:  0.6239428734251422


**MinMaxScaler**

In [None]:
# Same three pipelines as before, now with MinMaxScaler; the pipe_* names are rebound.
pipe_lr, pipe_Rr, pipe_Lsr = (
    Pipeline([('scaler', MinMaxScaler()), ('model', estimator)])
    for estimator in (LinearRegression(), Ridge(), Lasso())
)

In [None]:
# Fit the MinMaxScaler pipelines on the training split.
pipe_lr.fit(X_train, y_train)
pipe_Rr.fit(X_train, y_train)
pipe_Lsr.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()), ('model', Lasso())])

In [None]:
# Hold-out R2 for each MinMaxScaler pipeline.
for label, fitted_pipe in (
    ('R2 Linear regression: ', pipe_lr),
    ('R2 Ridge regression: ', pipe_Rr),
    ('R2 Lasso regression: ', pipe_Lsr),
):
    print(label, r2_score(y_test, fitted_pipe.predict(X_test)))

R2 Linear regression:  0.6687594935356318
R2 Ridge regression:  0.6764100365423598
R2 Lasso regression:  0.2573921442545195


**Вывод:** масштабирование признаков практически не повлияло на качество линейной регрессии, немного увеличило качество Ridge-регрессии и уменьшило качество Lasso-регрессии (особенно с MinMaxScaler: R2 ≈ 0.26 при alpha по умолчанию).

# 5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [None]:
# Alpha selection on standardised features: the CV estimator is the final pipeline step,
# so it only ever sees the scaler's output.
pipe_Rr = Pipeline([('scaler', StandardScaler()), 
                    ('model', RidgeCV(alphas=alfas, cv=5))])
pipe_Lsr = Pipeline([('scaler', StandardScaler()), 
                     ('model', LassoCV(alphas=alfas, cv=5))])

In [None]:
# Fit both scaler + CV-estimator pipelines on the training split.
pipe_Rr.fit(X_train, y_train)
pipe_Lsr.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 LassoCV(alphas=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                                 1000, 10000, 100000],
                         cv=5))])

In [None]:
# Hold-out R2 of the tuned pipelines.
ridge_scaled_pred = pipe_Rr.predict(X_test)
lasso_scaled_pred = pipe_Lsr.predict(X_test)
print('R2 Ridge regression: ', r2_score(y_test, ridge_scaled_pred))
print('R2 Lasso regression: ', r2_score(y_test, lasso_scaled_pred))

R2 Ridge regression:  0.6684624359643558
R2 Lasso regression:  0.668759038334717


**Вывод:** использование регуляризации и масштабирования признаков  немного увеличивает качество моделей. В большей степени Lasso регрессию.

# 6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Learn the scaling statistics from the training rows only, then apply them to
# the whole feature matrix. Fitting on all of X would leak test-set means and
# variances into features that are later evaluated on the test split.
# X_train is the 80% split created earlier; the later split of the polynomial
# features uses the same test_size and seed.
SS = StandardScaler()
SS.fit(X_train)
X_SS = SS.transform(X)

In [None]:
# Degree-2 expansion of the scaled features. The result starts with a constant
# column of ones (visible in the preview below), which is dropped a few cells later.
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X_SS)

In [None]:
# Wrap the expanded array in a DataFrame (integer column labels) to inspect it.
df = pd.DataFrame(X_poly)
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
0,1.0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,...,0.444367,0.972582,-0.294009,0.716979,2.128682,-0.643495,1.569246,0.194527,-0.474379,1.156834
1,1.0,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,...,0.97482,0.299254,-0.435464,0.4862,0.091866,-0.13368,0.149255,0.194527,-0.217191,0.242497
2,1.0,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,...,0.97482,0.299254,-0.391404,1.193412,0.091866,-0.120155,0.366358,0.157154,-0.479172,1.461022


In [None]:
# Remove the first column (the constant term) and keep the remaining expanded features.
X_poly = df.drop(columns=df.columns[0])
X_poly.head(3)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,95,96,97,98,99,100,101,102,103,104
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,...,0.444367,0.972582,-0.294009,0.716979,2.128682,-0.643495,1.569246,0.194527,-0.474379,1.156834
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,...,0.97482,0.299254,-0.435464,0.4862,0.091866,-0.13368,0.149255,0.194527,-0.217191,0.242497
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,...,0.97482,0.299254,-0.391404,1.193412,0.091866,-0.120155,0.366358,0.157154,-0.479172,1.461022


In [None]:
# Split the polynomial features with the same test_size and seed as the first
# split. This rebinds y_train / y_test, so the seed must match the earlier split
# for them to keep referring to the same rows; using the shared constant instead
# of a literal keeps that in one place.
Xp_train, Xp_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=RANDOM_STATE
)

In [None]:
# Default Ridge / Lasso fitted on the polynomial training features.
R = Ridge()
L = Lasso()
R.fit(Xp_train, y_train)
L.fit(Xp_train, y_train)

Lasso()

In [None]:
# Hold-out R2 on the polynomial test features.
ridge_poly_pred = R.predict(Xp_test)
lasso_poly_pred = L.predict(Xp_test)
print('R2 Ridge regression: ', r2_score(y_test, ridge_poly_pred))
print('R2 Lasso regression: ', r2_score(y_test, lasso_poly_pred))

R2 Ridge regression:  0.8161012856428587
R2 Lasso regression:  0.7352137474156291


**Вывод:** добавление комбинаций признаков значительно увеличило качество моделей.

# 7. Подберите наилучшую модель (используйте Pipeline, GridSearchCV), подбирая тип регуляризации (L1, L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2. Напишите, как изменился R2 по сравнению с предыдущими экспериментами

In [None]:
# Pipeline skeleton: polynomial expansion -> scaling -> regressor. The scaler and
# the final estimator are placeholders that the parameter grid swaps out.
step = [
    ('polynomial', PolynomialFeatures()),
    ('normalizer', StandardScaler()),
    ('classifier', Ridge()),
]

In [None]:
pipe = Pipeline(step)

In [None]:
# Search space: scaler type, polynomial degree, penalty type (Ridge = L2,
# Lasso = L1) and regularisation strength.
parameters = {
    'normalizer': [StandardScaler(), MinMaxScaler()],
    'polynomial__degree': [2, 3, 4],
    'classifier': [Ridge(), Lasso()],
    'classifier__alpha': alfas,
}

In [None]:
# Exhaustive search over `parameters`, ranking candidates by cross-validated R2.
GS = GridSearchCV(pipe, parameters, scoring='r2')

In [None]:
GS.fit(X_train, y_train)

In [None]:
GS.best_params_

{'classifier': Lasso(alpha=0.001),
 'classifier__alpha': 0.001,
 'normalizer': MinMaxScaler(),
 'polynomial__degree': 3}

In [None]:
print('R2: ', r2_score(y_test, GS.predict(X_test)))

R2:  0.8651767295811961


Вывод:


* Лучший алгоритм Lasso c коэффициентом регуляризации 0.001.
* Лучший метод масштабирования MinMaxScaler.
* Лучшая степень полинома 3.
* R2 для модели с вышеуказанными параметрами: 0.865.
* Качество модели улучшилось в сравнении с предыдущими расчетами.





http://archive.ics.uci.edu/ml/datasets/Adult

In [None]:
# Adult (census income) dataset; the CSV is read without a header row, so
# column names are assigned in the following cells.
link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)

In [None]:
# Column names in file order; the last one ('income') is the target.
name_columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

In [None]:
data.columns = name_columns

In [None]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# 8. Разделите выборку на признаки и целевую переменную (колонка со значениями {<=50K, >50K}). Замените целевую переменную на числовые значения.

In [None]:
# The 15th column is the target; everything else is a feature.
target_column = data.columns[14]
X = data.drop(columns=target_column).copy()
y = data[target_column].copy()

In [None]:
y.unique()

array(['<=50K', '>50K'], dtype=object)

In [None]:
# Encode the target in place: '<=50K' -> 1, '>50K' -> 0.
# The Series keeps object dtype here and is cast to int a few cells later.
y.loc[y == '<=50K'] = 1
y.loc[y == '>50K'] = 0

In [None]:
y.unique()

array([1, 0], dtype=object)

In [None]:
y = y.astype(int)

In [None]:
y.unique()

array([1, 0])

# 9. Выясните, присутствуют ли в данных пропуски. Заполните их самыми частыми значениями (используйте SimpleImputer)



In [None]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

Пропуски в данных отсутствуют.

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill missing values with each column's most frequent value.
# The imputer returns a bare array, so the original column labels and row index
# are passed back explicitly instead of being lost and re-typed by hand later.
imp_most_frequent = SimpleImputer(strategy='most_frequent')
X = pd.DataFrame(
    imp_most_frequent.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

In [None]:
# Feature names are all dataset columns except the trailing target ('income').
# Derived from name_columns so the two lists cannot drift apart.
X_columns = name_columns[:-1]

In [None]:
X.columns = X_columns
X.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States


# 10. Выберите колонки с числовыми и категориальными переменными.

In [None]:
# Columns holding numeric values; everything else is treated as categorical below.
numerical = ['age', 'fnlwgt', 'education-num', 
                'capital-gain', 'capital-loss', 'hours-per-week']
X_num = X[numerical]
X_num.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40


In [None]:
# Categorical part of the features: whatever is left after removing the numeric columns.
X_cat = X.drop(columns=numerical)
X_cat.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [None]:
# Names of the categorical columns, in dataset order.
categorical = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# 11. Создайте пайплайн по обработке колонок(используйте OneHotEncoder,MinMaxScaler).

In [None]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import FeatureUnion

In [None]:
# Scaler for the numeric columns and one-hot encoder for the categorical ones.
MMS = MinMaxScaler() 
# Dense output is requested; handle_unknown='ignore' keeps transform from failing
# on categories that were not present at fit time.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` —
# confirm against the installed version.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
# Numeric branch: pick the numeric columns from the incoming frame, then scale them.
numerical_selector = FunctionTransformer(lambda frame: frame[numerical], validate=False)
numerical_preprocessor = Pipeline([
    ("numerical_selector", numerical_selector),
    ("scaling", MMS),
])

In [None]:
# Categorical branch: pick the categorical columns from the incoming frame, then one-hot encode them.
dummy_selector = FunctionTransformer(lambda frame: frame[categorical], validate=False)
dummy_preprocessor = Pipeline([
    ("dummy_selector", dummy_selector),
    ("ohe", ohe),
])

In [None]:
# Concatenate both branches side by side: scaled numeric columns first, then the
# one-hot columns. `pipeline` is preprocessing only at this point (no model step).
feature_union = FeatureUnion([("numerical_preprocessor", numerical_preprocessor),
("dummy_preprocessor", dummy_preprocessor)])
pipeline = Pipeline([("preprocessing", feature_union)])

In [None]:
# Fit the preprocessing on the full feature table and transform it.
# NOTE(review): the scaler and encoder see every row here, so the cross-validation
# run on this matrix later does not refit them per fold.
preproc_df = pipeline.fit_transform(X)

In [None]:
X_preproc = pd.DataFrame(preproc_df)
X_preproc.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,0.30137,0.044131,0.8,0.02174,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.452055,0.048052,0.8,0.0,0.0,0.122449,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.287671,0.137581,0.533333,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# 12. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [None]:
from collections import Counter
Counter(y)

Counter({1: 37155, 0: 11687})

In [None]:
# Majority-class baseline: predict the most frequent label for every row.
# The label is read from the data rather than hard-coded as 1, so the baseline
# stays correct if the target encoding or class balance changes.
most_frequent_class = y.mode().iloc[0]
y_pred = np.full(len(y), most_frequent_class, dtype=y.dtype)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
accuracy_score(y, y_pred)

0.7607182343065395

In [None]:
f1_score(y, y_pred)

0.8640999104619929

# 13. Посчитайте cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score.
Напишите удалось ли превзойти предыдущий результат.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

In [None]:
# Classifiers compared by cross-validation; hyper-parameters are defaults apart
# from the settings shown.
log_reg = LogisticRegression(max_iter=1000)  # raised iteration cap for the solver
svc = SVC(gamma='auto')
# LinearSVC's solver uses random numbers; seed it so repeated runs give the same scores.
lin_scv = LinearSVC(random_state=RANDOM_STATE)

LogisticRegression

In [None]:
# 5-fold CV of logistic regression on the preprocessed matrix; one run per metric,
# the mean across folds is reported.
accur = cross_val_score(log_reg, X_preproc, y, cv=5, scoring='accuracy')
f1 = cross_val_score(log_reg, X_preproc, y, cv=5, scoring='f1')
print('accuracy of LogisticRegression ', accur.mean())
print('f1 of LogisticRegression ', f1.mean())

accuracy of LogisticRegression  0.8508456839478438
f1 of LogisticRegression  0.9047999829968246


SVC

In [None]:
# Same 5-fold evaluation for SVC; the model is cross-validated twice (once per metric).
accur = cross_val_score(svc, X_preproc, y, cv=5, scoring='accuracy')
f1 = cross_val_score(svc, X_preproc, y, cv=5, scoring='f1')
print('accuracy of SVC ', accur.mean())
print('f1 of SVC ', f1.mean())

accuracy of SVC  0.836431812376126
f1 of SVC  0.8972870260227503


LinearSVC

In [None]:
# Same 5-fold evaluation for LinearSVC.
accur = cross_val_score(lin_scv, X_preproc, y, cv=5, scoring='accuracy')
f1 = cross_val_score(lin_scv, X_preproc, y, cv=5, scoring='f1')
print('accuracy of LinearSVC ', accur.mean())
print('f1 of LinearSVC ', f1.mean())

accuracy of LinearSVC  0.8529135478362626
f1 of LinearSVC  0.9063223083526415


Вывод: модели со стандартными настройками показывают лучшее качество, чем предсказание самым частым классом. Алгоритм SVC показал более низкое качество при длительном расчёте, поэтому далее использоваться не будет.

# 14. Можно заметить, что в данных присутствуют значения '?', замените их самыми частыми значениями (используйте SimpleImputer)

In [None]:
X.isin(['?']).any() # '?' is present in the raw (not yet encoded) features

age               False
workclass          True
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation         True
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country     True
dtype: bool

In [None]:
X_preproc.isin(['?']).any().any() 
# after one-hot encoding, '?' no longer appears as a value: it became feature columns of its own

False

In [None]:
# Treat the '?' placeholder as missing and replace it with each column's most
# frequent value. Column labels and row index are passed back explicitly because
# the imputer returns a bare array.
imp_most_frequent = SimpleImputer(missing_values='?', strategy='most_frequent')
X_mf = pd.DataFrame(
    imp_most_frequent.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

In [None]:
X_mf.isin(['?']).any().any()

False

In [None]:
X_mf.columns = X_columns

In [None]:
X_mf.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States


In [None]:
# Refit the same preprocessing pipeline object on the imputed features (this replaces its earlier fit).
preproc_X_mf = pipeline.fit_transform(X_mf)

In [None]:
X_preproc2 = pd.DataFrame(preproc_X_mf)
X_preproc2.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
0,0.30137,0.044131,0.8,0.02174,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.452055,0.048052,0.8,0.0,0.0,0.122449,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.287671,0.137581,0.533333,0.0,0.0,0.397959,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
X_preproc.shape[1] - X_preproc2.shape[1]
# the number of encoded feature columns (not rows) dropped by 3: one per original column that contained '?'

3

Вывод: можно заметить, что после one-hot-кодирования значения "?" стали отдельными признаками. Когда "?" заменяем на самые частые значения, количество признаков в выборке после кодирования уменьшается на 3.

# 15. Посчитайте cross_val_score на новых данных. Напишите удалось ли улучшить результат.

In [None]:
# Logistic regression, 5-fold CV on the features built from the '?'-imputed data.
accur = cross_val_score(log_reg, X_preproc2, y, cv=5, scoring='accuracy')
f1 = cross_val_score(log_reg, X_preproc2, y, cv=5, scoring='f1')
print('accuracy of LogisticRegression ', accur.mean())
print('f1 of LogisticRegression ', f1.mean())

accuracy of LogisticRegression  0.8504975981201797
f1 of LogisticRegression  0.9046682721122504


In [None]:
# LinearSVC, 5-fold CV on the features built from the '?'-imputed data.
accur = cross_val_score(lin_scv, X_preproc2, y, cv=5, scoring='accuracy')
f1 = cross_val_score(lin_scv, X_preproc2, y, cv=5, scoring='f1')
print('accuracy of LinearSVC ', accur.mean())
print('f1 of LinearSVC ', f1.mean())

accuracy of LinearSVC  0.8508251523375897
f1 of LinearSVC  0.905141981677368


**Вывод:** метрики качества после очистки данных от "?" несколько ухудшились, однако более чистые данные позволяют получить более качественную модель.

# 16. Посчитайте cross_val_score, если просто удалить значения '?'. Напишите как изменился результат

In [None]:
X.isin(['?']).any().any()

True

In [None]:
# Index labels of the rows that contain '?' in at least one column.
has_placeholder = X.eq('?').any(axis=1)
index_line = X.index[has_placeholder]
index_line.shape

(3620,)

In [None]:
X.shape

(48842, 14)

In [None]:
# Remove every row flagged above; the remaining row count is shown as a sanity check.
preproc_X_dl = X.drop(index = index_line)
preproc_X_dl.shape

(45222, 14)

In [None]:
#проверка количество строк
preproc_X_dl.shape[0] + index_line.shape[0] == X.shape[0]

True

In [None]:
# Build the input from X and index_line directly instead of feeding the variable
# back into itself. The result is the same, but the cell can now be re-run:
# previously a second run would pass the already-encoded array (which has no
# named columns) back through the column selectors.
preproc_X_dl = pipeline.fit_transform(X.drop(index=index_line))

In [None]:
X_preproc3 = pd.DataFrame(preproc_X_dl)
X_preproc3.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,0.30137,0.04335,0.8,0.02174,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.452055,0.047274,0.8,0.0,0.0,0.122449,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.287671,0.136877,0.533333,0.0,0.0,0.397959,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Logistic regression, 5-fold CV after dropping the '?' rows; the target is
# filtered with the same row labels so features and labels stay aligned.
y_without_placeholders = y.drop(index=index_line)
accur = cross_val_score(
    log_reg, X_preproc3, y_without_placeholders, cv=5, scoring='accuracy'
)
f1 = cross_val_score(
    log_reg, X_preproc3, y_without_placeholders, cv=5, scoring='f1'
)
print('accuracy of LogisticRegression ', accur.mean())
print('f1 of LogisticRegression ', f1.mean())

accuracy of LogisticRegression  0.8468224507224553
f1 of LogisticRegression  0.9011360598602053


In [None]:
# LinearSVC, 5-fold CV after dropping the '?' rows (target filtered identically).
y_without_placeholders = y.drop(index=index_line)
accur = cross_val_score(
    lin_scv, X_preproc3, y_without_placeholders, cv=5, scoring='accuracy'
)
f1 = cross_val_score(
    lin_scv, X_preproc3, y_without_placeholders, cv=5, scoring='f1'
)
print('accuracy of LinearSVC ', accur.mean())
print('f1 of LinearSVC ', f1.mean())

accuracy of LinearSVC  0.8485030154158197
f1 of LinearSVC  0.9024033119235104


**Вывод:** после удаления данных, содержащих "?", метрики качества ухудшились, предположительно из-за уменьшения выборки.

# 17. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier. 
Напишите как изменился результат и какой вывод можно из этого сделать.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Tree ensembles with default hyper-parameters. Both draw random numbers while
# training, so they are seeded to make the cross-validation scores reproducible.
ran_for_clf = RandomForestClassifier(random_state=RANDOM_STATE)
grad_boost_clf = GradientBoostingClassifier(random_state=RANDOM_STATE)

In [None]:
# Random forest, 5-fold CV on the '?'-imputed, encoded features.
accur = cross_val_score(ran_for_clf, X_preproc2, y, 
                        cv=5, scoring='accuracy')
f1 = cross_val_score(ran_for_clf, X_preproc2, y, cv=5, scoring='f1')
print('accuracy of RandomForestClassifier ', accur.mean())
print('f1 of RandomForestClassifier ', f1.mean())

accuracy of RandomForestClassifier  0.8509479626380025
f1 of RandomForestClassifier  0.9045553736543803


In [None]:
# Gradient boosting, 5-fold CV on the '?'-imputed, encoded features.
accur = cross_val_score(grad_boost_clf, X_preproc2, y, 
                        cv=5, scoring='accuracy')
f1 = cross_val_score(grad_boost_clf, X_preproc2, y, cv=5, scoring='f1')
print('accuracy of GradientBoostingClassifier ', accur.mean())
print('f1 of GradientBoostingClassifier ', f1.mean())

accuracy of GradientBoostingClassifier  0.8663855852334714
f1 of GradientBoostingClassifier  0.9153954075870411


**Вывод:** GradientBoostingClassifier показывает более высокие показатели метрик чем остальные алгоритмы.

# 18. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

* Так как на проведенных выше расчетах алгоритм GradientBoostingClassifier показал наиболее высокие результаты, для этого задания будем использовать его
* Пропуски в начальных данных отсутствуют, значения "?" заменим на самые частые, т.к. это позволяет получить более качественные данные и показывает лучший результат, чем удаление строк со значениями "?".
* Таким образом в данном задании будем сравнивать метрики качества при использовании разных способов масштабирования и кодирования данных.

In [None]:
!pip install category_encoders
from category_encoders.binary import BinaryEncoder

In [None]:
bn = BinaryEncoder()

In [None]:
# 80/20 hold-out split of the '?'-imputed Adult features, seeded from the
# notebook-level constant. This rebinds the X_train / X_test / y_train / y_test
# names used earlier for the regression dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X_mf, y, test_size=0.2, random_state=RANDOM_STATE
)

In [None]:
# Numeric branch for the final model: same column selector, but scaled with SS,
# the StandardScaler instance created earlier in the notebook.
numerical_selector = FunctionTransformer(lambda X: X[numerical], validate=False)
numerical_preprocessor = Pipeline([("numerical_selector", numerical_selector), ("scaling", SS)])

In [None]:
# Categorical branch for the final model: BinaryEncoder replaces the one-hot encoder.
dummy_selector = FunctionTransformer(lambda X: X[categorical], validate=False)
dummy_preprocessor = Pipeline([("dummy_selector", dummy_selector), ("bn", bn)])

In [None]:
# Rebind `feature_union` and `pipeline`: the pipeline now ends in the gradient
# boosting classifier, so preprocessing is fitted together with the model.
feature_union = FeatureUnion([("numerical_preprocessor", numerical_preprocessor),
("dummy_preprocessor", dummy_preprocessor)])
pipeline = Pipeline([("preprocessing", feature_union), ('modelling', grad_boost_clf)])

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
accuracy_score(y_test, pipeline.predict(X_test))

0.8625243115979118

In [None]:
f1_score(y_test, pipeline.predict(X_test))

0.9132372892305705

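Вывод: использование другого метода кодирования (вместе со StandardScaler) не повысило качество модели; при этом здесь метрики посчитаны на отложенной выборке, а предыдущие — по кросс-валидации, поэтому сравнение приблизительное. Лучшая модель остается: алгоритм GradientBoostingClassifier, метод масштабирования MinMaxScaler, способ кодирования OneHotEncoder c метриками качества: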
Аccuracy:  0.8663
F1:  0.9153
