In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score


from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression,Ridge,Lasso


In [2]:
RANDOM_STATE = 42

results_regression = pd.DataFrame(columns = ['model', 'task', 'R2'])
results_classification = pd.DataFrame(columns = ['model', 'task', 'f1', 'accuracy'])

In [3]:
data = pd.read_csv('boston.csv')
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


1. Разделите выборку на обучающую и тестовую в отношении 80%/20%, предварительно выделив целевую переменную (колонка 'MEDV').

In [4]:
X, y = load_diabetes(return_X_y=True)

X = data.drop('MEDV', axis=1)  # Признаки
y = data['MEDV']  # Целевая переменная

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Вывод размеров полученных выборок
print(f"Training set size: {X_train.shape}; Test set size: {X_test.shape}")


Training set size: (404, 13); Test set size: (102, 13)


2. Обучите стандартную регрессию, а также Ridge и Lasso с параметрами по умолчанию и выведите их R2 на тестовой выборке

In [5]:
#%%time
lr = LinearRegression()
lr.fit(X_train, y_train)
y_lr = lr.predict(X_test)


rid = Ridge(alpha=1.0)
rid.fit(X_train, y_train)
y_rid = rid.predict(X_test)

las = Lasso(alpha=0.1)
las.fit(X_train, y_train)
y_las = las.predict(X_test)

r2_lr = r2_score(y_test, y_lr)
r2_ridge = r2_score(y_test, y_rid)
r2_lasso = r2_score(y_test, y_las)
results_regression.loc[0] = ['LR', 'task2', r2_lr]
results_regression.loc[1] = ['Ridge', 'task2', r2_ridge]
results_regression.loc[2] = ['Lasso', 'task2', r2_lasso]

results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.656706


3. Для Ridge и Lasso подберите коэффициент регуляризации двумя способами 1) GridSearchCV, 2) RidgeCV и LassoCV, в пределах от 10^-5 до 10^5 (по степеням 10). Посчитайте R2 на тестовой выборке по всем моделям и сравните с предыдущими результатами.

In [6]:
from sklearn.linear_model import LassoCV, Ridge, Lasso, RidgeCV
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score

alpha_values = {'alpha': np.logspace(-5, 5, 10)}

#grid для ridge
ridge_g = GridSearchCV(Ridge(random_state=RANDOM_STATE),
                     alpha_values,
                     scoring = 'neg_mean_squared_error',
                     cv = 10)
ridge_g.fit(X_train, y_train)
r2_ridge_grid_search = r2_score(y_test, ridge_g.predict(X_test))

#grid для lasso
lasso_g = GridSearchCV(Lasso(random_state=RANDOM_STATE, max_iter=10000),
                     alpha_values,
                     cv = 10)
lasso_g.fit(X_train, y_train)
r2_lasso_grid_search = r2_score(y_test, lasso_g.predict(X_test))

ridge_cv = RidgeCV(alphas=np.logspace(-5, 5, 10), cv=10).fit(X_train, y_train)
r2_ridge_cv = r2_score(y_test, ridge_cv.predict(X_test))

lasso_cv = LassoCV(alphas=np.logspace(-5, 5, 10), cv=10, max_iter=10000).fit(X_train, y_train)
r2_lasso_cv = r2_score(y_test, lasso_cv.predict(X_test))

results_regression.loc[3] = ['Ridge_GridSearchCV', 'task3', r2_ridge_grid_search]
results_regression.loc[4] = ['RidgeCV', 'task3', r2_ridge_cv]
results_regression.loc[5] = ['Lasso_GridSearchCV', 'task3', r2_lasso_grid_search]
results_regression.loc[6] = ['LassoCV', 'task3', r2_lasso_cv]

results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.656706
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483


4. Проведите масштабирование выборки (используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score

# Пайплайн для Ridge с StandardScaler.
ridge_standard_scaler_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())])
ridge_standard_scaler_pipeline.fit(X_train, y_train)
r2_ridge_standart_scaler = r2_score(y_test, ridge_standard_scaler_pipeline.predict(X_test))

# Пайплайн для Ridge с MinMaxScaler.
ridge_min_max_scaler_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('ridge', Ridge())])
ridge_min_max_scaler_pipeline.fit(X_train, y_train)
r2_ridge_min_max_scaler = r2_score(y_test, ridge_min_max_scaler_pipeline.predict(X_test))

# Пайплайн для Lasso с StandardScaler.
lasso_standard_scaler_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', Lasso())])
lasso_standard_scaler_pipeline.fit(X_train, y_train)
r2_lasso_standart_scaler = r2_score(y_test, lasso_standard_scaler_pipeline.predict(X_test))

# Пайплайн для Lasso с MinMaxScaler.
lasso_min_max_scaler_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('lasso', Lasso())])
lasso_min_max_scaler_pipeline.fit(X_train, y_train)
r2_lasso_min_max_scaler = r2_score(y_test, lasso_min_max_scaler_pipeline.predict(X_test))

results_regression.loc[7] = ['Ridge_StandardScaler', 'task4', r2_ridge_standart_scaler]
results_regression.loc[8] = ['Ridge_MinMaxScaler', 'task4', r2_ridge_min_max_scaler]
results_regression.loc[9] = ['Lasso_StandardScaler', 'task4', r2_lasso_standart_scaler]
results_regression.loc[10] = ['Lasso_MinMaxScaler', 'task4', r2_lasso_min_max_scaler]

results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.656706
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами.

In [8]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score

# Параметры для поиска по сетке.
param_grid = {'alpha': np.logspace(-3, 3, 7)}  # Пример значений для коэффициента регуляризации.

# Создаем и обучаем GridSearchCV для Ridge с StandardScaler.
ridge_standard_scaler_pipeline_cv = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge_cv', GridSearchCV(Ridge(), param_grid, cv=5))
])
ridge_standard_scaler_pipeline_cv.fit(X_train, y_train)
r2_ridge_standart_scaler_cv = r2_score(y_test, ridge_standard_scaler_pipeline_cv.predict(X_test))

# Создаем и обучаем GridSearchCV для Ridge с MinMaxScaler.
ridge_min_max_scaler_pipeline_cv = Pipeline([
    ('scaler', MinMaxScaler()),
    ('ridge_cv', GridSearchCV(Ridge(), param_grid, cv=5))])
ridge_min_max_scaler_pipeline_cv.fit(X_train, y_train)
r2_ridge_min_max_scaler_cv = r2_score(y_test, ridge_min_max_scaler_pipeline_cv.predict(X_test))

# Создаем и обучаем GridSearchCV для Lasso с StandardScaler.
lasso_standard_scaler_pipeline_cv = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso_cv', GridSearchCV(Lasso(), param_grid, cv=5))])
lasso_standard_scaler_pipeline_cv.fit(X_train, y_train)
r2_lasso_standart_scaler_cv = r2_score(y_test, lasso_standard_scaler_pipeline_cv.predict(X_test))

# Создаем и обучаем GridSearchCV для Lasso с MinMaxScaler.
lasso_min_max_scaler_pipeline_cv = Pipeline([
    ('scaler', MinMaxScaler()),
    ('lasso_cv', GridSearchCV(Lasso(), param_grid, cv=5))])
lasso_min_max_scaler_pipeline_cv.fit(X_train, y_train)
r2_lasso_min_max_scaler_cv = r2_score(y_test, lasso_min_max_scaler_pipeline_cv.predict(X_test))

results_regression.loc[11] = ['Ridge_StandardScaler_CV', 'task5', r2_ridge_standart_scaler_cv]
results_regression.loc[12] = ['Ridge_MinMaxScaler_CV', 'task5', r2_ridge_min_max_scaler_cv]
results_regression.loc[13] = ['Lasso_StandardScaler_CV', 'task5', r2_lasso_standart_scaler_cv]
results_regression.loc[14] = ['Lasso_MinMaxScaler_CV', 'task5', r2_lasso_min_max_scaler_cv]

results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.656706
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [9]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score

# Параметры для поиска по сетке.
param_grid = {'alpha': np.logspace(-3, 3, 7)}  # Пример значений для коэффициента регуляризации.

# Создаем и обучаем GridSearchCV для Ridge с StandardScaler.
ridge_standard_scaler_pipeline_cv = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge_cv', GridSearchCV(Ridge(), param_grid, cv=5))])
ridge_standard_scaler_pipeline_cv.fit(X_train, y_train)
r2_ridge_standart_scaler_cv = r2_score(y_test, ridge_standard_scaler_pipeline_cv.predict(X_test))

# Создаем и обучаем GridSearchCV для Ridge с MinMaxScaler.
ridge_min_max_scaler_pipeline_cv = Pipeline([
    ('scaler', MinMaxScaler()),
    ('ridge_cv', GridSearchCV(Ridge(), param_grid, cv=5))])
ridge_min_max_scaler_pipeline_cv.fit(X_train, y_train)
r2_ridge_min_max_scaler_cv = r2_score(y_test, ridge_min_max_scaler_pipeline_cv.predict(X_test))

# Создаем и обучаем GridSearchCV для Lasso с StandardScaler.
lasso_standard_scaler_pipeline_cv = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso_cv', GridSearchCV(Lasso(), param_grid, cv=5))])
lasso_standard_scaler_pipeline_cv.fit(X_train, y_train)
r2_lasso_standart_scaler_cv = r2_score(y_test, lasso_standard_scaler_pipeline_cv.predict(X_test))

# Создаем и обучаем GridSearchCV для Lasso с MinMaxScaler.
lasso_min_max_scaler_pipeline_cv = Pipeline([
    ('scaler', MinMaxScaler()),
    ('lasso_cv', GridSearchCV(Lasso(), param_grid, cv=5))])
lasso_min_max_scaler_pipeline_cv.fit(X_train, y_train)
r2_lasso_min_max_scaler_cv = r2_score(y_test, lasso_min_max_scaler_pipeline_cv.predict(X_test))

results_regression.loc[15] = ['Ridge_StandardScaler_CV', 'task6', r2_ridge_standart_scaler_cv]
results_regression.loc[16] = ['Ridge_MinMaxScaler_CV', 'task6', r2_ridge_min_max_scaler_cv]
results_regression.loc[17] = ['Lasso_StandardScaler_CV', 'task6', r2_lasso_standart_scaler_cv]
results_regression.loc[18] = ['Lasso_MinMaxScaler_CV', 'task6', r2_lasso_min_max_scaler_cv]

results_regression


Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.656706
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


7. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, добавив PolynomialFeatures, посчитайте R2 и сравните с предыдущими результатами.

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, r2_score

X = data.drop('MEDV', axis=1)
y = data['MEDV']

# Сетка параметров для поиска
param_grid = {'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Список для хранения результатов
results = []

# Определяем scorer
scorer = make_scorer(r2_score)

# Создаем пайплайны для разных моделей и масштабирования
for name, model, scaler in [
    ('Ridge_StandardScaler', Ridge(), StandardScaler()),
    ('Ridge_MinMaxScaler', Ridge(), MinMaxScaler()),
    ('Lasso_StandardScaler', Lasso(), StandardScaler()),
    ('Lasso_MinMaxScaler', Lasso(), MinMaxScaler())]:
    pipeline = Pipeline([
        ('scale', scaler),
        ('poly', PolynomialFeatures()),
        ('regressor', model)])
    
    # Настраиваем GridSearchCV
    search = GridSearchCV(pipeline, param_grid, scoring=scorer, cv=5, refit=True)
    search.fit(X, y)
    
    results.append({
        'model': name,
        'task': 'task7',
        'R2': search.best_score_})

i=0
for result in results:
    results_regression.loc[len(results_regression)] = results[i]
    i += 1

results_regression

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.656706
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


8. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2.

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score

# Создание списка моделей и их параметров
models_and_parameters = {
    'Ridge': {
        'model': Ridge(),
        'params': {'regressor__alpha': np.logspace(-4, 4, 20)}
    },
    'Lasso': {
        'model': Lasso(max_iter=10000),
        'params': {'regressor__alpha': np.logspace(-4, 4, 20)}
    }
}

# Создание параметров для GridSearchCV
param_grid = {
    'scaler': [StandardScaler(), MinMaxScaler()],
    'poly__degree': [1, 2, 3],
    'regressor': [models_and_parameters['Ridge']['model'], models_and_parameters['Lasso']['model']],
    'regressor__alpha': np.logspace(-4, 4, 20)
}

# Создание конвейера
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('regressor', Ridge())])

# Настройка GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
grid_search.fit(X, y)

# Извлечение наилучших параметров и R2
best_params = grid_search.best_params_
r2_best_model = grid_search.best_score_

# Обновление results_regression с наилучшими параметрами и R2
results_regression.loc[23] = ['Best_Model', 'task8', r2_best_model]

# Вывод результатов
print('Параметры лучшей модели:\n', best_params)
print('Лучшее значение R2:\n', r2_best_model)



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Параметры лучшей модели:
 {'poly__degree': 2, 'regressor': Ridge(), 'regressor__alpha': 1.623776739188721, 'scaler': MinMaxScaler()}
Лучшее значение R2:
 0.6596655336212737


In [12]:
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.656706
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


In [13]:
data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


9. Разделите выборку на признаки и целевую переменную(колонка class). Замените целевую переменную на числовые значения ('<=50K' - 1, '>50K' - 0).

In [14]:
import pandas as pd

# Разделение данных на признаки и целевую переменную
features = data.drop('class', axis=1)  # Удаление колонки 'class'
target = data['class']  # Извлечение колонки 'class' как целевой переменной

# Замена значений целевой переменной на числовые
target = target.map({'<=50K': 1, '>50K': 0})

# Проверка замены
print(target.head())

0    1
1    1
2    1
3    1
4    1
Name: class, dtype: int64


10. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [15]:
from sklearn.metrics import accuracy_score, f1_score

# Определение самого частого класса
most_frequent_class = target.mode()[0]

# Создание массива предсказаний
predictions = [most_frequent_class] * len(target)

# Вычисление метрик
acc_most_frequent = accuracy_score(target, predictions)
f1_most_frequent = f1_score(target, predictions, pos_label=most_frequent_class)

# Вывод метрик
print(f'Accuracy of most frequent class predictor: {acc_most_frequent}')
print(f'F1 Score of most frequent class predictor: {f1_most_frequent}')

results_classification.loc[0] = ['Most Frequent class', 'task10', f1_most_frequent, acc_most_frequent]

Accuracy of most frequent class predictor: 0.7607182343065395
F1 Score of most frequent class predictor: 0.8640999104619929


11. Выясните, присутствуют ли в данных пропуски. Если присутствуют, заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [16]:
import pandas as pd
from sklearn.impute import SimpleImputer

data = pd.read_csv('adult.csv')

# Проверка на наличие пропусков в данных
missing_values = data.isnull().sum()
print("Количество пропусков в каждом столбце:\n", missing_values)

if missing_values.any():
  
    imputer = SimpleImputer(strategy='most_frequent')
    

    data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
    
    # Замена оригинального DataFrame обработанными данными
    data = data_imputed

print("Количество пропусков после заполнения:\n", data.isnull().sum())

Количество пропусков в каждом столбце:
 age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class             0
dtype: int64
Количество пропусков после заполнения:
 age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class             0
dtype: int64


12. Выберите колонки с числовыми и категориальными переменными (используя возможности pandas).

In [17]:
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
print("Числовые колонки:", numerical_cols)

categorical_cols = data.select_dtypes(include=['object', 'category']).columns
print("Категориальные колонки:", categorical_cols)

Числовые колонки: Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')
Категориальные колонки: Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'class'],
      dtype='object')


13. Создайте пайплайн по обработке числовых и категориальных значений колонок (используйте OneHotEncoder,MinMaxScaler) и посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, make_scorer

# Определение числовых и категориальных колонок
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns
categorical_features = data.select_dtypes(include=['object']).columns

# Создание пайплайнов для числовых и категориальных данных
numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Объединение пайплайнов с помощью ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Создание пайплайна модели
classifiers = [
    ('LogisticRegression', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('LinearSVC', LinearSVC(max_iter=10000, random_state=RANDOM_STATE))
]

for name, classifier in classifiers:
    # Создаем пайплайн с предобработчиком и классификатором
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', classifier)])

    # Подготовка метрик
    acc = cross_val_score(pipeline, data, target, cv=5, scoring='accuracy').mean()
    f1 = cross_val_score(pipeline, data, target, cv=5, scoring=make_scorer(f1_score, average='weighted')).mean()

    # Добавление результатов в DataFrame
    results_classification.loc[len(results_classification)] = [name, 'task13', f1, acc]

results_classification



Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,1.0,1.0
2,KNeighborsClassifier,task13,0.994617,0.994636
3,LinearSVC,task13,1.0,1.0


14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями, (испольуйте SimpleImputer). Посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

In [19]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# Определение числовых и категориальных столбцов
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data.select_dtypes(include=['object', 'category']).columns

# Создание пайплайнов для числовых и категориальных данных
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Создание ColumnTransformer для применения трансформации к соответствующим столбцам
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Список моделей для оценки
models = [
    ('LogisticRegression_impute', LogisticRegression(max_iter=1000)),
    ('KNeighborsClassifier_impute', KNeighborsClassifier()),
    ('LinearSVC_impute', LinearSVC(max_iter=10000))
]


# Вычисление cross_val_score для каждой модели
for i, (name, model) in enumerate(models):
    # Создание и обучение пайплайна с предобработкой и моделью
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    # Вычисление метрик
    acc = cross_val_score(pipeline, data, target, cv=5, scoring='accuracy').mean()
    f1 = cross_val_score(pipeline, data, target, cv=5, scoring='f1_macro').mean()

    # Обновление результатов
    results_classification.loc[len(results_classification)] = [name, 'task14', f1, acc]

print(results_classification)



                         model    task        f1  accuracy
0          Most Frequent class  task10  0.864100  0.760718
1           LogisticRegression  task13  1.000000  1.000000
2         KNeighborsClassifier  task13  0.994617  0.994636
3                    LinearSVC  task13  1.000000  1.000000
4    LogisticRegression_impute  task14  1.000000  1.000000
5  KNeighborsClassifier_impute  task14  0.992580  0.994636
6             LinearSVC_impute  task14  1.000000  1.000000


15. Посчитайте cross_val_score по тем же алгоритмам и метрикам, если просто удалить значения '?'.

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer, f1_score, accuracy_score

data = pd.read_csv('boston.csv')

# Заменяем '?' на NaN
data.replace('?', np.nan, inplace=True)

# Удаление строк с пропусками
data.dropna(inplace=True)

# последний столбец как целевая переменная
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Преобразование целевой переменной в двоичный формат для классификации, с использованием медианы как порога
median_value = y.median()
y = (y > median_value).astype(int)

# Определение моделей
models = {
    'LogisticRegression': LogisticRegression(random_state=RANDOM_STATE),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'LinearSVC': LinearSVC(random_state=RANDOM_STATE, max_iter=10000)
}

# Определение метрик
scoring = {'f1': make_scorer(f1_score), 'accuracy': make_scorer(accuracy_score)}

# Вычисление cross_val_score для каждой модели и метрики
for name, model in models.items():
    f1_scores = cross_val_score(model, X, y, cv=5, scoring=scoring['f1'])
    acc_scores = cross_val_score(model, X, y, cv=5, scoring=scoring['accuracy'])
    
    # Средние значения метрик
    f1_avg = np.mean(f1_scores)
    acc_avg = np.mean(acc_scores)
    
    # Добавление результатов в DataFrame
    if name == 'LogisticRegression':
        results_classification.loc[7] = [f'{name}_delete_missings', 'task15', f1_avg, acc_avg]
    elif name == 'KNeighborsClassifier':
        results_classification.loc[8] = [f'{name}_delete_missings', 'task15', f1_avg, acc_avg]
    elif name == 'LinearSVC':
        results_classification.loc[9] = [f'{name}_delete_missings', 'task15', f1_avg, acc_avg]

print(results_classification.loc[[7, 8, 9]])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                                  model    task        f1  accuracy
7    LogisticRegression_delete_missings  task15  0.830208  0.849757
8  KNeighborsClassifier_delete_missings  task15  0.676127  0.688158
9             LinearSVC_delete_missings  task15  0.677232  0.778587




16. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier на данных с замененными значениями '?' на самые частые значения.

In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.impute import SimpleImputer

data = pd.read_csv('boston.csv')
data.replace('?', np.nan, inplace=True)

# Заполнение пропусков самыми частыми значениями
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Последний столбец - это целевая переменная
X = data_imputed.iloc[:, :-1]
y = data_imputed.iloc[:, -1].astype('int')

# Словарь моделей
models = {
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Расчет cross_val_score для каждой модели
for model_name, model in models.items():
    # Кросс-валидация для f1 метрики
    f1_scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
    # Кросс-валидация для accuracy метрики
    acc_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    
    # Заполнение результатов
    results_classification.loc[len(results_classification)]  = [model_name,'task16', np.mean(f1_scores),np.mean(acc_scores)]

print(results_classification)



                                   model    task        f1  accuracy
0                    Most Frequent class  task10  0.864100  0.760718
1                     LogisticRegression  task13  1.000000  1.000000
2                   KNeighborsClassifier  task13  0.994617  0.994636
3                              LinearSVC  task13  1.000000  1.000000
4              LogisticRegression_impute  task14  1.000000  1.000000
5            KNeighborsClassifier_impute  task14  0.992580  0.994636
6                       LinearSVC_impute  task14  1.000000  1.000000
7     LogisticRegression_delete_missings  task15  0.830208  0.849757
8   KNeighborsClassifier_delete_missings  task15  0.676127  0.688158
9              LinearSVC_delete_missings  task15  0.677232  0.778587
10                RandomForestClassifier  task16  0.104893  0.138342
11            GradientBoostingClassifier  task16  0.085815  0.106717


In [22]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,1.0,1.0
2,KNeighborsClassifier,task13,0.994617,0.994636
3,LinearSVC,task13,1.0,1.0
4,LogisticRegression_impute,task14,1.0,1.0
5,KNeighborsClassifier_impute,task14,0.99258,0.994636
6,LinearSVC_impute,task14,1.0,1.0
7,LogisticRegression_delete_missings,task15,0.830208,0.849757
8,KNeighborsClassifier_delete_missings,task15,0.676127,0.688158
9,LinearSVC_delete_missings,task15,0.677232,0.778587


17. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score

# Определение категориальных и числовых признаков
categorical_features = data.select_dtypes(include=['object']).columns
numeric_features = data.select_dtypes(exclude=['object']).columns

# Предполагаем, что последний столбец - это целевая переменная
X = data.iloc[:, :-1]
y = data.iloc[:, -1].astype('int')  # предполагаем, что это классификация

# Определение предобработки для числовых и категориальных данных
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Сборка ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Список моделей для оценки
classifiers = [
    ('RandomForestClassifier', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=RANDOM_STATE))
]

# Сетка для поиска по сетке
grid_params = [{
    'classifier': [clf],
} for _, clf in classifiers]

# Конвейер с предварительной обработкой и классификатором
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())  # Placeholder, будет заменен GridSearchCV
])

# Поиск по сетке
grid_search = GridSearchCV(pipeline, grid_params, cv=5, scoring='accuracy', return_train_score=True)
grid_search.fit(X, y)

# Лучшие параметры и оценки
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
best_f1 = f1_score(y, best_estimator.predict(X), average='macro')
best_accuracy = accuracy_score(y, best_estimator.predict(X))

# Заполнение результатов
results_classification.loc[12] = [best_params['classifier'].__class__.__name__, 'task17', best_f1, best_accuracy]

print('Параметры лучшей модели:\n', best_params)
print('Лучший F1 Score:', best_f1)
print('Лучшая точность:', best_accuracy)



ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\indexes\base.py", line 3791, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 152, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 181, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 7080, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 7088, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'MEDV'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\__init__.py", line 480, in _get_column_indices
    col_idx = all_columns.get_loc(col)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\indexes\base.py", line 3798, in get_loc
    raise KeyError(key) from err
KeyError: 'MEDV'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_set_output.py", line 273, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\compose\_column_transformer.py", line 906, in fit_transform
    self._validate_column_callables(X)
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\compose\_column_transformer.py", line 496, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Stas\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\__init__.py", line 488, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe
