In [3]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Загрузка данных

In [None]:
%%capture
!wget https://www.dropbox.com/s/64ol9q9ssggz6f1/data_ford_price.xlsx

In [4]:
# Read the downloaded workbook into a DataFrame.
data = pd.read_excel(io='data_ford_price.xlsx')

#  Отбор признаков: мотивация

## Предобработка данных

In [6]:
# Keep the target plus the six predictors used in this notebook and discard
# rows with missing values. Chaining .dropna() produces a new frame instead of
# mutating a column slice in place, which avoids pandas' SettingWithCopyWarning
# and keeps the cell safe to re-run.
data = data[['price', 'year', 'cylinders', 'odometer', 'lat', 'long', 'weather']].dropna()

# Target vector and feature matrix.
y = data['price']
x = data.drop(columns='price')

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

## Обучение модели

In [7]:
# Baseline: linear regression fitted on every feature of the training split.
model = LinearRegression().fit(X_train, y_train)
y_predicted = model.predict(X_test)

# Mean absolute error on the held-out rows.
mae = mean_absolute_error(y_test, y_predicted)
print('MAE: %.3f' % mae)

MAE: 4682.957


## Удаление избыточного признака

In [8]:
# Rebuild the feature matrix without the target and the redundant 'lat' column.
# Deriving it from `data` (instead of dropping from `x` in place) means the cell
# can be re-run without a KeyError for the column that is already gone.
x = data.drop(columns=['price', 'lat'])

In [9]:
# Split the current feature matrix again: 30% test share, seed 40.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=40,
)

In [10]:
# Refit the linear model on the current train/test split.
model = LinearRegression().fit(X_train, y_train)
y_predicted = model.predict(X_test)

# Report the mean absolute error on the test rows.
mae = mean_absolute_error(y_test, y_predicted)
print('MAE: %.3f' % mae)

MAE: 4672.930


#  Отбор признаков: классификация методов

## Метод рекурсивного исключения признаков

In [11]:
from sklearn.feature_selection import RFE

In [12]:
# Feature matrix (every column of `data` except the target) and the target itself.
x = data.drop(columns=['price'])
y = data['price']

In [13]:
# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=40,
)

In [14]:
# Recursive feature elimination around a linear model: one feature is removed
# per iteration (step=1) until three are left.
estimator = LinearRegression()
selector = RFE(estimator=estimator, n_features_to_select=3, step=1).fit(X_train, y_train)

# Names of the features that were kept.
selector.get_feature_names_out()

array(['year', 'cylinders', 'lat'], dtype=object)

In [15]:
# Column order of the training features, shown for reference.
X_train.columns

Index(['year', 'cylinders', 'odometer', 'lat', 'long', 'weather'], dtype='object')

In [16]:
# RFE rank of each feature, in the column order of X_train; rank 1 marks the
# features that were selected.
selector.ranking_

array([1, 1, 4, 1, 3, 2])

##  МЕТОДЫ ВЫБОРА ПРИЗНАКОВ НА ОСНОВЕ ФИЛЬТРОВ

In [17]:
from sklearn.feature_selection import SelectKBest, f_regression

In [18]:
# Filter-based selection: score the features with f_regression and keep the
# three with the highest scores.
selector = SelectKBest(score_func=f_regression, k=3).fit(X_train, y_train)

# Names of the features that were kept.
selector.get_feature_names_out()

array(['year', 'cylinders', 'odometer'], dtype=object)

# Example 9.5

In [19]:
from sklearn.feature_selection import RFE
 
# Pick the three feature columns chosen by RFE; they are used for training below.
estimator = LinearRegression()
selector = RFE(estimator=estimator, n_features_to_select=3, step=1).fit(X_train, y_train)

# NOTE(review): despite its name, y_train_predict holds the training features
# reduced to the selected columns (the result of transform), not predictions.
y_train_predict = selector.transform(X_train)

selector.get_feature_names_out()


array(['year', 'cylinders', 'lat'], dtype=object)

In [20]:
# Feature subset reported by RFE.
x_rfe = x.loc[:, ['year', 'cylinders', 'lat']]

In [21]:
# Train/test split for the RFE feature subset.
# NOTE(review): this split uses random_state=42, while the selector was fitted on
# the random_state=40 split, so test rows here have likely been seen during
# feature selection — confirm this is intended.
X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = train_test_split(
    x_rfe,
    y,
    test_size=0.3,
    random_state=42,
)

In [23]:
# Train a linear model on the RFE feature subset and predict the test rows.
model_2 = LinearRegression().fit(X_train_rfe, y_train_rfe)
y_predicted = model_2.predict(X_test_rfe)

# Mean absolute error.
# NOTE(review): the label says "thou. $", but the value is printed in the raw
# units of `price` with no scaling — confirm the unit.
print('MAE score: {:.3f} thou. $'.format(
    metrics.mean_absolute_error(y_test_rfe, y_predicted)))

# Mean absolute percentage error, converted from a fraction to percent.
print('MAPE score: {:.3f} %'.format(
    metrics.mean_absolute_percentage_error(y_test_rfe, y_predicted) * 100))

# Coefficient of determination.
print('R2 score: {:.3f}'.format(
    metrics.r2_score(y_test_rfe, y_predicted)))

MAE score: 5206.677 thou. $
MAPE score: 11204.347 %
R2 score: 0.574


In [24]:
# Pick the three feature columns chosen by SelectKBest; they are used for training below.
selector = SelectKBest(score_func=f_regression, k=3).fit(X_train, y_train)

selector.get_feature_names_out()

array(['year', 'cylinders', 'odometer'], dtype=object)

In [25]:
# Feature subset reported by SelectKBest.
x_skb = x.loc[:, ['year', 'cylinders', 'odometer']]

In [26]:
# Train/test split for the SelectKBest feature subset.
# NOTE(review): this split uses random_state=42, while the selector was fitted on
# the random_state=40 split, so test rows here have likely been seen during
# feature selection — confirm this is intended.
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(
    x_skb,
    y,
    test_size=0.3,
    random_state=42,
)

In [27]:
# Train a linear model on the SelectKBest feature subset and predict the test rows.
model_3 = LinearRegression().fit(X_train_B, y_train_B)
y_predicted = model_3.predict(X_test_B)

# Mean absolute error.
# NOTE(review): the label says "thou. $", but the value is printed in the raw
# units of `price` with no scaling — confirm the unit.
print('MAE score: {:.3f} thou. $'.format(
    metrics.mean_absolute_error(y_test_B, y_predicted)))

# Mean absolute percentage error, converted from a fraction to percent.
print('MAPE score: {:.3f} %'.format(
    metrics.mean_absolute_percentage_error(y_test_B, y_predicted) * 100))

# Coefficient of determination.
print('R2 score: {:.3f}'.format(
    metrics.r2_score(y_test_B, y_predicted)))

MAE score: 4800.916 thou. $
MAPE score: 12098.810 %
R2 score: 0.593


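# Вывод:
Метод отбора признаков SelectKBest показал лучший результат на тестовой выборке по MAE (4800.916 против 5206.677) и по R² (0.593 против 0.574); при этом по MAPE лучше оказался RFE (11204.347 % против 12098.810 %).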