<a href="https://colab.research.google.com/github/UznetDev/Data-science-home-work/blob/main/02_Avg%2C_2024_home_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import *
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE, RFECV
import warnings
import re
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('car_prices.csv')
df.shape

(54273, 13)

In [3]:
df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [4]:
for col in df.columns:
    print(f'{col}: {df[col].nunique()}')

id: 54273
brand: 53
model: 1827
model_year: 34
milage: 3212
fuel_type: 7
engine: 1061
transmission: 46
ext_col: 260
int_col: 124
accident: 2
clean_title: 1
price: 1481


#### Kerakliy funksiyalarni yozamiz

In [5]:
def parse_engine_info(engine):
    # engine ustunidan ma'lumotlarni ajratib olamiz
    if pd.isna(engine):
        return pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan])

    hp = re.search(r'(\d+\.?\d*)HP', engine)
    liter = re.search(r'(\d+\.?\d*)L', engine)
    motor = re.search(r'(Straight|V)\s*\d*', engine)
    cylinder = re.search(r'(\d+)\s*Cylinder', engine)
    fuel = re.search(r'(Gasoline|Hybrid|Flex|Diesel)', engine)

    return pd.Series([
        hp.group(1) if hp else np.nan,
        liter.group(1) if liter else np.nan,
        motor.group(0) if motor else np.nan,
        cylinder.group(1) if cylinder else np.nan,
        fuel.group(1) if fuel else np.nan
    ])


def fill_nan_mean(df, col, columns):
    # numeric ustunlar uchun fill nall funcsiya
    print(f"Col: {col}, Columns: {columns}, Nan: {df[col].isna().sum()}", end='')
    value = df[df[col].notna()].groupby(by=columns)[col].mean().reset_index()
    df = pd.merge(df, value, on=columns, how='left', suffixes=('', '_mean'))
    df[col] = df[col].fillna(df[col + '_mean'])
    df.drop(columns=[col + '_mean'], inplace=True)
    print(f", After Clean: {df[col].isna().sum()}")
    return df


def fill_nan_mode(df, col, columns):
    # object ustunlar uchun fill nall funcsiya
    print(f"Col: {col}, Columns: {columns}, Nan: {df[col].isna().sum()}", end='')
    mode_values = df.groupby(columns)[col].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else pd.NA).reset_index()
    df = pd.merge(df, mode_values, on=columns, how='left', suffixes=('', '_mode'))
    df[col] = df[col].fillna(df[col + '_mode'])
    df.drop(columns=[col + '_mode'], inplace=True)

    print(f", After Clean: {df[col].isna().sum()}")
    return df

def to_numeric(x):
    try:
        return float(x)
    except ValueError as err:
        print(err)
        return np.nan


def ordinal_encoder(df, column, on):
    # object ustunlni 'price' ustuniga soslanib sortlayb raqamga aylantiradi va yangi ustun yaratadi
    avg_prices = df.groupby(column)[on].mean().reset_index()
    avg_prices.columns = [column, 'avg_price']
    avg_prices = avg_prices.sort_values(by='avg_price').reset_index(drop=True)
    avg_prices[column+'_rank'] = avg_prices.index + 1
    return pd.merge(df, avg_prices[[column, column+'_rank']], on=column, how='left')

#### engine ustunidagi ma'lumotlarni ajratib olamiz

In [6]:
new_columns = ['hp', 'litr', 'motor', 'Cylinder', 'fuel']
df[new_columns] = df['engine'].apply(parse_engine_info)

#### bizga keragsiz data

In [7]:
df = df.drop(columns=['clean_title', 'id'])

In [8]:
df.head(1)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,hp,litr,motor,Cylinder,fuel
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,375.0,3.5,V6,6,Gasoline


In [9]:
df[new_columns].isna().sum()

Unnamed: 0,0
hp,4057
litr,606
motor,29640
Cylinder,4175
fuel,4258


#### Nan qiymatlarni tuldirib chiqamiz

In [10]:
df['hp'] = df['hp'].agg(to_numeric)

df = fill_nan_mean(df.copy(), 'hp', ['motor', 'Cylinder', 'litr'])
df = fill_nan_mean(df.copy(), 'hp', ['motor', 'litr'])
df = fill_nan_mean(df.copy(), 'hp', ['brand', 'model'])
df = fill_nan_mean(df.copy(), 'hp', ['litr', 'fuel_type'])
df = fill_nan_mean(df.copy(), 'hp', ['model'])
df = fill_nan_mean(df.copy(), 'hp', ['motor'])
df = fill_nan_mean(df.copy(), 'hp', ['price', 'brand'])
df = fill_nan_mean(df.copy(), 'hp', ['price', 'fuel_type'])
df = fill_nan_mean(df.copy(), 'hp', ['engine'])



print(df['hp'].isna().sum())

Col: hp, Columns: ['motor', 'Cylinder', 'litr'], Nan: 4057, After Clean: 4055
Col: hp, Columns: ['motor', 'litr'], Nan: 4055, After Clean: 2809
Col: hp, Columns: ['brand', 'model'], Nan: 2809, After Clean: 369
Col: hp, Columns: ['litr', 'fuel_type'], Nan: 369, After Clean: 119
Col: hp, Columns: ['model'], Nan: 119, After Clean: 111
Col: hp, Columns: ['motor'], Nan: 111, After Clean: 107
Col: hp, Columns: ['price', 'brand'], Nan: 107, After Clean: 23
Col: hp, Columns: ['price', 'fuel_type'], Nan: 23, After Clean: 14
Col: hp, Columns: ['engine'], Nan: 14, After Clean: 0
0


In [11]:
df['litr'] = df['litr'].agg(to_numeric)


df = fill_nan_mean(df.copy(), 'litr', ['motor', 'Cylinder', 'hp'])
# df = fill_nan_mean(df.copy(), 'litr', ['Cylinder', 'hp'])
# df = fill_nan_mean(df.copy(), 'litr', ['motor', 'hp'])
df = fill_nan_mean(df.copy(), 'litr', ['hp', 'fuel_type'])
df = fill_nan_mean(df.copy(), 'litr', ['model'])
df = fill_nan_mean(df.copy(), 'litr', ['engine'])
df = fill_nan_mean(df.copy(), 'litr', ['price', 'fuel_type'])
df = fill_nan_mean(df.copy(), 'litr', ['price', 'transmission'])


print(df['litr'].isna().sum())

Col: litr, Columns: ['motor', 'Cylinder', 'hp'], Nan: 606, After Clean: 606
Col: litr, Columns: ['hp', 'fuel_type'], Nan: 606, After Clean: 396
Col: litr, Columns: ['model'], Nan: 396, After Clean: 124
Col: litr, Columns: ['engine'], Nan: 124, After Clean: 46
Col: litr, Columns: ['price', 'fuel_type'], Nan: 46, After Clean: 3
Col: litr, Columns: ['price', 'transmission'], Nan: 3, After Clean: 0
0


In [12]:
df['Cylinder'] = df['Cylinder'].agg(to_numeric)

df = fill_nan_mean(df.copy(), 'Cylinder', ['litr', 'hp'])
df = fill_nan_mean(df.copy(), 'Cylinder', ['litr', 'model'])
df = fill_nan_mean(df.copy(), 'Cylinder', ['hp', 'model'])
df = fill_nan_mean(df.copy(), 'Cylinder', ['model'])
df = fill_nan_mean(df.copy(), 'Cylinder', ['hp', 'fuel_type'])
df = fill_nan_mean(df.copy(), 'Cylinder', ['price', 'fuel_type'])
df = fill_nan_mean(df.copy(), 'Cylinder', ['price', 'transmission'])
df = fill_nan_mean(df.copy(), 'Cylinder', ['litr', 'fuel_type'])
df = fill_nan_mean(df.copy(), 'Cylinder', ['engine'])


print(df['Cylinder'].isna().sum())

Col: Cylinder, Columns: ['litr', 'hp'], Nan: 4175, After Clean: 3772
Col: Cylinder, Columns: ['litr', 'model'], Nan: 3772, After Clean: 1383
Col: Cylinder, Columns: ['hp', 'model'], Nan: 1383, After Clean: 1033
Col: Cylinder, Columns: ['model'], Nan: 1033, After Clean: 632
Col: Cylinder, Columns: ['hp', 'fuel_type'], Nan: 632, After Clean: 214
Col: Cylinder, Columns: ['price', 'fuel_type'], Nan: 214, After Clean: 32
Col: Cylinder, Columns: ['price', 'transmission'], Nan: 32, After Clean: 8
Col: Cylinder, Columns: ['litr', 'fuel_type'], Nan: 8, After Clean: 2
Col: Cylinder, Columns: ['engine'], Nan: 2, After Clean: 0
0


In [13]:
df.columns

Index(['brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'price', 'hp', 'litr',
       'motor', 'Cylinder', 'fuel'],
      dtype='object')

In [14]:
df.head(1)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,hp,litr,motor,Cylinder,fuel
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,375.0,3.5,V6,6.0,Gasoline


In [15]:
df = fill_nan_mode(df.copy(), 'motor', ['litr', 'hp', 'Cylinder', 'model'])
df = fill_nan_mode(df.copy(), 'motor', ['litr', 'hp', 'Cylinder'])
df = fill_nan_mode(df.copy(), 'motor', ['litr', 'hp'])
df = fill_nan_mode(df.copy(), 'motor', ['model'])
df = fill_nan_mode(df.copy(), 'motor', ['engine'])
df = fill_nan_mode(df.copy(), 'motor', ['fuel_type', 'hp'])
df = fill_nan_mode(df.copy(), 'motor', ['litr', 'Cylinder'])
df = fill_nan_mode(df.copy(), 'motor', ['Cylinder', 'price'])
df = fill_nan_mode(df.copy(), 'motor', ['Cylinder', 'brand'])
df = fill_nan_mode(df.copy(), 'motor', ['litr', 'brand'])
df = fill_nan_mode(df.copy(), 'motor', ['hp', 'brand'])
df = fill_nan_mode(df.copy(), 'motor', ['hp', 'fuel'])
df = fill_nan_mode(df.copy(), 'motor', ['brand', 'price'])
df = fill_nan_mode(df.copy(), 'motor', ['brand', 'price'])
df = fill_nan_mode(df.copy(), 'motor', ['brand', 'fuel_type'])


print(df['motor'].isna().sum())

Col: motor, Columns: ['litr', 'hp', 'Cylinder', 'model'], Nan: 29640, After Clean: 28986
Col: motor, Columns: ['litr', 'hp', 'Cylinder'], Nan: 28986, After Clean: 21847
Col: motor, Columns: ['litr', 'hp'], Nan: 21847, After Clean: 21438
Col: motor, Columns: ['model'], Nan: 21438, After Clean: 6649
Col: motor, Columns: ['engine'], Nan: 6649, After Clean: 1320
Col: motor, Columns: ['fuel_type', 'hp'], Nan: 1320, After Clean: 522
Col: motor, Columns: ['litr', 'Cylinder'], Nan: 522, After Clean: 76
Col: motor, Columns: ['Cylinder', 'price'], Nan: 76, After Clean: 66
Col: motor, Columns: ['Cylinder', 'brand'], Nan: 66, After Clean: 52
Col: motor, Columns: ['litr', 'brand'], Nan: 52, After Clean: 40
Col: motor, Columns: ['hp', 'brand'], Nan: 40, After Clean: 12
Col: motor, Columns: ['hp', 'fuel'], Nan: 12, After Clean: 11
Col: motor, Columns: ['brand', 'price'], Nan: 11, After Clean: 4
Col: motor, Columns: ['brand', 'price'], Nan: 4, After Clean: 4
Col: motor, Columns: ['brand', 'fuel_type']

In [16]:
df = fill_nan_mode(df.copy(), 'fuel', ['litr', 'hp', 'Cylinder', 'model'])
df = fill_nan_mode(df.copy(), 'fuel', ['litr', 'hp', 'model'])
df = fill_nan_mode(df.copy(), 'fuel', ['hp', 'model'])
df = fill_nan_mode(df.copy(), 'fuel', ['model'])
df = fill_nan_mode(df.copy(), 'fuel', ['motor', 'hp'])
df = fill_nan_mode(df.copy(), 'fuel', ['motor', 'hp'])
df = fill_nan_mode(df.copy(), 'fuel', ['motor', 'Cylinder'])
df = fill_nan_mode(df.copy(), 'fuel', ['motor', 'litr'])
df = fill_nan_mode(df.copy(), 'fuel', ['motor'])

print(df['fuel'].isna().sum())

Col: fuel, Columns: ['litr', 'hp', 'Cylinder', 'model'], Nan: 4258, After Clean: 3855
Col: fuel, Columns: ['litr', 'hp', 'model'], Nan: 3855, After Clean: 3756
Col: fuel, Columns: ['hp', 'model'], Nan: 3756, After Clean: 3515
Col: fuel, Columns: ['model'], Nan: 3515, After Clean: 787
Col: fuel, Columns: ['motor', 'hp'], Nan: 787, After Clean: 304
Col: fuel, Columns: ['motor', 'hp'], Nan: 304, After Clean: 304
Col: fuel, Columns: ['motor', 'Cylinder'], Nan: 304, After Clean: 182
Col: fuel, Columns: ['motor', 'litr'], Nan: 182, After Clean: 58
Col: fuel, Columns: ['motor'], Nan: 58, After Clean: 0
0


In [17]:
df.head(1)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,hp,litr,motor,Cylinder,fuel
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,375.0,3.5,V6,6.0,Gasoline


In [18]:
df[new_columns].isna().sum()

Unnamed: 0,0
hp,0
litr,0
motor,0
Cylinder,0
fuel,0


In [19]:
df.select_dtypes(include='object').columns

Index(['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col',
       'int_col', 'accident', 'motor', 'fuel'],
      dtype='object')

In [20]:
for col in df.select_dtypes(include='object').columns:
    df = ordinal_encoder(df.copy(), col, 'price')

In [21]:
df.head(1)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,...,brand_rank,model_rank,fuel_type_rank,engine_rank,transmission_rank,ext_col_rank,int_col_rank,accident_rank,motor_rank,fuel_rank
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,...,31,1256,4,821,28,91,14,2,1,3


### Birinchi model

In [22]:
columns = ['model_year',
           'milage',
           'hp',
           'litr',
           'Cylinder',
           'brand_rank',
           'model_rank',
           'fuel_type_rank',
           'engine_rank',
           'transmission_rank',
           'ext_col_rank',
           'int_col_rank',
           'accident_rank',
           'motor_rank',
           'fuel_rank'
           ]

X = df[columns]
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.37, random_state=125)

model = LinearRegression()

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

CV RMSE: 68040.11296286967
RMSE: 69944.68088403609
MAE: 19389.996709298433
R2: 0.1094867351823049


### PolynomialFeatures

In [23]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.37, random_state=125)

model = LinearRegression()

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

CV RMSE: 67275.41922755181
RMSE: 68974.20169206391
MAE: 18491.276450056685
R2: 0.1340269588226396


#### Outlierlarni tashlab kuramiz

In [24]:
data = df[df['price'] <= df['price'].quantile(0.95)]

In [25]:
X = data[columns]
y = data['price']

In [26]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.37, random_state=125)

model = LinearRegression()

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

CV RMSE: 12015.062164884188
RMSE: 11961.465939967047
MAE: 8395.476871320328
R2: 0.6362013866097889


In [27]:
linear_model = LinearRegression()
linear_model.fit(X, y)

tail = TheilSenRegressor()
tail.fit(X, y)

huber = HuberRegressor()
huber.fit(X, y)

ransac = RANSACRegressor()
ransac.fit(X, y)


linear_rmse = np.sqrt(mean_squared_error(y, linear_model.predict(X)))
tail_rmse = np.sqrt(mean_squared_error(y, tail.predict(X)))
huber_rmse = np.sqrt(mean_squared_error(y, huber.predict(X)))
ransac_rmse = np.sqrt(mean_squared_error(y, ransac.predict(X)))

mae_linear = mean_absolute_error(y, linear_model.predict(X))
mae_tail = mean_absolute_error(y, tail.predict(X))
mae_huber = mean_absolute_error(y, huber.predict(X))
mae_ransac = mean_absolute_error(y, ransac.predict(X))


median_error = median_absolute_error(y, linear_model.predict(X))
median_error_tail = median_absolute_error(y, tail.predict(X))
median_error_huber = median_absolute_error(y, huber.predict(X))
median_error_ransac = median_absolute_error(y, ransac.predict(X))

print('Median Error Linear:', median_error)
print('Linear MAE:', mae_linear)
print('Linear RMSE:', linear_rmse)
print()
print('Median Error RANSAC:', median_error_ransac)
print('RANSAC MAE:', mae_ransac)
print('RANSAC RMSE:', ransac_rmse)
print()
print('Median Error Tail:', median_error_tail)
print('Theil MAE:', mae_tail)
print('Theil RMSE:', tail_rmse)
print()
print('Median Error Huber:', median_error_huber)
print('Huber MAE:', mae_huber)
print('Huber RMSE:', huber_rmse)

Median Error Linear: 6964.211005050922
Linear MAE: 9370.45954890787
Linear RMSE: 12738.703244700333

Median Error RANSAC: 7587.266779759084
RANSAC MAE: 11003.070403337191
RANSAC RMSE: 15512.844554591966

Median Error Tail: 7899.997946936797
Theil MAE: 38315.01397900745
Theil RMSE: 107701.21258565316

Median Error Huber: 6712.668363424516
Huber MAE: 9544.071345701945
Huber RMSE: 13304.318997229213


#### PolynomialFeatures

In [28]:
linear_model = LinearRegression()
linear_model.fit(X_poly, y)

tail = TheilSenRegressor()
tail.fit(X_poly, y)

huber = HuberRegressor()
huber.fit(X_poly, y)

ransac = RANSACRegressor()
ransac.fit(X_poly, y)


linear_rmse = np.sqrt(mean_squared_error(y, linear_model.predict(X_poly)))
tail_rmse = np.sqrt(mean_squared_error(y, tail.predict(X_poly)))
huber_rmse = np.sqrt(mean_squared_error(y, huber.predict(X_poly)))
ransac_rmse = np.sqrt(mean_squared_error(y, ransac.predict(X_poly)))

mae_linear = mean_absolute_error(y, linear_model.predict(X_poly))
mae_tail = mean_absolute_error(y, tail.predict(X_poly))
mae_huber = mean_absolute_error(y, huber.predict(X_poly))
mae_ransac = mean_absolute_error(y, ransac.predict(X_poly))


median_error = median_absolute_error(y, linear_model.predict(X_poly))
median_error_tail = median_absolute_error(y, tail.predict(X_poly))
median_error_huber = median_absolute_error(y, huber.predict(X_poly))
median_error_ransac = median_absolute_error(y, ransac.predict(X_poly))

print('Median Error Linear:', median_error)
print('Linear MAE:', mae_linear)
print('Linear RMSE:', linear_rmse)
print()
print('Median Error RANSAC:', median_error_ransac)
print('RANSAC MAE:', mae_ransac)
print('RANSAC RMSE:', ransac_rmse)
print()
print('Median Error Tail:', median_error_tail)
print('Theil MAE:', mae_tail)
print('Theil RMSE:', tail_rmse)
print()
print('Median Error Huber:', median_error_huber)
print('Huber MAE:', mae_huber)
print('Huber RMSE:', huber_rmse)

Median Error Linear: 5758.6183851361275
Linear MAE: 8405.499635176066
Linear RMSE: 11940.658125018634

Median Error RANSAC: 10887.509486973286
RANSAC MAE: 749234.3447910015
RANSAC RMSE: 2979160.5543561485

Median Error Tail: 6445.324210666469
Theil MAE: 3353220.893850126
Theil RMSE: 14098021.722371379

Median Error Huber: 6178.496940543995
Huber MAE: 9032.010376761316
Huber RMSE: 12771.829826063731


#### RFECV orqaliy eng yaxshi parametrlarni aniqlab olamiz


In [30]:
X = data[columns]
y = data['price']
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names_out(X.columns))


cv = KFold(n_splits=6, shuffle=True, random_state=42)

linear_model = LinearRegression()

rfecv = RFECV(estimator=linear_model, step=1, cv=cv, scoring='neg_mean_squared_error', verbose=2)

rfecv.fit(X_poly, y)
optimal_num_features = rfecv.n_features_
selected_features = X_poly.columns[rfecv.support_]

print("Optimal number of features:", optimal_num_features)
print("Selected features:", selected_features.tolist())

Fitting estimator with 135 features.
Fitting estimator with 134 features.
Fitting estimator with 133 features.
Fitting estimator with 132 features.
Fitting estimator with 131 features.
Fitting estimator with 130 features.
Fitting estimator with 129 features.
Fitting estimator with 128 features.
Fitting estimator with 127 features.
Fitting estimator with 126 features.
Fitting estimator with 125 features.
Fitting estimator with 124 features.
Fitting estimator with 123 features.
Fitting estimator with 122 features.
Fitting estimator with 121 features.
Fitting estimator with 120 features.
Fitting estimator with 119 features.
Fitting estimator with 118 features.
Fitting estimator with 117 features.
Fitting estimator with 116 features.
Fitting estimator with 115 features.
Fitting estimator with 114 features.
Fitting estimator with 113 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
F

#### 3 ta modelni 3 xil metrika blan snab kuramiz

In [32]:
X_poly = pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names_out(X.columns))
X = X_poly[selected_features]
y = data['price']

linear_model = LinearRegression()
linear_model.fit(X, y)

tail = TheilSenRegressor()
tail.fit(X, y)

huber = HuberRegressor()
huber.fit(X, y)

ransac = RANSACRegressor()
ransac.fit(X, y)


linear_rmse = np.sqrt(mean_squared_error(y, linear_model.predict(X)))
tail_rmse = np.sqrt(mean_squared_error(y, tail.predict(X)))
huber_rmse = np.sqrt(mean_squared_error(y, huber.predict(X)))
ransac_rmse = np.sqrt(mean_squared_error(y, ransac.predict(X)))

mae_linear = mean_absolute_error(y, linear_model.predict(X))
mae_tail = mean_absolute_error(y, tail.predict(X))
mae_huber = mean_absolute_error(y, huber.predict(X))
mae_ransac = mean_absolute_error(y, ransac.predict(X))


median_error = median_absolute_error(y, linear_model.predict(X))
median_error_tail = median_absolute_error(y, tail.predict(X))
median_error_huber = median_absolute_error(y, huber.predict(X))
median_error_ransac = median_absolute_error(y, ransac.predict(X))

print('Median Error Linear:', median_error)
print('Linear MAE:', mae_linear)
print('Linear RMSE:', linear_rmse)
print()
print('Median Error RANSAC:', median_error_ransac)
print('RANSAC MAE:', mae_ransac)
print('RANSAC RMSE:', ransac_rmse)
print()
print('Median Error Tail:', median_error_tail)
print('Theil MAE:', mae_tail)
print('Theil RMSE:', tail_rmse)
print()
print('Median Error Huber:', median_error_huber)
print('Huber MAE:', mae_huber)
print('Huber RMSE:', huber_rmse)

Median Error Linear: 5758.6183851361275
Linear MAE: 8405.499635176066
Linear RMSE: 11940.658125018634

Median Error RANSAC: 11610.303764402866
RANSAC MAE: 261444.96447900686
RANSAC RMSE: 1062357.6115595729

Median Error Tail: 6458.87349051435
Theil MAE: 3341337.54653622
Theil RMSE: 13988185.990592198

Median Error Huber: 6178.496940543995
Huber MAE: 9032.010376761316
Huber RMSE: 12771.829826063731
