<a href="https://colab.research.google.com/github/UznetDev/Data-science-home-work/blob/main/05_Avg_2024_home_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import *
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE, RFECV
import warnings
import re
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
# Load the raw listings from a CSV given by relative path (resolved against the working directory).
df = pd.read_csv('car_prices.csv')
# Show (rows, columns) as the cell output.
df.shape

FileNotFoundError: [Errno 2] No such file or directory: 'car_prices.csv'

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Report how many distinct (non-null) values each column holds.
for column_name, distinct_count in df.nunique().items():
    print(f'{column_name}: {distinct_count}')

#### Kerakli funksiyalarni yozamiz

In [None]:
def parse_engine_info(engine):
    """Split a free-text engine description into five fields.

    Returns a 5-element ``pd.Series`` ordered as
    (horsepower, liters, layout, cylinder count, fuel). Matched values are
    kept as the raw matched text (no numeric conversion). Any part that is
    not found, or a missing ``engine`` value, is ``np.nan``.
    """
    if pd.isna(engine):
        return pd.Series([np.nan] * 5)

    # (pattern, group) pairs in output order. The layout keeps the whole
    # match (group 0, e.g. "V6"); the others keep only the captured part.
    extractors = (
        (r'(\d+\.?\d*)HP', 1),
        (r'(\d+\.?\d*)L', 1),
        (r'(Straight|V)\s*\d*', 0),
        (r'(\d+)\s*Cylinder', 1),
        (r'(Gasoline|Hybrid|Flex|Diesel)', 1),
    )

    fields = []
    for pattern, group in extractors:
        found = re.search(pattern, engine)
        fields.append(found.group(group) if found else np.nan)
    return pd.Series(fields)


def fill_nan_mean(df, col, columns):
    """Fill missing values of a numeric column with the mean of its group.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    col : str
        Numeric column whose NaN values should be filled.
    columns : str or list of str
        Grouping key(s). A row receives the mean of ``col`` over the rows
        sharing its key values; rows whose group has no known value stay NaN.

    Returns
    -------
    pd.DataFrame
        A new frame with the same rows, row order, index and columns as
        ``df``, with ``col`` filled where a group mean was available.

    Prints the NaN count of ``col`` before and after filling.
    """
    print(f"Col: {col}, Columns: {columns}, Nan: {df[col].isna().sum()}", end='')

    # One row per group, computed from the known values only.
    group_means = df[df[col].notna()].groupby(by=columns)[col].mean().reset_index()

    # Group keys are unique on the right side, so the left merge keeps every
    # row of `df` exactly once and in order. It does reset the index, so put
    # the caller's index back to keep the result aligned with `df`.
    filled = pd.merge(df, group_means, on=columns, how='left', suffixes=('', '_mean'))
    filled.index = df.index

    filled[col] = filled[col].fillna(filled[col + '_mean'])
    filled = filled.drop(columns=[col + '_mean'])

    print(f", After Clean: {filled[col].isna().sum()}")
    return filled


def fill_nan_mode(df, col, columns):
    """Fill missing values of a categorical column with the mode of its group.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    col : str
        Column (typically object dtype) whose NaN values should be filled.
    columns : str or list of str
        Grouping key(s). A row receives the most frequent value of ``col``
        among rows sharing its key values; when several values tie, the first
        one returned by ``Series.mode()`` is used. Rows whose group has no
        known value stay missing.

    Returns
    -------
    pd.DataFrame
        A new frame with the same rows, row order, index and columns as
        ``df``, with ``col`` filled where a group mode was available.

    Prints the NaN count of ``col`` before and after filling.
    """
    print(f"Col: {col}, Columns: {columns}, Nan: {df[col].isna().sum()}", end='')

    def _most_frequent(values):
        # Compute the mode once per group; it is empty when the group is all-NaN.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else pd.NA

    mode_values = df.groupby(columns)[col].apply(_most_frequent).reset_index()

    # Group keys are unique on the right side, so the left merge keeps every
    # row of `df` exactly once and in order. It does reset the index, so put
    # the caller's index back to keep the result aligned with `df`.
    filled = pd.merge(df, mode_values, on=columns, how='left', suffixes=('', '_mode'))
    filled.index = df.index

    filled[col] = filled[col].fillna(filled[col + '_mode'])
    filled = filled.drop(columns=[col + '_mode'])

    print(f", After Clean: {filled[col].isna().sum()}")
    return filled

def to_numeric(x):
    """Convert ``x`` to ``float``; on ``ValueError`` print the error and return ``np.nan``."""
    try:
        converted = float(x)
    except ValueError as problem:
        print(problem)
        converted = np.nan
    return converted


def ordinal_encoder(df, column, on):
    """Add ``<column>_rank``: categories ranked by their mean of ``on``.

    Each distinct value of ``column`` gets an integer rank starting at 1,
    assigned in ascending order of the group's mean ``on`` value (so the
    category with the lowest mean gets rank 1).

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    column : str
        Categorical column to encode.
    on : str
        Numeric column whose per-category mean defines the ordering.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` (same index and row order) with ``<column>_rank``
        as its last column. Rows whose ``column`` value is missing get NaN.
        An existing ``<column>_rank`` column is replaced, so calling this
        again on its own output does not create suffixed duplicates.
    """
    rank_col = column + '_rank'

    # Categories ordered by ascending mean of `on`; position + 1 is the rank.
    ordered_means = df.groupby(column)[on].mean().sort_values()
    ranks = pd.Series(range(1, len(ordered_means) + 1), index=ordered_means.index)

    # drop() returns a new frame; errors='ignore' makes a re-run idempotent.
    encoded = df.drop(columns=[rank_col], errors='ignore')
    # Map instead of merge so the caller's index is kept as-is.
    encoded[rank_col] = encoded[column].map(ranks)
    return encoded

#### engine ustunidagi ma'lumotlarni ajratib olamiz

In [None]:
# Target column names for the five parsed fields, in the order parse_engine_info returns them.
new_columns = ['hp', 'litr', 'motor', 'Cylinder', 'fuel']
# Parse every engine description and store the parsed fields in the new columns.
df[new_columns] = df['engine'].apply(parse_engine_info)

#### Bizga keraksiz ustunlar

In [None]:
# Remove two columns that are not used in the rest of the notebook.
# NOTE(review): re-running this cell raises KeyError because the columns are already gone.
df = df.drop(columns=['clean_title', 'id'])

In [None]:
# Show one row to check the frame after the column drop.
df.head(1)

In [None]:
# Count missing values in each of the parsed engine columns.
df[new_columns].isna().sum()

#### NaN qiymatlarni to'ldirib chiqamiz

In [None]:
# The parsed horsepower is still text. Convert it element-wise with .map():
# passing an element-wise callable to Series.agg is deprecated in recent pandas.
df['hp'] = df['hp'].map(to_numeric)

# Fallback chain: each pass fills the hp values that are still missing with
# the mean of a progressively different grouping. Order matters, because
# earlier passes take precedence.
# NOTE(review): the two groupings keyed on 'price' use the prediction target
# to impute a feature (target leakage); confirm this is intended.
hp_group_keys = [
    ['motor', 'Cylinder', 'litr'],
    ['motor', 'litr'],
    ['brand', 'model'],
    ['litr', 'fuel_type'],
    ['model'],
    ['motor'],
    ['price', 'brand'],
    ['price', 'fuel_type'],
    ['engine'],
]
# fill_nan_mean builds its result from a merge and returns a new frame, so no
# defensive .copy() is needed on the way in.
for group_keys in hp_group_keys:
    df = fill_nan_mean(df, 'hp', group_keys)

print(df['hp'].isna().sum())

In [None]:
# The parsed displacement is still text. Convert it element-wise with .map():
# passing an element-wise callable to Series.agg is deprecated in recent pandas.
df['litr'] = df['litr'].map(to_numeric)

# Fallback chain: each pass fills the litr values that are still missing with
# the mean of a different grouping; earlier passes take precedence.
# NOTE(review): the two groupings keyed on 'price' use the prediction target
# to impute a feature (target leakage); confirm this is intended.
litr_group_keys = [
    ['motor', 'Cylinder', 'hp'],
    ['hp', 'fuel_type'],
    ['model'],
    ['engine'],
    ['price', 'fuel_type'],
    ['price', 'transmission'],
]
# fill_nan_mean builds its result from a merge and returns a new frame, so no
# defensive .copy() is needed on the way in.
for group_keys in litr_group_keys:
    df = fill_nan_mean(df, 'litr', group_keys)

print(df['litr'].isna().sum())

In [None]:
# The parsed cylinder count is still text. Convert it element-wise with .map():
# passing an element-wise callable to Series.agg is deprecated in recent pandas.
df['Cylinder'] = df['Cylinder'].map(to_numeric)

# Fallback chain: each pass fills the Cylinder values that are still missing
# with the mean of a different grouping; earlier passes take precedence.
# Because means are used, filled cylinder counts may be fractional.
# NOTE(review): the two groupings keyed on 'price' use the prediction target
# to impute a feature (target leakage); confirm this is intended.
cylinder_group_keys = [
    ['litr', 'hp'],
    ['litr', 'model'],
    ['hp', 'model'],
    ['model'],
    ['hp', 'fuel_type'],
    ['price', 'fuel_type'],
    ['price', 'transmission'],
    ['litr', 'fuel_type'],
    ['engine'],
]
# fill_nan_mean builds its result from a merge and returns a new frame, so no
# defensive .copy() is needed on the way in.
for group_keys in cylinder_group_keys:
    df = fill_nan_mean(df, 'Cylinder', group_keys)

print(df['Cylinder'].isna().sum())

In [None]:
# List the current column names.
df.columns

In [None]:
# Show one row to check the frame after the numeric imputation.
df.head(1)

In [None]:
# Fallback chain: each pass fills the motor values that are still missing with
# the most frequent value of a different grouping; earlier passes take
# precedence. The original listed ['brand', 'price'] twice in a row; the
# repeat could not fill anything new, so it is dropped here.
# NOTE(review): the groupings keyed on 'price' use the prediction target to
# impute a feature (target leakage); confirm this is intended.
motor_group_keys = [
    ['litr', 'hp', 'Cylinder', 'model'],
    ['litr', 'hp', 'Cylinder'],
    ['litr', 'hp'],
    ['model'],
    ['engine'],
    ['fuel_type', 'hp'],
    ['litr', 'Cylinder'],
    ['Cylinder', 'price'],
    ['Cylinder', 'brand'],
    ['litr', 'brand'],
    ['hp', 'brand'],
    ['hp', 'fuel'],
    ['brand', 'price'],
    ['brand', 'fuel_type'],
]
# fill_nan_mode builds its result from a merge and returns a new frame, so no
# defensive .copy() is needed on the way in.
for group_keys in motor_group_keys:
    df = fill_nan_mode(df, 'motor', group_keys)

print(df['motor'].isna().sum())

In [None]:
# Fallback chain: each pass fills the fuel values that are still missing with
# the most frequent value of a different grouping; earlier passes take
# precedence. The original listed ['motor', 'hp'] twice in a row; the repeat
# could not fill anything new, so it is dropped here.
fuel_group_keys = [
    ['litr', 'hp', 'Cylinder', 'model'],
    ['litr', 'hp', 'model'],
    ['hp', 'model'],
    ['model'],
    ['motor', 'hp'],
    ['motor', 'Cylinder'],
    ['motor', 'litr'],
    ['motor'],
]
# fill_nan_mode builds its result from a merge and returns a new frame, so no
# defensive .copy() is needed on the way in.
for group_keys in fuel_group_keys:
    df = fill_nan_mode(df, 'fuel', group_keys)

print(df['fuel'].isna().sum())

In [None]:
# Show one row to check the frame after the categorical imputation.
df.head(1)

In [None]:
# Re-count missing values in the parsed engine columns after imputation.
df[new_columns].isna().sum()

In [None]:
# List the columns that are still object dtype; these are encoded in the next cell.
df.select_dtypes(include='object').columns

In [None]:
# Add a `<column>_rank` feature for every object-dtype column, ranking its
# categories by mean price.
# NOTE(review): the ranks come from the mean of 'price' over the whole frame,
# before any train/test split, so the encoded features carry target information.
text_columns = list(df.select_dtypes(include='object').columns)
for text_column in text_columns:
    df = ordinal_encoder(df.copy(), text_column, 'price')

In [None]:
# Show one row to check the new *_rank columns.
df.head(1)

### Birinchi model

In [None]:
# Feature set shared by all models below: numeric columns plus the
# price-ranked encodings of the categorical columns.
columns = ['model_year',
           'milage',
           'hp',
           'litr',
           'Cylinder',
           'brand_rank',
           'model_rank',
           'fuel_type_rank',
           'engine_rank',
           'transmission_rank',
           'ext_col_rank',
           'int_col_rank',
           'accident_rank',
           'motor_rank',
           'fuel_rank'
           ]

X = df[columns]
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.37, random_state=125)

model = LinearRegression()

# 5-fold CV on the training split. The scorer returns negated MSE, so flip the
# sign, average over folds, then take the root.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# np.sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in recent scikit-learn releases, and the
# later cells of this notebook already compute RMSE this way.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

### PolynomialFeatures

In [None]:
# Expand the features with all degree-2 terms (squares and pairwise products).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Same split parameters as the baseline cell.
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.37, random_state=125)

model = LinearRegression()

# 5-fold CV on the training split. The scorer returns negated MSE, so flip the
# sign, average over folds, then take the root.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# np.sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in recent scikit-learn releases, and the
# later cells of this notebook already compute RMSE this way.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

#### Outlierlarni tashlab ko'ramiz

In [None]:
# Keep only rows priced at or below the 95th percentile of price (drops the most expensive 5%).
data = df[df['price'] <= df['price'].quantile(0.95)]

In [None]:
# Rebuild the feature matrix and target from the outlier-trimmed frame.
X = data[columns]
y = data['price']

In [None]:
# Expand the trimmed features with all degree-2 terms (squares and pairwise products).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Same split parameters as the earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.37, random_state=125)

model = LinearRegression()

# 5-fold CV on the training split. The scorer returns negated MSE, so flip the
# sign, average over folds, then take the root.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# np.sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in recent scikit-learn releases, and the
# later cells of this notebook already compute RMSE this way.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

In [None]:
# Compare four linear regressors on the current X / y.
# All errors below are in-sample: each model is scored on the data it was fit on.
linear_model = LinearRegression()
# TheilSen and RANSAC draw random subsamples; seed them so a re-run of the
# notebook reproduces the same numbers.
tail = TheilSenRegressor(random_state=42)
huber = HuberRegressor()
ransac = RANSACRegressor(random_state=42)

# (label used on the median line, label used on the MAE/RMSE lines, estimator),
# listed in report order. The printed labels are kept exactly as before.
report_rows = [
    ('Linear', 'Linear', linear_model),
    ('RANSAC', 'RANSAC', ransac),
    ('Tail', 'Theil', tail),
    ('Huber', 'Huber', huber),
]

for position, (median_label, label, estimator) in enumerate(report_rows):
    estimator.fit(X, y)
    predictions = estimator.predict(X)  # predict once, reuse for all three metrics

    if position:
        print()  # blank line between model reports
    print(f'Median Error {median_label}:', median_absolute_error(y, predictions))
    print(f'{label} MAE:', mean_absolute_error(y, predictions))
    print(f'{label} RMSE:', np.sqrt(mean_squared_error(y, predictions)))

#### PolynomialFeatures

In [None]:
# Compare four linear regressors on the degree-2 polynomial features.
# All errors below are in-sample: each model is scored on the data it was fit on.
linear_model = LinearRegression()
# TheilSen and RANSAC draw random subsamples; seed them so a re-run of the
# notebook reproduces the same numbers.
tail = TheilSenRegressor(random_state=42)
huber = HuberRegressor()
ransac = RANSACRegressor(random_state=42)

# (label used on the median line, label used on the MAE/RMSE lines, estimator),
# listed in report order. The printed labels are kept exactly as before.
report_rows = [
    ('Linear', 'Linear', linear_model),
    ('RANSAC', 'RANSAC', ransac),
    ('Tail', 'Theil', tail),
    ('Huber', 'Huber', huber),
]

for position, (median_label, label, estimator) in enumerate(report_rows):
    estimator.fit(X_poly, y)
    predictions = estimator.predict(X_poly)  # predict once, reuse for all three metrics

    if position:
        print()  # blank line between model reports
    print(f'Median Error {median_label}:', median_absolute_error(y, predictions))
    print(f'{label} MAE:', mean_absolute_error(y, predictions))
    print(f'{label} RMSE:', np.sqrt(mean_squared_error(y, predictions)))

#### RFECV orqali eng yaxshi xususiyatlarni (feature) aniqlab olamiz


In [None]:
# Recursive feature elimination with cross-validation over the degree-2
# polynomial features of the outlier-trimmed data.
X = data[columns]
y = data['price']
poly = PolynomialFeatures(degree=2, include_bias=False)
# Keep X's index on the expanded frame. `data` is a filtered slice of df, so
# without index=X.index the features would get a fresh 0..n-1 index while y
# keeps the filtered one, and any index-aligned operation between them would
# silently misalign rows.
X_poly = pd.DataFrame(
    poly.fit_transform(X),
    columns=poly.get_feature_names_out(X.columns),
    index=X.index,
)

cv = KFold(n_splits=6, shuffle=True, random_state=42)

linear_model = LinearRegression()

# Drop one feature per step; the feature count is chosen by cross-validated
# (negated) MSE. verbose=2 prints progress for every elimination step.
rfecv = RFECV(estimator=linear_model, step=1, cv=cv, scoring='neg_mean_squared_error', verbose=2)

rfecv.fit(X_poly, y)
optimal_num_features = rfecv.n_features_
# Names of the retained polynomial terms; reused by the following cells.
selected_features = X_poly.columns[rfecv.support_]

print("Optimal number of features:", optimal_num_features)
print("Selected features:", selected_features.tolist())

#### 4 ta modelni 3 xil metrika bilan sinab ko'ramiz

In [None]:
# Rebuild the polynomial features of the trimmed data and keep only the terms
# chosen by RFECV. X enters this cell as data[columns]; index=X.index keeps
# the feature rows labelled like y.
X_poly = pd.DataFrame(
    poly.fit_transform(X),
    columns=poly.get_feature_names_out(X.columns),
    index=X.index,
)
X = X_poly[selected_features]
y = data['price']

# Compare four linear regressors on the selected features.
# All errors below are in-sample: each model is scored on the data it was fit on.
linear_model = LinearRegression()
# TheilSen and RANSAC draw random subsamples; seed them so a re-run of the
# notebook reproduces the same numbers.
tail = TheilSenRegressor(random_state=42)
huber = HuberRegressor()
ransac = RANSACRegressor(random_state=42)

# (label used on the median line, label used on the MAE/RMSE lines, estimator),
# listed in report order. The printed labels are kept exactly as before.
report_rows = [
    ('Linear', 'Linear', linear_model),
    ('RANSAC', 'RANSAC', ransac),
    ('Tail', 'Theil', tail),
    ('Huber', 'Huber', huber),
]

for position, (median_label, label, estimator) in enumerate(report_rows):
    estimator.fit(X, y)
    predictions = estimator.predict(X)  # predict once, reuse for all three metrics

    if position:
        print()  # blank line between model reports
    print(f'Median Error {median_label}:', median_absolute_error(y, predictions))
    print(f'{label} MAE:', mean_absolute_error(y, predictions))
    print(f'{label} RMSE:', np.sqrt(mean_squared_error(y, predictions)))

#### Outlierlar bilan birga

In [None]:
# Same comparison on the full frame (outliers included), restricted to the
# polynomial terms that RFECV selected on the trimmed data.
X = df[columns]
y = df['price']
poly = PolynomialFeatures(degree=2, include_bias=False)
# index=X.index keeps the feature rows labelled like y.
X_poly = pd.DataFrame(
    poly.fit_transform(X),
    columns=poly.get_feature_names_out(X.columns),
    index=X.index,
)
X = X_poly[selected_features]

# All errors below are in-sample: each model is scored on the data it was fit on.
linear_model = LinearRegression()
# TheilSen and RANSAC draw random subsamples; seed them so a re-run of the
# notebook reproduces the same numbers.
tail = TheilSenRegressor(random_state=42)
huber = HuberRegressor()
ransac = RANSACRegressor(random_state=42)

# (label used on the median line, label used on the MAE/RMSE lines, estimator),
# listed in report order. The printed labels are kept exactly as before.
report_rows = [
    ('Linear', 'Linear', linear_model),
    ('RANSAC', 'RANSAC', ransac),
    ('Tail', 'Theil', tail),
    ('Huber', 'Huber', huber),
]

for position, (median_label, label, estimator) in enumerate(report_rows):
    estimator.fit(X, y)
    predictions = estimator.predict(X)  # predict once, reuse for all three metrics

    if position:
        print()  # blank line between model reports
    print(f'Median Error {median_label}:', median_absolute_error(y, predictions))
    print(f'{label} MAE:', mean_absolute_error(y, predictions))
    print(f'{label} RMSE:', np.sqrt(mean_squared_error(y, predictions)))

In [None]:
# Hold-out evaluation of a plain linear model on the current X / y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Absolute-scale errors.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
medae = median_absolute_error(y_test, y_pred)

# R² and its adjustment for the number of predictors, using the test-set size.
r2 = r2_score(y_test, y_pred)
n, p = X_test.shape[0], X_test.shape[1]
adjusted_r2 = (1 - (1 - r2) * (n - 1) / (n - p - 1))

# Per-row relative errors, computed once and summarised by mean and median.
# Both divide by values derived from y_test, so a zero price would give inf/NaN.
relative_error = np.abs((y_test - y_pred) / y_test)
symmetric_error = 2 * np.abs(y_test - y_pred) / (np.abs(y_test) + np.abs(y_pred))

mape = (np.mean(relative_error) * 100)
smape = (np.mean(symmetric_error) * 100)
medape = (np.median(relative_error) * 100)
smedape = (np.median(symmetric_error) * 100)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'Median Absolute Error: {medae}')
print(f'R²: {r2}')
print(f'Adjusted R²: {adjusted_r2}')
print(f'MAPE: {mape}')
print(f'sMAPE: {smape}')
print(f'MedAPE: {medape}')
print(f'sMedAPE: {smedape}')
