<a href="https://colab.research.google.com/github/UznetDev/Aiogram-Bot-Template/blob/main/31_Iyul%2C_2024_home_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings('ignore')

In [13]:
df = pd.read_csv('car_prices.csv')
df.shape

(54273, 13)

#### outlierlarni uchiramiz chunki outlayerlar sababliy RMSE juda baland

In [14]:
df = df[df['price'] <= df['price'].quantile(0.95)]
df.shape

(51559, 13)

In [15]:
df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [16]:
X = df[['model_year', 'milage']]
y = df['price']

#### Oddiy Cros validatsiya qilib kuramiz

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

model = LinearRegression()

cros_rmse = np.sqrt(-cross_val_score(model,
                                     X_train,
                                     y_train,
                                     cv=5,
                                     scoring='neg_mean_squared_error'))
model.fit(X_train,
          y_train)

y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cros_rmse.mean()}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

CV RMSE: 14436.888821884364
RMSE: 14512.117053902663
MAE: 10950.126928640908
R2: 0.4694463228907306


### Eng yaxshi modelni aniqlab olamiz

In [18]:
X = df[['model_year', 'milage']]
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


models = {
        'ElasticNet': ElasticNet(),
        'Ridge': Ridge(),
        'Lasso': Lasso()
    }

best_model = None
best_rmse = float('inf')
best_model_name = ""


for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"{name} modelining RMSE: {rmse}")

    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_model_name = name

y_pred = best_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cros_rmse.mean()}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')
print(f"Eng yaxshi model: {best_model_name}")
print(f"Eng past RMSE: {best_rmse}")

ElasticNet modelining RMSE: 14513.08891331319
Ridge modelining RMSE: 14512.117065680102
Lasso modelining RMSE: 14512.116956268605
CV RMSE: 14436.888821884364
RMSE: 14512.116956268605
MAE: 10950.127746806953
R2: 0.46944633002960734
Eng yaxshi model: Lasso
Eng past RMSE: 14512.116956268605


#### Eng yaxshi modelimizni yaniy Lasso modelimizni RandomizedSearchCV orqaliy parametirlarini aniqlab olamiz!

In [19]:
X = df[['model_year', 'milage']]
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

lasso_model = Lasso()

lasso_params = {
    'alpha': np.linspace(-4, 0, 1000),
    'max_iter': [1000, 5000, 10000]
}

random_model = RandomizedSearchCV(lasso_model,
                                  lasso_params,
                                  cv=5,
                                  scoring='neg_mean_squared_error',
                                  n_iter=1000,
                                  random_state=42)

random_model.fit(X_train, y_train)

print(random_model.best_params_)
print(np.sqrt(-random_model.best_score_))

{'max_iter': 10000, 'alpha': 0.0}
14469.04796913062


#### Lasso modelimizni GridSearchCV orqaliy snab kuramiz

In [20]:
X = df[['model_year', 'milage']]
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

lasso_model = Lasso()

lasso_params = {
    'alpha': np.linspace(-4, 0, 1000),
    'max_iter': [1000, 5000, 10000]
}

random_model = GridSearchCV(lasso_model,
                                  lasso_params,
                                  cv=5,
                                  scoring='neg_mean_squared_error',
                                  # n_iter=1000,
                                  # random_state=42
                            )

random_model.fit(X_train, y_train)

print(random_model.best_params_)
print(np.sqrt(-random_model.best_score_))

{'alpha': 0.0, 'max_iter': 10000}
14469.04796913062


#### ElasticNet va Ridge uchunham modellar yozib kuramiz!

In [21]:

X = df[['model_year', 'milage']]
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

elastic_model = ElasticNet(random_state=42)

elasticnet_params = {
    'alpha': np.linspace(0.1, 1000, 1000),
    'l1_ratio': np.linspace(0.1, 1.0, 10),
}

random_model = RandomizedSearchCV(elastic_model,
                                  elasticnet_params,
                                  cv=5,
                                  scoring='neg_mean_squared_error',
                                  n_iter=1000,
                                  random_state=42
                                  )

random_model.fit(X_train, y_train)

print(random_model.best_params_)
print(np.sqrt(-random_model.best_score_))

{'l1_ratio': 0.9, 'alpha': 0.1}
14456.64756172304


In [22]:
X = df[['model_year', 'milage']]
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)


ridge = Ridge()

ridge_params = {
    'alpha': [0.1, 1.0, 10, 100],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
}

random_grid = RandomizedSearchCV(ridge,
                                 ridge_params,
                                 cv=5,
                                 scoring='neg_mean_squared_error',
                                 n_iter=1000,
                                 random_state=42
                                 )
random_grid.fit(X_train, y_train)
print(random_grid.best_params_)
print(np.sqrt(-random_grid.best_score_))

y_pred = random_grid.predict(X_test)

np.sqrt(mean_squared_error(y_test, y_pred))

{'solver': 'svd', 'alpha': 100}
14419.168758820202


14633.414441967103

#### Bizda boshqacha holat Ridge da yaxshiroq RMSE topilmoqda

#### 0 dan 0.1 orasidagi sonlar uchun snab kuramiz

In [27]:
X = df[['model_year', 'milage']]
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)


ridge = Ridge(solver='svd')

ridge_params = {
    'alpha': np.random.uniform(0, 0.1, 1000),
    # 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
}

random_grid = GridSearchCV(ridge,
                           ridge_params,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           )
random_grid.fit(X_train, y_train)
print(random_grid.best_params_)

print(np.sqrt(-random_grid.best_score_))

y_pred = random_grid.predict(X_test)

np.sqrt(mean_squared_error(y_test, y_pred))

{'alpha': 0.09996689724636731}
14419.16883293316


14633.406136203459

#### Unchalik katta farq kuzatilmadi lekin farq bor

### PolynomialFeatures ni sanb kuramiz

In [28]:
X = df[['model_year', 'milage']]
poly = PolynomialFeatures(degree=2)
X = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)


ridge = Ridge(solver='svd')

ridge_params = {
    'alpha': np.random.uniform(0, 0.1, 1000),
    # 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
}

random_grid = GridSearchCV(ridge,
                           ridge_params,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           )
random_grid.fit(X_train, y_train)
print(random_grid.best_params_)
print(np.sqrt(-random_grid.best_score_))

y_pred = random_grid.predict(X_test)

np.sqrt(mean_squared_error(y_test, y_pred))

{'alpha': 0.005829271457262875}
13551.164966405197


13794.828167771802

#### Bizda PolynomialFeatures ancha foydaliy ekan