# Практическая работа №2
## по предмету "Системы искусственного интеллекта"

Целью практической работы является изучение методов регрессии.

В данной работе вам необходимо:
1. используя библиотеку sklearn, обучить линейную регрессию без использования регуляризации;
2. изучить работу класса Lasso для регуляризации, подобрать наилучший параметр альфа для данного набора данных;
3. изучить работу класса Ridge для регуляризации, подобрать наилучший параметр альфа для данного набора данных.

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the dataset from a CSV file (path is relative to the current working
# directory) and display it as the cell output.
csv_path = 'AISP2.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,ASUS,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,34649,2 stars,3,0
1,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,38999,3 stars,65,5
2,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,39999,3 stars,8,1
3,ASUS,Intel,Core i5,10th,8 GB,DDR4,512 GB,0 GB,Windows,32-bit,2 GB,Casual,No warranty,No,No,69990,3 stars,0,0
4,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,26990,3 stars,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
818,ASUS,AMD,Ryzen 9,Not Available,4 GB,DDR4,1024 GB,0 GB,Windows,64-bit,0 GB,Casual,1 year,No,No,135990,3 stars,0,0
819,ASUS,AMD,Ryzen 9,Not Available,4 GB,DDR4,1024 GB,0 GB,Windows,64-bit,0 GB,Casual,1 year,No,No,144990,3 stars,0,0
820,ASUS,AMD,Ryzen 9,Not Available,4 GB,DDR4,1024 GB,0 GB,Windows,64-bit,4 GB,Casual,1 year,No,No,149990,3 stars,0,0
821,ASUS,AMD,Ryzen 9,Not Available,4 GB,DDR4,1024 GB,0 GB,Windows,64-bit,4 GB,Casual,1 year,No,No,142990,3 stars,0,0


In [5]:
# Translate the textual star rating into an integer score.
# Labels that are not listed in the mapping end up as NaN.
rating_map = {
    '1 star': 1,
    '2 stars': 2,
    '3 stars': 3,
    '4 stars': 4,
    '5 stars': 5,
}
rating_labels = df['rating']
df['rating_num'] = rating_labels.map(rating_map)

In [6]:
def clean_num(col, frame=None):
    """Return a column's values reduced to their first run of digits.

    Every value is converted to text, the first contiguous group of digits
    is extracted (``'1024 GB'`` -> ``1024``) and the result is converted to
    a numeric Series.

    Args:
        col: Name of the column to parse.
        frame: DataFrame that holds the column. When omitted, the
            notebook-level ``df`` is used, so ``clean_num(col)`` keeps
            working exactly as before.

    Returns:
        A numeric Series aligned with the frame's index. Values that
        contain no digits become NaN.
    """
    source = df if frame is None else frame
    as_text = source[col].astype(str)
    # Raw string: "\d" inside a normal literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    digits = as_text.str.extract(r'(\d+)')[0]
    return pd.to_numeric(digits, errors='coerce')

In [10]:
# Parse the numeric part of the count and capacity columns into new columns.
# Keys are the new column names, values are the raw columns they come from.
numeric_sources = {
    'NumRatings': 'Number of Ratings',
    'NumReviews': 'Number of Reviews',
    'ram_num': 'ram_gb',
    'ssd_num': 'ssd',
    'hdd_num': 'hdd',
    'gpu_num': 'graphic_card_gb',
}
for new_col, raw_col in numeric_sources.items():
    df[new_col] = clean_num(raw_col)

# One-hot encode the categorical columns. Only the main ones are used to
# keep things simple; drop_first=True drops the first level of each column.
cats = ['brand', 'os', 'warranty', 'msoffice']
df_cats = pd.get_dummies(df[cats], drop_first=True)

# Put the numeric columns (including the target, Price) and the dummy
# columns side by side in a single frame used for the correlation analysis.
nums = [
    'rating_num',
    'NumRatings',
    'NumReviews',
    'ram_num',
    'ssd_num',
    'hdd_num',
    'gpu_num',
    'Price',
]
all_df = pd.concat([df[nums], df_cats], axis=1)

In [11]:
# Correlation of every column of all_df with Price (pandas' default method),
# displayed as the cell output.
price_corr = all_df.corr()['Price']
price_corr

rating_num             -0.033528
NumRatings             -0.140392
NumReviews             -0.148738
ram_num                 0.518323
ssd_num                 0.628272
hdd_num                -0.252699
gpu_num                 0.459986
Price                   1.000000
brand_ASUS              0.032036
brand_Avita            -0.033819
brand_DELL             -0.166272
brand_HP               -0.030649
brand_Lenovo           -0.039079
brand_MSI               0.123952
brand_acer             -0.024663
os_Mac                  0.312112
os_Windows             -0.337929
warranty_2 years       -0.029339
warranty_3 years        0.080610
warranty_No warranty   -0.045241
msoffice_Yes           -0.105752
Name: Price, dtype: float64

In [32]:
# Feature matrix: the four parsed hardware columns; target vector: Price.
feature_cols = ['ram_num', 'ssd_num', 'hdd_num', 'gpu_num']
X = df[feature_cols].to_numpy()
y = df['Price'].to_numpy()

In [19]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable between runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [20]:
# Ordinary least-squares linear regression without regularization,
# fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [21]:
# Predict on the held-out rows and score the predictions with R^2 and MSE.
y_pred = model.predict(X_test)
r2, mse = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_squared_error)
)

In [22]:
# Report the fitted linear model and its quality on the test split.
linear_report = (
    ("Coefficients:", model.coef_),    # feature weights
    ("Intercept:", model.intercept_),  # bias term
    ("R^2 (test):", r2),               # coefficient of determination
    ("MSE (test):", mse),              # mean squared error
)
for label, value in linear_report:
    print(label, value)

Coefficients: [2790.06706951   78.35877678   14.37317011 4453.21950095]
Intercept: 7938.270636975518
R^2 (test): 0.3954502558586592
MSE (test): 1178150733.98022


In [25]:
# Standardize the features, then fit a Lasso model whose regularization
# strength alpha is selected by cross-validation.
lasso_cv = make_pipeline(
    StandardScaler(),
    LassoCV(
        # `alphas` is intentionally left at its default so that LassoCV builds
        # the alpha grid automatically. Passing alphas=None explicitly is
        # deprecated in recent scikit-learn releases and emits a FutureWarning;
        # omitting it selects the same automatic grid.
        cv=5,             # number of cross-validation folds
        # NOTE: random_state only has an effect when selection='random';
        # it is kept so results stay reproducible if that option is enabled.
        random_state=42,
        n_jobs=-1,        # use all available CPU cores for the CV fits
    ),
)
lasso_cv.fit(X_train, y_train)

In [29]:
# Pull the fitted LassoCV step out of the pipeline and report what it chose.
# NOTE: this rebinds `model`, which earlier in the notebook held the
# LinearRegression estimator.
model = lasso_cv.named_steps['lassocv']
lasso_report = (
    ("Best alpha:", model.alpha_),
    ("Coefficients:", model.coef_),
    ("Intercept:", model.intercept_),
)
for label, value in lasso_report:
    print(label, value)

Best alpha: 29.820794976049314
Coefficients: [12348.19356727 25071.38703258  5942.62389223  9250.93656051]
Intercept: 76531.63829787234


In [30]:
# Score the Lasso pipeline (scaler + model) on the held-out test split.
y_pred = lasso_cv.predict(X_test)
for label, metric in (
    ("R^2 (test):", r2_score),
    ("MSE (test):", mean_squared_error),
):
    print(label, metric(y_test, y_pred))

R^2 (test): 0.3956846089449232
MSE (test): 1177694025.060453


In [34]:
# Candidate regularization strengths: 50 values spaced evenly on a log
# scale between 1e-3 and 1e3.
alphas = np.logspace(start=-3, stop=3, num=50)

# Standardize the features, then let RidgeCV choose alpha from the grid
# with 5-fold cross-validation.
ridge_search = RidgeCV(alphas=alphas, cv=5)
pipe = make_pipeline(StandardScaler(), ridge_search)
pipe.fit(X_train, y_train)

In [35]:
# Pull the fitted RidgeCV step out of the pipeline and report what it chose.
ridge = pipe.named_steps['ridgecv']
ridge_report = (
    ("Best alpha:", ridge.alpha_),
    ("Coefficients:", ridge.coef_),
    ("Intercept:", ridge.intercept_),
)
for label, value in ridge_report:
    print(label, value)

Best alpha: 33.9322177189533
Coefficients: [12231.31902307 23071.79470927  4550.18852613  9429.42339188]
Intercept: 76531.63829787234


In [36]:
# Score the Ridge pipeline (scaler + model) on the held-out test split.
y_pred = pipe.predict(X_test)
for label, metric in (
    ("R^2 (test):", r2_score),
    ("MSE (test):", mean_squared_error),
):
    print(label, metric(y_test, y_pred))

R^2 (test): 0.4026323576705425
MSE (test): 1164154204.1608102
