In [5]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

In [8]:
data= fetch_california_housing()
X=data.data
y=data.target
feature_names=data.feature_names

In [9]:
print("شکل داده‌ها:", X.shape)
print("نام ویژگی‌ها:", feature_names)
print("نمونه‌ای از y:", y[:5])

شکل داده‌ها: (20640, 8)
نام ویژگی‌ها: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
نمونه‌ای از y: [4.526 3.585 3.521 3.413 3.422]


In [10]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
print("تعداد نمونه‌های train:", len(X_train), "تعداد نمونه‌های test:", len(X_test))

تعداد نمونه‌های train: 16512 تعداد نمونه‌های test: 4128


In [11]:
scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [13]:
lr=LinearRegression()
lr.fit(X_train_scaled,y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [15]:
train_r2_lr=r2_score(y_train,lr.predict(X_train_scaled))
test_r2_lr=r2_score(y_test,lr.predict(X_test_scaled))


In [16]:
print(f"R² Train (LR): {train_r2_lr:.4f}")
print(f"R² Test (LR): {test_r2_lr:.4f}")
print(f"اختلاف (شاخص overfitting): {train_r2_lr - test_r2_lr:.4f}")

R² Train (LR): 0.6126
R² Test (LR): 0.5758
اختلاف (شاخص overfitting): 0.0368


In [17]:
coefficients_lr = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': lr.coef_
}).sort_values(by='Coefficient', ascending=False)
print("ضرایب Linear Regression (مرتب‌شده نزولی):")
print(coefficients_lr)
print(f"ویژگی با بیشترین تأثیر مثبت: {coefficients_lr.iloc[0]['Feature']}")

ضرایب Linear Regression (مرتب‌شده نزولی):
      Feature  Coefficient
0      MedInc     0.854383
3   AveBedrms     0.339259
1    HouseAge     0.122546
4  Population    -0.002308
5    AveOccup    -0.040829
2    AveRooms    -0.294410
7   Longitude    -0.869842
6    Latitude    -0.896929
ویژگی با بیشترین تأثیر مثبت: MedInc


## Elastic Net

In [18]:
en = ElasticNet(alpha=0.1, l1_ratio=0.5)
en.fit(X_train_scaled, y_train)

0,1,2
,"alpha  alpha: float, default=1.0 Constant that multiplies the penalty terms. Defaults to 1.0. See the notes for the exact mathematical meaning of this parameter. ``alpha = 0`` is equivalent to an ordinary least square, solved by the :class:`LinearRegression` object. For numerical reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. Given this, you should use the :class:`LinearRegression` object.",0.1
,"l1_ratio  l1_ratio: float, default=0.5 The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2.",0.5
,"fit_intercept  fit_intercept: bool, default=True Whether the intercept should be estimated or not. If ``False``, the data is assumed to be already centered.",True
,"precompute  precompute: bool or array-like of shape (n_features, n_features), default=False Whether to use a precomputed Gram matrix to speed up calculations. The Gram matrix can also be passed as argument. For sparse input this option is always ``False`` to preserve sparsity. Check :ref:`an example on how to use a precomputed Gram Matrix in ElasticNet ` for details.",False
,"max_iter  max_iter: int, default=1000 The maximum number of iterations.",1000
,"copy_X  copy_X: bool, default=True If ``True``, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-4 The tolerance for the optimization: if the updates are smaller or equal to ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller or equal to ``tol``, see Notes below.",0.0001
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `.",False
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive.",False
,"random_state  random_state: int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",


In [19]:
train_r2_en = r2_score(y_train, en.predict(X_train_scaled))
test_r2_en = r2_score(y_test, en.predict(X_test_scaled))

In [20]:
print(f"R² Train (EN): {train_r2_en:.4f}")
print(f"R² Test (EN): {test_r2_en:.4f}")
print(f"اختلاف (شاخص overfitting): {train_r2_en - test_r2_en:.4f}")

R² Train (EN): 0.5308
R² Test (EN): 0.5148
اختلاف (شاخص overfitting): 0.0160


In [21]:
comparison = pd.DataFrame({
    'Feature': feature_names,
    'LR_Coeff': lr.coef_,
    'EN_Coeff': en.coef_,
    'Difference': lr.coef_ - en.coef_
})
print("جدول مقایسه ضرایب:")
print(comparison)

جدول مقایسه ضرایب:
      Feature  LR_Coeff  EN_Coeff  Difference
0      MedInc  0.854383  0.713818    0.140565
1    HouseAge  0.122546  0.137288   -0.014742
2    AveRooms -0.294410 -0.000000   -0.294410
3   AveBedrms  0.339259  0.000000    0.339259
4  Population -0.002308  0.000000   -0.002308
5    AveOccup -0.040829 -0.000000   -0.040829
6    Latitude -0.896929 -0.175737   -0.721192
7   Longitude -0.869842 -0.133250   -0.736592


In [22]:
print("\nتحلیل بخش 5:")
print(f"مدل با overfitting کمتر: Elastic Net (اختلاف: {train_r2_en - test_r2_en:.4f} vs {train_r2_lr - test_r2_lr:.4f})")
print("Elastic Net ضرایب را shrink کرده (بسیاری از تفاوت‌ها مثبت هستند، یعنی LR بزرگ‌تر است).")
print(f"بهترین مدل برای generalization: Elastic Net (Test R² بالاتر: {test_r2_en:.4f} vs {test_r2_lr:.4f})")


تحلیل بخش 5:
مدل با overfitting کمتر: Elastic Net (اختلاف: 0.0160 vs 0.0368)
Elastic Net ضرایب را shrink کرده (بسیاری از تفاوت‌ها مثبت هستند، یعنی LR بزرگ‌تر است).
بهترین مدل برای generalization: Elastic Net (Test R² بالاتر: 0.5148 vs 0.5758)


# PCA

In [23]:
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print(f"واریانس توضیح‌داده‌شده توسط PCA: {pca.explained_variance_ratio_.sum():.4f}")

واریانس توضیح‌داده‌شده توسط PCA: 0.9016


In [24]:
lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train)
train_r2_pca = r2_score(y_train, lr_pca.predict(X_train_pca))
test_r2_pca = r2_score(y_test, lr_pca.predict(X_test_pca))

In [25]:
print(f"R² Train (PCA+LR): {train_r2_pca:.4f}")
print(f"R² Test (PCA+LR): {test_r2_pca:.4f}")
print(f"اختلاف (شاخص overfitting): {train_r2_pca - test_r2_pca:.4f}")

R² Train (PCA+LR): 0.4567
R² Test (PCA+LR): 0.4329
اختلاف (شاخص overfitting): 0.0238


In [26]:
print("\nمقایسه Test R²:")
print(f"LR: {test_r2_lr:.4f}")
print(f"EN: {test_r2_en:.4f}")
print(f"PCA+LR: {test_r2_pca:.4f}")


مقایسه Test R²:
LR: 0.5758
EN: 0.5148
PCA+LR: 0.4329
