In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [3]:
# Load the data into a pandas DataFrame
data = pd.read_csv('housing.csv', delimiter=',')

In [4]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
data.isnull()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
20635,False,False,False,False,False,False,False,False,False,False
20636,False,False,False,False,False,False,False,False,False,False
20637,False,False,False,False,False,False,False,False,False,False
20638,False,False,False,False,False,False,False,False,False,False


In [6]:
# Split the data into features (X) and target variable (y)
X = data.drop('median_house_value', axis=1)
y = data['median_house_value']

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Preprocess the data: fill missing values
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Preprocess the data: encode categorical variable and fill missing values
categorical_features = ['ocean_proximity']
numeric_features = list(X_train.drop(categorical_features, axis=1).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', SimpleImputer(strategy='mean'), numeric_features)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Transform the testing data
X_test_preprocessed = preprocessor.transform(X_test)

**Linear Regression**

In [15]:
linear_reg = LinearRegression()
linear_reg.fit(X_train_preprocessed, y_train)
linear_reg_predictions = linear_reg.predict(X_test_preprocessed)
linear_reg_mse = mean_squared_error(y_test, linear_reg_predictions)
# Adjust the print options
np.set_printoptions(threshold=np.inf)
print("Linear Regression Predictions:", linear_reg_predictions)
print("Linear Regression MSE:", linear_reg_mse)

Linear Regression Predictions: [  64693.82761502  134863.71765328  266127.7582078   278640.81028528
  273407.81353177  150244.68267176  301303.80263018  238903.25509132
  267145.1643261   418562.23715091  128286.67795807  188194.40673037
   60211.9269838   157534.38003588  259817.22953877   62257.81153677
  279609.24465125  217176.39265076  248479.35749359  122884.04764381
  276435.79342574  283515.42153978  166797.60735683  324907.45122634
  205451.60014307   84976.28395086  140692.48712364  240624.83848668
  169164.47318798  342767.76543848  176465.9550414   162646.08778456
  211771.39241605  386959.10783056  258554.80099634  139136.27760113
  309261.04687684  192050.4322407   157886.6450694   211257.35978146
  274815.57315967  167010.6217137   129319.72711431  185171.79153196
  133622.39836251  167065.09234488  228818.30071171  206532.69512081
  229772.42827022  171706.672542    265347.9662232   166385.34235903
  226238.23483262   82119.84657068  176477.07935607  191640.16773582
   

**Polynomial Regression**

In [16]:
poly_features = PolynomialFeatures(degree=2)
X_poly = poly_features.fit_transform(X_train_preprocessed)
poly_reg = LinearRegression()
poly_reg.fit(X_poly, y_train)
X_test_poly = poly_features.transform(X_test_preprocessed)
poly_reg_predictions = poly_reg.predict(X_test_poly)
poly_reg_mse = mean_squared_error(y_test, poly_reg_predictions)
# Adjust the print options
np.set_printoptions(threshold=np.inf)
print("Polynomial Regression Predictions:", poly_reg_predictions)
print("Polynomial Regression MSE:", poly_reg_mse)

Polynomial Regression Predictions: [ 102294.17247356  113315.72342479  334330.61147626  283279.22105712
  313453.15024448  120278.81396534  344284.74768152  278961.62877739
  292995.75383159  426904.44481028  132727.06219049  177681.61700574
  -85855.70582234   79462.4538642   244782.02408042 -154564.34408692
  288514.54081047  226055.69593338  268857.09381574  141568.24346229
  309191.1624737   327860.92357909  185809.09166393  318050.70214393
  174061.49062993  112462.81338435   63163.44653696  190814.02336568
  150490.97828201  381270.55364247  187266.68455585  212290.57627416
  211333.4789108   405098.10832433  294970.23263914  -76297.34039865
  386611.10025235  185037.51133622  125856.23262439  273318.78174404
  238721.40828452  170480.46411751  155166.13193388  132643.25199608
  120374.06461314  119050.16320402  239669.13118431  173844.36397885
  274994.24802574  111810.50819196  260410.3046382   143927.91726726
  272702.47355333  110016.24829452  170372.50632427  193080.74050687

**Ridge Regression**

In [17]:
ridge_reg = Ridge(alpha=0.5)
ridge_reg.fit(X_train_preprocessed, y_train)
ridge_reg_predictions = ridge_reg.predict(X_test_preprocessed)
ridge_reg_mse = mean_squared_error(y_test, ridge_reg_predictions)
# Adjust the print options
np.set_printoptions(threshold=np.inf)
print("Ridge Regression Predictions:", ridge_reg_predictions)
print("Ridge Regression MSE:", ridge_reg_mse)

Ridge Regression Predictions: [  64710.78474382  134882.26828165  266106.11285865  278641.07225076
  273421.99928564  150230.23457496  301327.28031349  238924.9282337
  267141.6917926   418573.59413405  128313.59255533  188187.47590313
   60167.36192057  157518.54614831  259812.22216313   62194.47143555
  279642.25522875  217216.2784863   248493.48867611  122912.69503057
  276403.56678143  283521.98329215  166788.03648027  324888.60060303
  205439.44325822   84998.09949625  140663.23305195  240639.58358269
  169173.81076351  342776.29250228  176493.28727407  162675.75018889
  211779.26102001  386987.37067906  258567.41379668  139056.06583625
  309286.86636274  192058.06841239  157906.3717048   211285.20179043
  274790.67357661  167032.04150389  129353.63798984  185136.25768476
  133627.88882307  167049.80796446  228831.48955504  206535.83624747
  229789.16513423  171668.93200036  265353.82742523  166366.17033084
  226263.85326649   82144.69777442  176485.09289073  191638.96408507
   58

**Lasso Regression**

In [18]:
lasso_reg = Lasso(alpha=0.5)
lasso_reg.fit(X_train_preprocessed, y_train)
lasso_reg_predictions = lasso_reg.predict(X_test_preprocessed)
lasso_reg_mse = mean_squared_error(y_test, lasso_reg_predictions)
# Adjust the print options
np.set_printoptions(threshold=np.inf)
print("Lasso Regression Predictions:", lasso_reg_predictions)
print("Lasso Regression MSE:", lasso_reg_mse)

Lasso Regression Predictions: [  64697.7114996   134869.67703306  266121.46715086  278641.36087071
  273407.35147073  150243.39860695  301308.59401198  238907.68476213
  267138.66169112  418565.88704522  128292.72736854  188194.3608545
   60204.03466834  157526.43691226  259816.88783133   62246.72690936
  279609.60369381  217185.51049857  248476.38529529  122891.2407104
  276427.55489255  283518.79983126  166796.97128861  324901.68125096
  205449.51957322   84981.51575307  140688.8403413   240630.11346549
  169169.37235051  342764.70267674  176465.09359682  162652.28150278
  211766.22873277  386958.42449329  258554.19462261  139123.35532807
  309262.19188335  192052.02426972  157892.71632999  211262.89188744
  274812.13460378  167015.61894048  129327.30159242  185167.03389132
  133624.16156541  167063.48566257  228814.97109961  206527.82014796
  229770.92001688  171697.09717442  265351.12758741  166383.34712674
  226242.52449116   82125.78024491  176479.28104581  191641.11724923
   589

  model = cd_fast.enet_coordinate_descent(


**Decision Tree Regression**

In [19]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train_preprocessed, y_train)
dt_reg_predictions = dt_reg.predict(X_test_preprocessed)
dt_reg_mse = mean_squared_error(y_test, dt_reg_predictions)
# Adjust the print options
np.set_printoptions(threshold=np.inf)
print("Decision Tree Regression Predictions:", dt_reg_predictions)
print("Decision Tree Regression MSE:", dt_reg_mse)

Decision Tree Regression Predictions: [ 67800.  26600. 500001. 280800. 319800. 147600. 296700. 162500. 192200.
 500001.  87500. 180400. 184300. 226900. 218300. 167400. 184000. 163200.
 204400.  92900. 356000. 500001. 165500. 450000. 178300.  58900. 142500.
 111800. 162800. 402900.  87100. 179200. 143800. 500001. 296600. 220100.
 387100. 157100. 149500. 254200. 417800.  90400. 113500. 153600. 147500.
 230800. 126500. 154200. 257100.  92800. 199200. 161500. 138100.  88500.
 156000. 163100.  86800. 215400. 457100. 195600. 495500. 290800. 111300.
  84200. 281500. 215500.  95600. 134800. 500000. 109400. 113400. 188300.
  81400. 262500. 180100. 156500. 450000. 120400. 175000. 235700.  58600.
 450000. 170800. 187500. 483700. 278000.  78800.  48300. 187500. 192300.
 111600. 261400.  86500.  49100.  96400. 118100. 488900. 156300. 102700.
 172500. 142100. 465600. 442700. 329300. 169800. 156900. 200000. 450000.
 220700. 125000. 275000. 210700. 444100. 390100. 107600. 193200. 189000.
 202200. 2126

**Random Forest Regression**

In [20]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train_preprocessed, y_train)
rf_reg_predictions = rf_reg.predict(X_test_preprocessed)
rf_reg_mse = mean_squared_error(y_test, rf_reg_predictions)
# Adjust the print options
np.set_printoptions(threshold=np.inf)
print("Random Forest Regression Predictions:", rf_reg_predictions)
print("Random Forest Regression MSE:", rf_reg_mse)

Random Forest Regression Predictions: [ 54223.    67538.   473273.45 247210.   264704.   162534.   250301.08
 162415.   308861.07 484992.79 134705.   205445.   173636.   205362.
 259280.   179510.   199157.   183539.   161813.    96323.   354896.05
 456329.48 155283.   428532.32 180742.    59715.   164410.   103793.
 158267.   406687.14  90997.   176386.03 208199.02 460913.48 287759.01
 257512.01 336486.02 161935.   138313.   177196.   314725.06 101663.
 127961.   180993.   149689.   217272.   131872.   169453.   246596.
 105314.   258483.   201181.   157363.    70256.   144129.   148987.
  82631.   230797.   287837.06 191646.   435898.21 305312.11 115555.
 113896.   279126.   181055.   140582.   144691.   159188.   167121.02
 118068.   201277.    95160.   173925.   221670.   183061.   424287.27
 224844.   184039.01 362336.26  73104.   195410.   191129.   182487.
 423058.34 263338.03  96212.    85337.   186493.   193225.   117962.
 250356.   149139.    50021.   120319.   124641.   4239

**Gradient Boosting Regression**

In [21]:
gb_reg = GradientBoostingRegressor()
gb_reg.fit(X_train_preprocessed, y_train)
gb_reg_predictions = gb_reg.predict(X_test_preprocessed)
gb_reg_mse = mean_squared_error(y_test, gb_reg_predictions)
# Adjust the print options
np.set_printoptions(threshold=np.inf)
print("Gradient Boosting Regression Predictions:", gb_reg_predictions)
print("Gradient Boosting Regression MSE:", gb_reg_mse)

Gradient Boosting Regression Predictions: [ 56931.86917349  83690.9148769  377035.89328928 267868.10321237
 285164.98138617 182055.85714567 279512.77033253 223598.01431673
 283161.21959579 464914.79303613 133842.20282497 188590.76577257
 158012.16875207 221043.68803834 239541.8931783  202147.02945937
 260515.1973884  200753.4309983  195744.02884702 106611.39571449
 278972.4090615  367786.89363904 165861.54657704 406958.53075049
 168851.15918026  55425.87119483 174918.45726988 166478.39568724
 133220.28638182 398441.01439516 165049.42313322 203556.90917475
 236267.87944816 437358.02279715 279429.65339825 254612.37409324
 339105.45197217 192790.91159715 155981.11482378 181816.42262284
 314685.02335237 149785.04301725 117959.78946563 187443.28931064
 138631.98561359 199810.77681649 185075.35742557 165652.35632886
 223130.65063496 128130.61319623 257332.51836493 159088.3933061
 210663.58560713  88235.66238828 172712.87745285 171956.69103723
  79977.87755759 245733.59968785 288739.54454438 

**Support Vector Regression**

In [22]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_preprocessed)
X_test_scaled = scaler.transform(X_test_preprocessed)
svr_reg = SVR()
svr_reg.fit(X_train_scaled, y_train)
svr_reg_predictions = svr_reg.predict(X_test_scaled)
svr_reg_mse = mean_squared_error(y_test, svr_reg_predictions)
# Adjust the print options
np.set_printoptions(threshold=np.inf)
print("Support Vector Regression Predictions:", svr_reg_predictions)
print("Support Vector Regression MSE:", svr_reg_mse)

Support Vector Regression Predictions: [178055.2623179  178348.39512422 180166.39625666 180871.63346262
 180083.55200643 179201.70768549 180683.49357395 180359.7227092
 180059.27861322 181048.70133392 178262.45707052 180258.53472621
 180065.60415671 179972.3105427  180866.09890994 180084.13950476
 180075.43873054 178553.53251554 180025.87917623 178084.57966465
 180227.16641361 180548.7519608  178965.23353922 180332.16637224
 180507.10997762 178179.23376216 180176.03702241 179804.17821834
 178440.13733006 180402.50864948 179801.54128725 179813.49284805
 179900.37843462 180396.46298593 180024.49574941 180117.95480713
 180061.71495706 180320.51922641 178623.8253415  180084.57543105
 180833.94629654 178366.55013458 178220.79418174 180381.99809034
 178866.88551031 180294.89499064 179977.43021973 179949.16702574
 179927.0495547  179913.72301984 180144.67710358 180178.45239932
 180285.56327657 178011.31249925 180132.87179125 178932.98866116
 178518.97027183 180786.54437438 180090.64860088 179

**Gradient Descent**

In [23]:
gd_reg = make_pipeline(StandardScaler(), SGDRegressor())
gd_reg.fit(X_train_preprocessed, y_train)
gd_reg_predictions = gd_reg.predict(X_test_preprocessed)
gd_reg_mse = mean_squared_error(y_test, gd_reg_predictions)

# Adjust the print options
np.set_printoptions(threshold=np.inf)

# Print the predictions
print("Gradient Boosting Regression Predictions:")
print(gb_reg_predictions)

print("Gradient Descent Regression MSE:", gd_reg_mse)

Gradient Boosting Regression Predictions:
[ 56931.86917349  83690.9148769  377035.89328928 267868.10321237
 285164.98138617 182055.85714567 279512.77033253 223598.01431673
 283161.21959579 464914.79303613 133842.20282497 188590.76577257
 158012.16875207 221043.68803834 239541.8931783  202147.02945937
 260515.1973884  200753.4309983  195744.02884702 106611.39571449
 278972.4090615  367786.89363904 165861.54657704 406958.53075049
 168851.15918026  55425.87119483 174918.45726988 166478.39568724
 133220.28638182 398441.01439516 165049.42313322 203556.90917475
 236267.87944816 437358.02279715 279429.65339825 254612.37409324
 339105.45197217 192790.91159715 155981.11482378 181816.42262284
 314685.02335237 149785.04301725 117959.78946563 187443.28931064
 138631.98561359 199810.77681649 185075.35742557 165652.35632886
 223130.65063496 128130.61319623 257332.51836493 159088.3933061
 210663.58560713  88235.66238828 172712.87745285 171956.69103723
  79977.87755759 245733.59968785 288739.54454438 

**Performance** (A lower MSE indicates better performance)

In [24]:
print("Linear Regression MSE:", linear_reg_mse)
print("Polynomial Regression MSE:", poly_reg_mse)
print("Ridge Regression MSE:", ridge_reg_mse)
print("Lasso Regression MSE:", lasso_reg_mse)
print("Decision Tree Regression MSE:", dt_reg_mse)
print("Random Forest Regression MSE:", rf_reg_mse)
print("Gradient Boosting Regression MSE:", gb_reg_mse)
print("Support Vector Regression MSE:", svr_reg_mse)
print("Gradient Descent Regression MSE:", gd_reg_mse)

Linear Regression MSE: 4904409297.414914
Polynomial Regression MSE: 4428462422.659215
Ridge Regression MSE: 4905109934.93573
Lasso Regression MSE: 4904522495.368368
Decision Tree Regression MSE: 4804853100.139293
Random Forest Regression MSE: 2391467589.043087
Gradient Boosting Regression MSE: 3131481878.319038
Support Vector Regression MSE: 13651849621.976437
Gradient Descent Regression MSE: 2.501767521437021e+20


In [25]:
# Initialize the best MSE variable
best_mse = float('inf') # It should be an initial value that is higher than the maximum possible MSE to ensure that the first model's MSE is assigned correctly.
best_model = None

# Compare the MSE values and find the best model
if linear_reg_mse < best_mse:
    best_mse = linear_reg_mse
    best_model = "Linear Regression"

if poly_reg_mse < best_mse:
    best_mse = poly_reg_mse
    best_model = "Polynomial Regression"

if ridge_reg_mse < best_mse:
    best_mse = ridge_reg_mse
    best_model = "Ridge Regression"

if lasso_reg_mse < best_mse:
    best_mse = lasso_reg_mse
    best_model = "Lasso Regression"

if dt_reg_mse < best_mse:
    best_mse = dt_reg_mse
    best_model = "Decision Tree Regression"

if rf_reg_mse < best_mse:
    best_mse = rf_reg_mse
    best_model = "Random Forest Regression"

if gb_reg_mse < best_mse:
    best_mse = gb_reg_mse
    best_model = "Gradient Boosting Regression"

if svr_reg_mse < best_mse:
    best_mse = svr_reg_mse
    best_model = "Support Vector Regression"

if gd_reg_mse < best_mse:
    best_mse = gd_reg_mse
    best_model = "Gradient Descent Regression"

print("The best model is:", best_model)


The best model is: Random Forest Regression
