In [25]:
import pandas as pd

In [26]:
data=pd.read_csv("housing.csv")

In [27]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [29]:
data.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [30]:
X=data.drop(columns=['median_house_value'])
Y=data['median_house_value']

In [31]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.compose import ColumnTransformer

In [32]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=42)

In [33]:
num_cols=X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols=X.select_dtypes(include=['object']).columns.tolist()

In [34]:
num_pipeline=Pipeline([
    ("Impute",SimpleImputer(strategy='median')),
    ("scaler",StandardScaler())
])

In [35]:
cat_pipeline=Pipeline([
    ("Impute",SimpleImputer(strategy='most_frequent')),
    ("OHE",OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])

In [36]:
prep=ColumnTransformer([
    ("num_pipe",num_pipeline,num_cols),
    ("cat_pipe",cat_pipeline,cat_cols)
])

In [37]:
pipe=Pipeline([
    ("prep",prep),
    ("model",SVR(kernel='linear'))
])

In [38]:
pipe.fit(X_train,Y_train)

In [41]:
Y_pred=pipe.predict(X_test)

In [42]:
print("r2 score is",r2_score(Y_test,Y_pred))
print("Mean Squared Error is",mean_squared_error(Y_test,Y_pred))
print("Mean Absolute Error is",mean_absolute_error(Y_test,Y_pred))

r2 score is 0.05260633018263339
Mean Squared Error is 12434921895.242579
Mean Absolute Error is 82531.35189903586


In [43]:
print("Training Accuracy is",pipe.score(X_train,Y_train))
print("Testing Accuracy is",pipe.score(X_test,Y_test))

Training Accuracy is 0.05321933582838967
Testing Accuracy is 0.05260633018263339


In [44]:
pipe=Pipeline([
    ("prep",prep),
    ("model",SVR())
])

In [45]:
param_grid = {
    "model__kernel": ["linear", "rbf", "poly"],
    "model__C": [0.1, 1, 10, 100],
    "model__epsilon": [0.01, 0.1, 0.2, 0.5],
    "model__gamma": ["scale", "auto"]
}


In [46]:
grid=GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

In [47]:
grid.fit(X_train,Y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [48]:
best_model=grid.best_estimator_
best_score=grid.best_score_
best_params=grid.best_params_

In [49]:
print("Best Score is",best_score)
print("Best Parameters are",best_params)

Best Score is 0.6119482150965715
Best Parameters are {'model__C': 100, 'model__epsilon': 0.01, 'model__gamma': 'scale', 'model__kernel': 'linear'}


In [50]:
Y_pred1=best_model.predict(X_test)

In [51]:
print("r2 score is",r2_score(Y_test,Y_pred1))
print("Mean Squared Error is",mean_squared_error(Y_test,Y_pred1))
print("Mean Absolute Error is",mean_absolute_error(Y_test,Y_pred1))

r2 score is 0.6108047680571909
Mean Squared Error is 5108343516.948562
Mean Absolute Error is 49280.59225866007


In [52]:
print("Training Accuracy is",grid.score(X_train,Y_train))
print("Testing Accuracy is",grid.score(X_test,Y_test))

Training Accuracy is 0.6163993396682945
Testing Accuracy is 0.6108047680571909
