In [144]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_regression

In [168]:
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names=attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values
    def get_feature_names_out(self, input_features=None):
        return np.array(self.attribute_names)

In [169]:
def splitting_data(data,category_to_split):
    split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
    for train_index,test_index in split.split(data,data[category_to_split]):
        strat_train_set=data.loc[train_index]
        strat_test_set=data.loc[test_index]
    return strat_test_set, strat_train_set

In [170]:
input_data=pd.read_csv('/home/greg/Workdir/Python_Learning/Data/housing.csv')

In [171]:
input_data['income_cat']=np.ceil(input_data['median_income']/1.5)

In [172]:
input_data['income_cat']=input_data['income_cat'].where(input_data['income_cat']<5,5.0)

In [173]:
strat_test_set,strat_train_set=splitting_data(input_data,'income_cat')

In [174]:
data_labels=strat_train_set['median_house_value'].copy()

In [175]:
data=strat_train_set.drop('median_house_value',axis=1)

In [176]:
num_attribs=list(data.drop('ocean_proximity',axis=1))
cat_attribs=['ocean_proximity']

In [177]:
imputer=SimpleImputer(strategy='median')

In [178]:
num_pipeline=Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
     ])

In [179]:
cat_pipeline=Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder',OneHotEncoder(sparse_output=True)),
     ])

In [180]:
full_pipeline=FeatureUnion(transformer_list=[
    ('num_pipeline',num_pipeline),
    ('cat_pipeline',cat_pipeline),])

In [181]:
data_prepared=full_pipeline.fit_transform(data)

In [89]:
linreg=LinearRegression()

In [90]:
linreg.fit(data_prepared,data_labels)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [91]:
#Checking pipeline
some_data=data.iloc[:5]
some_labels=data_labels.iloc[:5]
some_data_prepared=full_pipeline.transform(some_data)
print('Predicted:',linreg.predict(some_data_prepared))
print('Labels:',list(some_labels))

Predicted: [ 87710.91209139 312666.58754617 148194.03226643 181973.75286976
 242269.5013732 ]
Labels: [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]


In [92]:
data_predictions=linreg.predict(data_prepared)
lin_mse=mean_squared_error(data_labels,data_predictions)
lin_rmse=np.sqrt(lin_mse)
lin_rmse

np.float64(68866.7855010671)

In [93]:
tree_reg=DecisionTreeRegressor()
tree_reg.fit(data_prepared,data_labels)
data_predictions=tree_reg.predict(data_prepared)
tree_mse=mean_squared_error(data_labels,data_predictions)
tree_rmse=np.sqrt(lin_mse)
tree_rmse

np.float64(68866.7855010671)

In [94]:
scores=cross_val_score(tree_reg,data_prepared,data_labels,scoring='neg_mean_squared_error',cv=10)

In [95]:
tree_rmse_scores=np.sqrt(-scores)

In [96]:
def display_scores(scores):
    print('Results:', scores)
    print('Mean:',scores.mean())
    print('Std dev:',scores.std())

In [97]:
display_scores(tree_rmse_scores)

Results: [71006.96976101 69687.37209553 65398.72326743 68726.27881268
 67970.09599827 69417.55622913 73517.96168565 69500.40913674
 67281.23829907 71927.88635947]
Mean: 69443.44916449886
Std dev: 2213.643075797487


In [98]:
lin_scores=cross_val_score(linreg,data_prepared,data_labels,scoring='neg_mean_squared_error',cv=10)

In [99]:
lin_rmse_scores=np.sqrt(-lin_scores)

In [100]:
display_scores(lin_rmse_scores)

Results: [72098.22665206 65263.25382399 67474.73970678 69344.34356548
 66405.23455751 72763.65486439 70271.51498936 69326.87905948
 66674.88294971 70584.58383035]
Mean: 69020.73139990955
Std dev: 2377.846821298085


In [None]:
forest_reg=RandomForestRegressor()
forest_reg.fit(data_prepared,data_labels)

In [None]:
param_grid=[
    {'n_estimators':[3,10,30],'max_features':[2,4,6,8]},
    {'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]},]

In [120]:
grid_search=GridSearchCV(forest_reg,param_grid,cv=5,scoring='neg_mean_squared_error')


In [None]:
grid_search.fit(data_prepared,data_labels)

In [114]:
model=SVR(kernel='rbf',C=100,gamma=0.1,epsilon=0.1))

In [125]:
model.fit(data_prepared,data_labels)

0,1,2
,kernel,'linear'
,degree,3
,gamma,0.1
,coef0,0.0
,tol,0.001
,C,100
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [107]:
X_test=strat_test_set.drop('median_house_value',axis=1)
y_test=strat_test_set['median_house_value'].copy()

In [108]:
X_test_prepared=full_pipeline.transform(X_test)

In [116]:
y_pred=model.predict(X_test_prepared)

In [117]:
final_mse=mean_squared_error(y_test, y_pred)

In [118]:
final_rmse=np.sqrt(final_mse)

In [119]:
final_rmse

np.float64(69785.06476888972)

In [131]:
param_grid = {
    "kernel": ["rbf"],
    "C": [0.1, 1, 10, 100],
    "gamma": [0.01, 0.1, 1],
    "epsilon": [ 0.1, 0.5],
}

In [132]:
grid_search=GridSearchCV(model,param_grid,cv=5,scoring='neg_mean_squared_error')

In [133]:
grid_search.fit(data_prepared,data_labels)

0,1,2
,estimator,"SVR(C=100, ga...rnel='linear')"
,param_grid,"{'C': [1, 10, ...], 'epsilon': [0.1], 'gamma': [0.01, 0.1], 'kernel': ['rbf']}"
,scoring,'neg_mean_squared_error'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,kernel,'rbf'
,degree,3
,gamma,0.1
,coef0,0.0
,tol,0.001
,C,100
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [140]:
param_grid = {
    "kernel": ["rbf"],
    "C": [0.1, 1, 10, 100],
    "gamma": [0.01, 0.1, 1],
    "epsilon": [0.01, 0.1, 0.5],
}

In [142]:
grid_search_random=RandomizedSearchCV(model,param_grid,cv=5,scoring='neg_mean_squared_error')

In [138]:
grid_search_random.fit(data_prepared,data_labels)



0,1,2
,estimator,"SVR(C=100, ga...rnel='linear')"
,param_distributions,"{'C': [1, 10, ...], 'epsilon': [0.1], 'gamma': [0.01, 0.1], 'kernel': ['rbf']}"
,n_iter,10
,scoring,'neg_mean_squared_error'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,kernel,'rbf'
,degree,3
,gamma,0.1
,coef0,0.0
,tol,0.001
,C,100
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [None]:
grid_search_random.fit(data_prepared,data_labels)

In [182]:
selector = SelectKBest(score_func=f_regression, k=10)

selector.fit(data_prepared, data_labels)
X_train_selected = selector.transform(data_prepared)
X_test_selected = selector.transform(X_test_prepared)

In [183]:
X_train_selected

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 115582 stored elements and shape (16512, 10)>

In [186]:
feature_names = full_pipeline.get_feature_names_out()

selected_feature_names = [
    feature_names[i] for i in selected_indices
]

print("Wybrane cechy:", selected_feature_names)

Wybrane cechy: ['num_pipeline__longitude', 'num_pipeline__latitude', 'num_pipeline__housing_median_age', 'num_pipeline__population', 'num_pipeline__households', 'num_pipeline__median_income', 'num_pipeline__income_cat', 'cat_pipeline__ocean_proximity_<1H OCEAN', 'cat_pipeline__ocean_proximity_ISLAND', 'cat_pipeline__ocean_proximity_NEAR BAY']


In [187]:
selected_indices = selector.get_support(indices=True)