In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Data cleaning and one-hot encoding of the categorical (qualitative) columns

In [None]:
# Load the raw listings and derive the helper ratio used later for outlier screening.
data = pd.read_csv('Housing.csv')
# NOTE(review): despite its name, this evaluates to (price / area) * stories,
# not plain price / area -- confirm that scaling by `stories` is intended.
data['priceperarea'] = data['price'].div(data['area']).mul(data['stories'])

# For each yes/no column, add a 0/1 twin named '<column>_' (the text original is kept).
lists = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_codes = {'yes': 1, 'no': 0}
data = data.assign(**{f'{col}_': data[col].map(yes_no_codes) for col in lists})

# One-hot encode the furnishing status, then drop the 'unfurnished' indicator so
# only the 'furnished' and 'semi-furnished' indicators remain, stored as integers.
data = pd.get_dummies(data, columns=['furnishingstatus'], prefix='furnishingstatus', drop_first=False)
data = data.drop(columns=['furnishingstatus_unfurnished'])
data = data.astype({
    'furnishingstatus_furnished': int,
    'furnishingstatus_semi-furnished': int,
})
print(data.head())
print(data.shape)

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  ...  prefarea priceperarea  mainroad_  \
0              no             yes  ...       yes  5377.358491          1   
1              no             yes  ...        no  5468.750000          1   
2              no              no  ...       yes  2459.839357          1   
3              no             yes  ...       yes  3257.333333          1   
4              no             yes  ...        no  3075.471698          1   

   guestroom_  basement_  hotwaterheating_  airconditioning_  prefar

In [None]:
# Column inventory, followed by summary statistics from describe().
print(list(data.columns))
print(data.describe())


['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'priceperarea', 'mainroad_', 'guestroom_', 'basement_', 'hotwaterheating_', 'airconditioning_', 'prefarea_', 'furnishingstatus_furnished', 'furnishingstatus_semi-furnished']
              price          area    bedrooms   bathrooms     stories  \
count  5.450000e+02    545.000000  545.000000  545.000000  545.000000   
mean   4.766729e+06   5150.541284    2.965138    1.286239    1.805505   
std    1.870440e+06   2170.141023    0.738064    0.502470    0.867492   
min    1.750000e+06   1650.000000    1.000000    1.000000    1.000000   
25%    3.430000e+06   3600.000000    2.000000    1.000000    1.000000   
50%    4.340000e+06   4600.000000    3.000000    1.000000    2.000000   
75%    5.740000e+06   6360.000000    3.000000    2.000000    2.000000   
max    1.330000e+07  16200.000000    6.000000    4.000000    4.000000   

          parkin

In [None]:
# Select numeric columns by dtype family rather than by an explicit list of
# widths: the integer width produced by `astype(int)` is platform dependent,
# so a hard-coded ['int64', 'float64', 'int32'] list can miss columns.
# 'number' excludes the object (text) columns and boolean columns.
numeric_columns = data.select_dtypes(include='number').columns.tolist()
print(numeric_columns)

['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'priceperarea', 'mainroad_', 'guestroom_', 'basement_', 'hotwaterheating_', 'airconditioning_', 'prefarea_', 'furnishingstatus_furnished', 'furnishingstatus_semi-furnished']


Copy only the numeric columns, leaving the original text columns behind

In [None]:
# Independent numeric-only copy; later edits to data_1 do not touch `data`.
data_1 = data.loc[:, numeric_columns].copy()
print(data_1.head())


      price  area  bedrooms  bathrooms  stories  parking  priceperarea  \
0  13300000  7420         4          2        3        2   5377.358491   
1  12250000  8960         4          4        4        3   5468.750000   
2  12250000  9960         3          2        2        2   2459.839357   
3  12215000  7500         4          2        2        3   3257.333333   
4  11410000  7420         4          1        2        2   3075.471698   

   mainroad_  guestroom_  basement_  hotwaterheating_  airconditioning_  \
0          1           0          0                 0                 1   
1          1           0          0                 0                 1   
2          1           0          1                 0                 0   
3          1           0          1                 0                 1   
4          1           1          1                 0                 1   

   prefarea_  furnishingstatus_furnished  furnishingstatus_semi-furnished  
0          1                

In [None]:
# Distribution of the helper ratio before any filtering.
data_1['priceperarea'].describe()

count     545.000000
mean     1884.631552
std      1264.097033
min       270.395550
25%       902.515723
50%      1519.555077
75%      2515.882353
max      6817.391304
Name: priceperarea, dtype: float64

In [None]:
# Show the row(s) holding the smallest priceperarea.
# Series.min() is vectorised and skips NaN, whereas the builtin min() walks the
# Series element by element and gives order-dependent results if NaN is present.
lowest_priceperarea = data_1['priceperarea'].min()
print(data_1[data_1['priceperarea'] == lowest_priceperarea])

       price   area  bedrooms  bathrooms  stories  parking  priceperarea  \
403  3500000  12944         3          1        1        0     270.39555   

     mainroad_  guestroom_  basement_  hotwaterheating_  airconditioning_  \
403          1           0          0                 0                 0   

     prefarea_  furnishingstatus_furnished  furnishingstatus_semi-furnished  
403          0                           0                                0  


Data cleaning: remove outliers, keeping only rows whose `priceperarea` lies within one standard deviation of the mean

In [None]:
# Centre and spread of the helper ratio.
mean_priceperarea = data_1['priceperarea'].mean()
std_priceperarea = data_1['priceperarea'].std()

# Acceptance band: one standard deviation either side of the mean.
# NOTE(review): a 1-sigma band is narrow for an outlier screen -- confirm the width is intended.
lower_threshold = mean_priceperarea - std_priceperarea
upper_threshold = mean_priceperarea + std_priceperarea

# Keep rows inside the band; both end points are included.
inside_band = data_1['priceperarea'].between(lower_threshold, upper_threshold)
data_2 = data_1[inside_band]
print(data_2.head())
print(data_2.shape)

      price   area  bedrooms  bathrooms  stories  parking  priceperarea  \
2  12250000   9960         3          2        2        2   2459.839357   
4  11410000   7420         4          1        2        2   3075.471698   
5  10850000   7500         3          3        1        2   1446.666667   
7  10150000  16200         5          3        2        0   1253.086420   
8   9870000   8100         4          1        2        2   2437.037037   

   mainroad_  guestroom_  basement_  hotwaterheating_  airconditioning_  \
2          1           0          1                 0                 0   
4          1           1          1                 0                 1   
5          1           0          1                 0                 1   
7          1           0          0                 0                 0   
8          1           1          1                 0                 1   

   prefarea_  furnishingstatus_furnished  furnishingstatus_semi-furnished  
2          1          

Build the feature matrix X by dropping `priceperarea` (computed only for cleaning purposes) and `price` (the target value)

In [None]:
# Features: every numeric column except the target and the cleaning-only ratio.
# (Build from data_2 instead of data_1 to use the outlier-filtered rows.)
X = data_1.drop(columns=['price', 'priceperarea'])
X.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,mainroad_,guestroom_,basement_,hotwaterheating_,airconditioning_,prefarea_,furnishingstatus_furnished,furnishingstatus_semi-furnished
0,7420,4,2,3,2,1,0,0,0,1,1,1,0
1,8960,4,4,4,3,1,0,0,0,1,0,1,0
2,9960,3,2,2,2,1,0,1,0,0,1,0,1
3,7500,4,2,2,3,1,0,1,0,1,1,1,0
4,7420,4,1,2,2,1,1,1,0,1,0,1,0


Create Y, the target value (`price`)

In [None]:
# Target vector: the price column.
# (Take it from data_2 instead of data_1 when X is built from the filtered rows.)
Y = data_1['price']
Y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=10,
)

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline: linear regression fitted on the training split and scored on the
# held-out split (fit() returns the estimator itself, so the calls can be chained).
lr_clf = LinearRegression().fit(x_train, y_train)
lr_clf.score(x_test, y_test)

0.7312408811520077

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Repeat the 80/20 evaluation over 40 seeded random splits.
cv = ShuffleSplit(n_splits=40, test_size=0.2, random_state=0)

# One score per split, then their average.
accuracy = cross_val_score(LinearRegression(), X, Y, cv=cv)
print(accuracy)
print(accuracy.mean())


[0.66112143 0.63270309 0.71526907 0.57752042 0.71074636 0.64880386
 0.61689018 0.63173516 0.64517857 0.65791661 0.65694571 0.68757627
 0.76324882 0.67633131 0.70218261 0.64952632 0.55012624 0.66798768
 0.72980543 0.61296993 0.70109224 0.63506828 0.73103879 0.70053501
 0.6920404  0.68344593 0.68847759 0.70231646 0.60629061 0.75581873
 0.67361937 0.67021029 0.69575644 0.61293399 0.64388723 0.68600141
 0.75473312 0.72349161 0.58841678 0.76659263]
0.6726587994233053


In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X, y, random_state=10):
    """Grid-search a few regressors and report the best CV score of each.

    NOTE: a later cell in this notebook defines a function with the same name
    (and a larger model grid); once that cell runs it shadows this definition.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    random_state : int or None, default 10
        Seed for the shuffle-split and for every estimator that has a random
        component (Lasso with ``selection='random'``, the decision tree with
        ``splitter='random'``). Previously only the splitter was seeded, so
        the reported scores and best parameters could change between runs.
        Pass ``None`` to get the old unseeded estimator behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fitted model with the columns
        ``model``, ``best_score`` and ``best_params``.
    """
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {}  # no hyper-parameters searched
        },
        'lasso': {
            'model': Lasso(random_state=random_state),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(random_state=random_state),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        }
    }

    scores = []
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=random_state)

    for algo_name, config in algos.items():
        # error_score='raise' makes a failing fit raise instead of scoring NaN;
        # the except below reports it and moves on to the next model.
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False, error_score='raise')
        try:
            gs.fit(X, y)
            scores.append({
                'model': algo_name,
                'best_score': gs.best_score_,
                'best_params': gs.best_params_
            })
        except Exception as e:
            print(f"Error fitting {algo_name}: {e}")

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
# Run the comparison on the feature matrix X and the target Y built in the
# earlier cells; the returned table is displayed as the cell output.
find_best_model_using_gridsearchcv(X, y=Y)


Unnamed: 0,model,best_score,best_params
0,linear_regression,0.650535,{}
1,lasso,0.650536,"{'alpha': 2, 'selection': 'random'}"
2,decision_tree,0.261625,"{'criterion': 'squared_error', 'splitter': 'ra..."


In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

def find_best_model_using_gridsearchcv(X, y, random_state=10):
    """Grid-search several regressors and report the best CV score of each.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    random_state : int or None, default 10
        Seed for the shuffle-split and for every estimator that has a random
        component (Lasso, decision tree, random forest, gradient boosting).
        Previously only the splitter was seeded, so the reported scores and
        best parameters could change between runs. Pass ``None`` to get the
        old unseeded estimator behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fitted model with the columns
        ``model``, ``best_score`` and ``best_params``.
    """
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {}  # no hyper-parameters searched
        },
        'lasso': {
            'model': Lasso(random_state=random_state),
            'params': {
                'alpha': [0.1, 0.5, 1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(random_state=random_state),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random'],
                'max_depth': [None, 10, 20, 30]
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(random_state=random_state),
            'params': {
                'n_estimators': [10, 50, 100],
                'criterion': ['squared_error', 'friedman_mse'],
                'max_depth': [None, 10, 20, 30]
            }
        },
        'gradient_boosting': {
            'model': GradientBoostingRegressor(random_state=random_state),
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 10]
            }
        },
        'svr': {
            # NOTE(review): SVR is fitted on the features and target as given;
            # it is usually sensitive to scale, so consider standardising the
            # inputs (and target) before trusting its score -- TODO confirm.
            'model': SVR(),
            'params': {
                'kernel': ['linear', 'rbf'],
                'C': [1, 10, 100],
                'epsilon': [0.1, 0.2, 0.5]
            }
        }
    }

    scores = []
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=random_state)

    for algo_name, config in algos.items():
        # error_score='raise' makes a failing fit raise instead of scoring NaN;
        # the except below reports it and moves on to the next model.
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False, error_score='raise')
        try:
            gs.fit(X, y)
            scores.append({
                'model': algo_name,
                'best_score': gs.best_score_,
                'best_params': gs.best_params_
            })
        except Exception as e:
            print(f"Error fitting {algo_name}: {e}")

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

# Compare all candidate models on the feature matrix X and the target Y
# (house price) built in the earlier cells, then print the summary table.
result = find_best_model_using_gridsearchcv(X, y=Y)
print(result)


               model  best_score  \
0  linear_regression    0.650535   
1              lasso    0.650536   
2      decision_tree    0.309983   
3      random_forest    0.596467   
4  gradient_boosting    0.606061   
5                svr    0.274861   

                                         best_params  
0                                                 {}  
1                {'alpha': 2, 'selection': 'cyclic'}  
2  {'criterion': 'friedman_mse', 'max_depth': 10,...  
3  {'criterion': 'friedman_mse', 'max_depth': 10,...  
4  {'learning_rate': 0.1, 'max_depth': 3, 'n_esti...  
5     {'C': 100, 'epsilon': 0.5, 'kernel': 'linear'}  
