In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a wide DataFrame is displayed. The public entry point
# `pd.set_option` is used instead of `pd.pandas.set_option`, which only resolves
# because the package happens to expose itself as an attribute.
pd.set_option('display.max_columns', None)
import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")
import sklearn
from sklearn.model_selection import train_test_split,learning_curve
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import joblib
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet,LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

In [2]:
# Ask scikit-learn transformers to return pandas DataFrames (with feature
# names) from transform/fit_transform instead of bare NumPy arrays.
sklearn.set_config(transform_output="pandas")

In [3]:
# Read the pre-split train/test CSV files; both paths are relative to the
# notebook's working directory.
train_df = pd.read_csv("Dataset/train_data.csv")
test_df = pd.read_csv("Dataset/test_data.csv")

In [4]:
# Inspect the column dtypes to see which features are numeric and which are
# categorical before building the preprocessing branches.
train_df.dtypes

location       object
total_sqft    float64
bath          float64
balcony       float64
BHK           float64
price         float64
dtype: object

In [5]:
def split_data(data):
    """Split a frame into features and the ``price`` target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a ``price`` column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``data`` without the ``price`` column and
        ``y`` is an independent copy of the ``price`` column.
    """
    features = data.drop(columns=['price'])
    # Copy so later edits to the target never write back into `data`.
    target = data['price'].copy()
    return features, target

In [6]:
# Separate the training frame into the feature matrix and the `price` target.
X_train, y_train = split_data(train_df)

In [7]:
# Preview the training features (the `price` column has been split off).
X_train

Unnamed: 0,location,total_sqft,bath,balcony,BHK
0,kengeri,1052.0,2.0,2.0,2.0
1,other,1632.0,3.0,0.0,4.0
2,haralur road,1027.0,2.0,2.0,2.0
3,haralur road,1140.0,2.0,2.0,2.0
4,other,2200.0,2.0,0.0,2.0
...,...,...,...,...,...
6249,vijayanagar,989.0,2.0,0.0,2.0
6250,varthur,1560.0,3.0,2.0,3.0
6251,brookefield,1225.0,2.0,2.0,2.0
6252,other,2260.0,3.0,1.0,2.0


In [8]:
# Training feature-matrix dimensions as (rows, columns).
X_train.shape

(6254, 5)

In [9]:
# Training target length; it should equal the number of rows in X_train.
y_train.shape

(6254,)

In [10]:
# Apply the same feature/target split to the held-out test frame.
X_test, y_test = split_data(test_df)

In [11]:
# Test feature-matrix dimensions as (rows, columns).
X_test.shape

(2681, 5)

In [12]:
# Test target length; it should equal the number of rows in X_test.
y_test.shape

(2681,)

In [13]:
# Column groups routed to the numeric and categorical preprocessing branches.
num_cols = [
    'total_sqft',
    'bath',
    'balcony',
    'BHK',
]
cat_cols = [
    'location',
]

def frequency_encoder(X):
    """Replace each value with its relative frequency within its column.

    The previous body delegated to ``frequency_encode``, a name that is not
    defined or imported anywhere in this notebook, so every call raised
    ``NameError``. The encoding is now done inline.

    NOTE(review): relative frequency (share of rows) is assumed to be the
    intended encoding -- confirm against the original helper if it exists.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Anything accepted by ``pd.DataFrame``; each column is encoded
        independently.

    Returns
    -------
    pandas.DataFrame
        Same index and column labels as ``pd.DataFrame(X)``, with each value
        replaced by the fraction of rows in that column holding the same
        value. Missing values stay missing.

    Notes
    -----
    Frequencies are computed from the data passed in on each call, so the
    function is stateless: it does not remember frequencies seen earlier.
    """
    frame = pd.DataFrame(X)
    encoded = {
        column: frame[column].map(frame[column].value_counts(normalize=True))
        for column in frame.columns
    }
    return pd.DataFrame(encoded, index=frame.index)


In [14]:
# Numeric branch: fill missing values with the column median, then
# standardise each feature.
num_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('scalar', StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time (handle_unknown='ignore') and a dense array is produced.
cat_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ]
)

In [15]:
# Route the numeric and categorical column groups through their own branch
# and concatenate the results into one feature matrix.
preprocessor = ColumnTransformer(
    [
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
    ]
)

In [16]:
# Fit the preprocessor on the training split only, then transform that same
# split to inspect the resulting feature matrix (`fit` returns the fitted
# preprocessor, so the two calls can be chained).
X_train_transformed = preprocessor.fit(X_train, y_train).transform(X_train)

In [17]:
# Preview the preprocessed training features: scaled numeric columns followed
# by one indicator column per location seen during fit.
X_train_transformed

Unnamed: 0,num__total_sqft,num__bath,num__balcony,num__BHK,cat__location_1st phase jp nagar,cat__location_2nd phase judicial layout,cat__location_5th phase jp nagar,cat__location_6th phase jp nagar,cat__location_7th phase jp nagar,cat__location_8th phase jp nagar,cat__location_9th phase jp nagar,cat__location_abbigere,cat__location_akshaya nagar,cat__location_ambalipura,cat__location_ambedkar nagar,cat__location_amruthahalli,cat__location_anandapura,cat__location_ananth nagar,cat__location_anekal,cat__location_anjanapura,cat__location_ardendale,cat__location_arekere,cat__location_attibele,cat__location_babusapalaya,cat__location_badavala nagar,cat__location_balagere,cat__location_banashankari,cat__location_banashankari stage iii,cat__location_banashankari stage v,cat__location_banaswadi,cat__location_bannerghatta,cat__location_bannerghatta road,cat__location_basavangudi,cat__location_battarahalli,cat__location_begur,cat__location_begur road,cat__location_bellandur,cat__location_bharathi nagar,cat__location_bhoganhalli,cat__location_billekahalli,cat__location_binny pete,cat__location_bisuvanahalli,cat__location_bommanahalli,cat__location_bommasandra,cat__location_bommasandra industrial area,cat__location_brookefield,cat__location_btm 2nd stage,cat__location_btm layout,cat__location_budigere,cat__location_chandapura,cat__location_channasandra,cat__location_chikka tirupathi,cat__location_chikkalasandra,cat__location_choodasandra,cat__location_cv raman nagar,cat__location_dasanapura,cat__location_dasarahalli,cat__location_devanahalli,cat__location_devarachikkanahalli,cat__location_dodda nekkundi,cat__location_doddakallasandra,cat__location_doddathoguru,cat__location_domlur,cat__location_electronic city,cat__location_electronic city phase ii,cat__location_electronics city phase 1,cat__location_epip zone,cat__location_frazer town,cat__location_garudachar palya,cat__location_gm palaya,cat__location_gottigere,cat__location_green glen 
layout,cat__location_gubbalala,cat__location_gunjur,cat__location_haralur road,cat__location_harlur,cat__location_hebbal,cat__location_hebbal kempapura,cat__location_hegde nagar,cat__location_hennur,cat__location_hennur road,cat__location_hoodi,cat__location_horamavu agara,cat__location_horamavu banaswadi,cat__location_hormavu,cat__location_hosa road,cat__location_hosakerehalli,cat__location_hoskote,cat__location_hosur road,cat__location_hrbr layout,cat__location_hsr layout,cat__location_hulimavu,cat__location_iblur village,cat__location_indira nagar,cat__location_itpl,cat__location_jakkur,cat__location_jalahalli,cat__location_jalahalli east,cat__location_jigani,cat__location_jp nagar,cat__location_kadugodi,cat__location_kaggadasapura,cat__location_kaggalipura,cat__location_kaikondrahalli,cat__location_kalena agrahara,cat__location_kalyan nagar,cat__location_kambipura,cat__location_kammasandra,cat__location_kanakapura,cat__location_kanakpura road,cat__location_kannamangala,cat__location_kasavanhalli,cat__location_kathriguppe,cat__location_kaval byrasandra,cat__location_kenchenahalli,cat__location_kengeri,cat__location_kengeri satellite town,cat__location_kereguddadahalli,cat__location_kodichikkanahalli,cat__location_kodigehaali,cat__location_kogilu,cat__location_koramangala,cat__location_kothannur,cat__location_kothanur,cat__location_kr puram,cat__location_kudlu,cat__location_kudlu gate,cat__location_kumaraswami layout,cat__location_kundalahalli,cat__location_lakshminarayana pura,cat__location_lingadheeranahalli,cat__location_magadi road,cat__location_mahadevpura,cat__location_mallasandra,cat__location_malleshpalya,cat__location_malleshwaram,cat__location_marathahalli,cat__location_margondanahalli,cat__location_munnekollal,cat__location_murugeshpalya,cat__location_mysore road,cat__location_nagarbhavi,cat__location_nagavara,cat__location_nagavarapalya,cat__location_neeladri nagar,cat__location_ngr layout,cat__location_nri layout,cat__location_old airport 
road,cat__location_old madras road,cat__location_ombr layout,cat__location_other,cat__location_padmanabhanagar,cat__location_pai layout,cat__location_panathur,cat__location_parappana agrahara,cat__location_pattandur agrahara,cat__location_r.t. nagar,cat__location_rachenahalli,cat__location_raja rajeshwari nagar,cat__location_rajaji nagar,cat__location_rajiv nagar,cat__location_ramagondanahalli,cat__location_ramamurthy nagar,cat__location_rayasandra,cat__location_sahakara nagar,cat__location_sanjay nagar,cat__location_sarjapur,cat__location_sarjapur road,cat__location_sarjapura - attibele road,cat__location_sector 7 hsr layout,cat__location_seegehalli,cat__location_singasandra,cat__location_somasundara palya,cat__location_sonnenahalli,cat__location_subramanyapura,cat__location_talaghattapura,cat__location_tc palaya,cat__location_thanisandra,cat__location_thigalarapalya,cat__location_thubarahalli,cat__location_tumkur road,cat__location_uttarahalli,cat__location_varthur,cat__location_varthur road,cat__location_vasanthapura,cat__location_vidyaranyapura,cat__location_vijayanagar,cat__location_vittasandra,cat__location_whitefield,cat__location_yelachenahalli,cat__location_yelahanka,cat__location_yelahanka new town,cat__location_yeshwanthpur
0,-0.063042,-0.438579,0.558628,-0.531302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.017212,0.591414,-1.959325,1.654008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.065017,-0.438579,0.558628,-0.531302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.056088,-0.438579,0.558628,-0.531302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.027670,-0.438579,-1.959325,-0.531302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6249,-0.068020,-0.438579,-1.959325,-0.531302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6250,-0.022901,0.591414,0.558628,0.561353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6251,-0.049372,-0.438579,0.558628,-0.531302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6252,0.032411,0.591414,-0.700348,-0.531302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Candidate regressors, keyed by the display name used in plots and tables.
# Every estimator with a stochastic component gets a fixed random_state so the
# learning curves and the metrics table are reproducible across kernel
# restarts (42 matches the seed passed to learning_curve in this notebook).
algorithms = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGB Regressor": XGBRegressor(objective='reg:squarederror', random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
}

def plot_curve(sizes, mean_scores, std_scores, label, ax):
    """Draw one score curve with a +/- one standard deviation band.

    Parameters
    ----------
    sizes : array-like
        X-axis values (training-set sizes).
    mean_scores, std_scores : numpy.ndarray
        Mean score and standard deviation at each size; they are combined
        element-wise to form the band.
    label : str
        Legend label for the line.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    """
    # The mean curve, with a marker at every evaluated size.
    ax.plot(sizes, mean_scores, label=label, marker='o')

    # Semi-transparent band spanning mean - std to mean + std.
    lower_band = mean_scores - std_scores
    upper_band = mean_scores + std_scores
    ax.fill_between(sizes, lower_band, upper_band, alpha=0.5)

def plot_learning_curve(name, algorithm, figsize=(12, 4), X=None, y=None):
    """Plot training and cross-validation R² learning curves for one algorithm.

    The algorithm is placed behind the module-level ``preprocessor`` in a
    pipeline and evaluated with 3-fold cross-validation through
    ``learning_curve``. The final point of each curve is summarised in the
    legend as ``mean +/- std``.

    Parameters
    ----------
    name : str
        Plot title.
    algorithm : estimator
        Regressor used as the last pipeline step.
    figsize : tuple, default (12, 4)
        Figure size passed to ``plt.subplots``.
    X, y : optional
        Data the curves are computed on. When both are omitted the training
        split (``X_train`` / ``y_train``) is used. Earlier versions ran the
        cross-validation on the test split, which spends the held-out data on
        model selection and leaves nothing untouched for the final check.

    Raises
    ------
    ValueError
        If only one of ``X`` and ``y`` is supplied.
    """
    if (X is None) != (y is None):
        raise ValueError("X and y must be provided together")
    if X is None:
        X, y = X_train, y_train

    model = Pipeline(steps=[
        ("pre", preprocessor),
        ('alg', algorithm)
    ])
    # NOTE: random_state is only consulted by learning_curve when shuffle=True,
    # which is not set here; it is kept to preserve the existing call.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model,
        X=X,
        y=y,
        cv=3,
        scoring="r2",
        n_jobs=-1,
        random_state=42
    )

    # Scores have one row per training size and one column per CV fold;
    # aggregate across folds.
    mean_train_scores = np.mean(train_scores, axis=1)
    std_train_scores = np.std(train_scores, axis=1)
    train_score = f"{mean_train_scores[-1]:0.2f} +/- {std_train_scores[-1]:0.2f}"
    mean_test_scores = np.mean(test_scores, axis=1)
    std_test_scores = np.std(test_scores, axis=1)
    test_score = f"{mean_test_scores[-1]:0.2f} +/- {std_test_scores[-1]:0.2f}"

    fig, ax = plt.subplots(figsize=figsize)

    # Score on the folds the model was fitted on.
    plot_curve(
        train_sizes,
        mean_train_scores,
        std_train_scores,
        f"Train({train_score})",
        ax
    )

    # Score on the held-out cross-validation folds (labelled "Test").
    plot_curve(
        train_sizes,
        mean_test_scores,
        std_test_scores,
        f"Test({test_score})",
        ax
    )
    ax.set(xlabel="Training set Size", ylabel="R_squared", title=name)
    ax.legend(loc="lower right")
    plt.show()


# Draw one learning-curve figure per candidate algorithm.
for name, alg in algorithms.items():
    plot_learning_curve(name, alg)

In [19]:
# Fit every candidate on the training split and collect train/test metrics,
# one record per algorithm.
results = []

for name, algorithm in algorithms.items():
    # Same preprocessing for every candidate; only the final estimator varies.
    model = Pipeline(steps=[("pre", preprocessor), ('alg', algorithm)])
    model.fit(X_train, y_train)

    # Predict on both splits so the train/test gap (overfitting) is visible.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Metrics are stored as two-decimal strings for display.
    results.append({
        "Model": name,
        "Training R²": f"{r2_score(y_train, y_train_pred):.2f}",
        "Testing R²": f"{r2_score(y_test, y_test_pred):.2f}",
        "Training MSE": f"{mean_squared_error(y_train, y_train_pred):.2f}",
        "Testing MSE": f"{mean_squared_error(y_test, y_test_pred):.2f}",
        "Training MAE": f"{mean_absolute_error(y_train, y_train_pred):.2f}",
        "Testing MAE": f"{mean_absolute_error(y_test, y_test_pred):.2f}"
    })

# One column per model after transposing, one row per metric.
results_df = pd.DataFrame(results)
results_df.T


Unnamed: 0,0,1,2,3,4
Model,Linear Regression,Decision Tree Regressor,Random Forest Regressor,XGB Regressor,Gradient Boosting Regressor
Training R²,0.48,0.99,0.95,0.91,0.85
Testing R²,0.51,0.40,0.79,0.69,0.81
Training MSE,2679.99,56.13,257.86,454.53,764.00
Testing MSE,2320.62,2809.22,974.06,1433.53,894.85
Training MAE,27.67,2.42,7.68,13.29,17.66
Testing MAE,26.69,21.21,17.05,16.98,17.94


In [20]:
# Final pipeline that gets persisted: the shared preprocessor followed by a
# gradient-boosting regressor. random_state is fixed so refitting the notebook
# reproduces the same saved model.
# NOTE(review): the step is named 'xgb' although it holds scikit-learn's
# GradientBoostingRegressor; the name is kept so existing
# `named_steps['xgb']` / `xgb__*` parameter references keep working.
model = Pipeline(steps=[
    ('pre', preprocessor),
    ('xgb', GradientBoostingRegressor(random_state=42))
])

In [21]:
# Fit the preprocessing steps and the regressor on the training split only.
model.fit(X_train, y_train)

In [22]:
def eval(X, y):
    """Return the R² score of the module-level ``model`` on ``(X, y)``.

    NOTE(review): this name shadows Python's builtin ``eval`` for the rest of
    the notebook; it is kept because later cells call it by this name.

    Parameters
    ----------
    X : features accepted by ``model.predict``.
    y : true target values aligned with ``X``.
    """
    predictions = model.predict(X)
    score = r2_score(y, predictions)
    return score

In [23]:
# R² of the final pipeline on the held-out test split.
print(eval(X_test, y_test))

0.8097905082822657


In [24]:
# R² on the training split, for comparison with the test score.
print(eval(X_train, y_train))

0.8508124320658033


In [27]:
# Persist the whole fitted pipeline (preprocessing + regressor) so inference
# needs only this one file.
joblib.dump(model, "model.joblib")

['model.joblib']

In [26]:

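# NOTE: the fitted preprocessor is already stored inside model.joblib as the
# pipeline's 'pre' step; uncomment only if a standalone copy is required.
#joblib.dump(preprocessor, 'preprocessor.joblib')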



['preprocessor.joblib']