# Data Exploration

In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [250]:
df_train=pd.read_csv('train.csv')
df_test=pd.read_csv('test.csv')
x_train=df_train.iloc[:,:-1]
x_train=x_train.drop(['Id'],axis=1)
x_val=df_test.iloc[:,:]
x_val=x_val.drop(['Id'],axis=1)
y_train=df_train.iloc[:,-1]

In [263]:
from sklearn.model_selection import train_test_split
x_training,x_test,y_training,y_test=train_test_split(x_train,y_train,test_size=0.2,random_state=0)

In [273]:
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
# imputing the columns
numerical_cols=x_train.select_dtypes(include=['int64','float64']).columns
categorical_cols=x_train.select_dtypes(include=['object']).columns
Ordinal_cols=["Street","CentralAir"]
OneHot_cols=[cols for cols in categorical_cols if cols not in Ordinal_cols]

numerical_pipeline=Pipeline(
                              [
                                  ('imputer',SimpleImputer(missing_values=np.nan,strategy='mean')),
                                  ('scaling',StandardScaler())
                              ]
                           )

Ordinal_pipeline=Pipeline(
                            [
                                ('imputer',SimpleImputer(missing_values=np.nan,strategy='most_frequent')),
                                ('ordinalEncoder',OrdinalEncoder()),
                            ]
                         )
Onehot_pipeline=Pipeline(
                            [
                                ('imputer',SimpleImputer(missing_values=np.nan,strategy='most_frequent')),
                                ('OnehotEncoding',OneHotEncoder(sparse_output=False,handle_unknown='ignore')),
                            ]
                         )
preprocessor=ColumnTransformer(transformers=[
    ('numericPreprocessing',numerical_pipeline,numerical_cols),
    ('Ordinalpreprocessing',Ordinal_pipeline,Ordinal_cols),
    ('Onehotpreprocessing',Onehot_pipeline,OneHot_cols),
])
Regressor=XGBRegressor(n_estimators=3000,
        learning_rate=0.01,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,)
model=Pipeline(
    [
        ('preprocessing',preprocessor),
        ('regression',Regressor)
    ]
)

model.fit(x_train,np.log1p(y_train))


0,1,2
,steps,"[('preprocessing', ...), ('regression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numericPreprocessing', ...), ('Ordinalpreprocessing', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [274]:
y_pred=model.predict(x_val)
y_pred=np.expm1(y_pred)
test_df = pd.read_csv('test.csv')

# Create submission dataframe
submission = pd.DataFrame({
    'Id': test_df['Id'],   # Use the same Id column from test.csv
    'SalePrice': y_pred    # Replace with your predicted values
})

# Save as CSV
submission.to_csv('submission.csv', index=False)

print("Saved submission.csv in correct Kaggle format")


Saved submission.csv in correct Kaggle format


In [None]:
y_test=np.array(y_test)
print(np.concatenate((y_test.reshape(-1, 1), y_pred.reshape(-1, 1)), axis=1))


[[200624.         218870.34456044]
 [133000.         149597.70793504]
 [110000.         111501.22316016]
 [192000.         218628.81584447]
 [ 88000.          95481.78734731]
 [ 85000.         110979.20308689]
 [282922.         260723.62670321]
 [141000.         121874.39757025]
 [745000.         506500.80148598]
 [148800.         157696.62189277]
 [208900.         194912.93269738]
 [136905.         143972.70467806]
 [225000.         223900.24843667]
 [123000.         114696.18956911]
 [119200.         126084.05835514]
 [145000.         147714.23990572]
 [190000.         223445.75961232]
 [123600.         119223.72840985]
 [149350.         143215.33416971]
 [155000.         185232.77108581]
 [166000.         131338.78795564]
 [144500.         141261.81456857]
 [110000.         112845.59188023]
 [174000.         169788.52031518]
 [185000.         183912.53045975]
 [168000.         219292.41262725]
 [177500.         166694.64268312]
 [ 84500.          88658.41521927]
 [320000.         33

In [241]:
from sklearn.metrics import r2_score,root_mean_squared_error
r2=r2_score(y_test,y_pred)
print(r2)
rmse=root_mean_squared_error(y_test,y_pred)
print(rmse)

0.8421632729419293
33015.09890750622
