In [1]:
import pandas as pd
import numpy as np

In [21]:
House_df=pd.read_csv("HousePricePrediction.csv")

In [22]:
House_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2914,160,RM,1936,Inside,Twnhs,7,1970,1970,CemntBd,0.0,546.0,
2915,2915,160,RM,1894,Inside,TwnhsE,5,1970,1970,CemntBd,0.0,546.0,
2916,2916,20,RL,20000,Inside,1Fam,7,1960,1996,VinylSd,0.0,1224.0,
2917,2917,85,RL,10441,Inside,1Fam,5,1992,1992,HdBoard,0.0,912.0,


In [23]:
#Drop Id
 # Id is just an identifier
 # It has no relationship with house price
 # Including it introduces noise

House_df=House_df.drop(columns=['Id'])

In [24]:


#DATA CLEANING AND FEATURE ENGINEERING

# removing extra space, unsuable data like empty space, null, none

fake_missing = ['none', 'null', 'na', '?', '']

cat_cols = House_df.select_dtypes(include=['object']).columns

House_df[cat_cols] = (
    House_df[cat_cols]
    .apply(lambda col: col.str.strip().str.lower())
    .replace(fake_missing, np.nan)
)


In [25]:
#Separate training data because some data contains NaN value in price 
 #and the model cannot learn without target value so we will use those
  #data as training data

train_df=House_df[House_df['SalePrice'].notnull()]
 # OR  train_df = House_df.dropna(subset=['SalePrice'])

test_df=House_df[House_df['SalePrice'].isnull()]


In [26]:
# seperating X, Y

X=train_df.drop(['SalePrice'], axis=1)
Y=train_df['SalePrice']

X_test=test_df.drop(['SalePrice'], axis=1)
Y_test=test_df['SalePrice']

In [27]:
# When there are missing values, we use imputation techniques 
# like mean, median, or mode.These values are calculated 
# only from the training data to avoid data leakage,and then 
# the same values are used to fill missing values in both 
# training and test data.


# Step 1: Identify column type
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['number']).columns


# Step 2: Find imputation values from TRAIN data ONLY
         # Categorical → mode
         # Numerical → median

cat_fill_values = X[cat_cols].mode().iloc[0]
num_fill_values = X[num_cols].median()


# Step 4: Apply these SAME values to BOTH train and test

   # Categorical columns
X[cat_cols] = X[cat_cols].fillna(cat_fill_values)
X_test[cat_cols] = X_test[cat_cols].fillna(cat_fill_values)

   # Numerical columns
X[num_cols] = X[num_cols].fillna(num_fill_values)
X_test[num_cols] = X_test[num_cols].fillna(num_fill_values)



# THIS IS MANUAL WAY OF HANDLING MISSING VALUE, WE CAN ALSO USE SimpleImputer OF SKlearn



In [28]:
# Step 5: Verify

print(X.isnull().sum().sum())
print(X_test.isnull().sum().sum())


0
0


In [29]:
train_df

Unnamed: 0,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,60,rl,8450,inside,1fam,5,2003,2003,vinylsd,0.0,856.0,208500.0
1,20,rl,9600,fr2,1fam,8,1976,1976,metalsd,0.0,1262.0,181500.0
2,60,rl,11250,inside,1fam,5,2001,2002,vinylsd,0.0,920.0,223500.0
3,70,rl,9550,corner,1fam,5,1915,1970,wd sdng,0.0,756.0,140000.0
4,60,rl,14260,fr2,1fam,5,2000,2000,vinylsd,0.0,1145.0,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,rl,7917,inside,1fam,5,1999,2000,vinylsd,0.0,953.0,175000.0
1456,20,rl,13175,inside,1fam,6,1978,1988,plywood,163.0,1542.0,210000.0
1457,70,rl,9042,inside,1fam,9,1941,2006,cemntbd,0.0,1152.0,266500.0
1458,20,rl,9717,inside,1fam,6,1950,1996,metalsd,1029.0,1078.0,142125.0


In [30]:


test_df



Unnamed: 0,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
1460,20,rh,11622,inside,1fam,6,1961,1961,vinylsd,144.0,882.0,
1461,20,rl,14267,corner,1fam,6,1958,1958,wd sdng,0.0,1329.0,
1462,60,rl,13830,inside,1fam,5,1997,1998,vinylsd,0.0,928.0,
1463,60,rl,9978,inside,1fam,6,1998,1998,vinylsd,0.0,926.0,
1464,120,rl,5005,inside,twnhse,5,1992,1992,hdboard,0.0,1280.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
2914,160,rm,1936,inside,twnhs,7,1970,1970,cemntbd,0.0,546.0,
2915,160,rm,1894,inside,twnhse,5,1970,1970,cemntbd,0.0,546.0,
2916,20,rl,20000,inside,1fam,7,1960,1996,vinylsd,0.0,1224.0,
2917,85,rl,10441,inside,1fam,5,1992,1992,hdboard,0.0,912.0,


In [31]:

#To see percentage of missing value

train_df.isnull().mean() * 100

MSSubClass      0.0
MSZoning        0.0
LotArea         0.0
LotConfig       0.0
BldgType        0.0
OverallCond     0.0
YearBuilt       0.0
YearRemodAdd    0.0
Exterior1st     0.0
BsmtFinSF2      0.0
TotalBsmtSF     0.0
SalePrice       0.0
dtype: float64

In [32]:


# ONE HOT ENCODING USING PANDAS

  # Step 1: Encoding trainning data
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)

  # Step 2: Encode test data
X_test_encoded = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)

  # Step 3: Align train and test columns
    # This step is mandetory because Train and test may have different categories for
    # example if train only have red, blue value for colour column it will create
    # only two dummy column but if test have three values in colour column it will create 
    # three dummy column in test which will change the number of columns in train and 
    # test data and model requires "Same columns in same order" so we will align
    # train and test column after one hot encoding on both train & test and fill missing
    # columns in test with 0

X_encoded, X_test_encoded = X_encoded.align(X_test_encoded,join='left',axis=1,fill_value=0)





# ONE HOT ENCODING USING SKlearn

   # Step 1: Import encoder
#from sklearn.preprocessing import OneHotEncoder

   # Step 2: Create encoder
#encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)

   # Step 3: Fit train categorical data only
#X_cat_encoded = encoder.fit_transform(X[cat_cols])

   # Step 4: Transform test categorical data
#X_test_cat_encoded = encoder.transform(X_test[cat_cols])

   # Step 5: Combine with numeric column
#X_encoded = np.hstack([X[num_cols].values, X_cat_encoded])
#X_test_encoded = np.hstack([X_test[num_cols].values, X_test_cat_encoded])


In [33]:
#TRAIN THE MODEL

from sklearn.linear_model import LinearRegression

model=LinearRegression()
model.fit(X_encoded, Y)
model.coef_

array([ 8.59567705e+02,  8.16406451e-01,  4.37043097e+03,  7.50995543e+02,
        7.27056243e+02, -2.11670457e+01,  8.65451199e+01,  3.25197023e+04,
        3.53457961e+04,  3.13166106e+04,  2.12400871e+04,  8.61394594e+03,
       -8.19846288e+03,  3.95627988e+03, -2.28517668e+03, -1.20567953e+05,
       -6.61327847e+04, -1.25321422e+05, -1.03988393e+05,  4.76088368e+04,
       -2.02430454e+04,  2.63017201e+04, -7.95896663e+03,  1.77897159e+04,
       -1.80253972e+04,  1.50687347e+04, -3.94925782e+03, -1.52433638e+04,
        6.53228093e+04,  7.28655318e+03, -5.58200018e+03,  6.15044060e+01,
        3.05688153e+03])

In [34]:
#PPREDICT THE AMOUNT

y_pred=model.predict(X_test_encoded)

In [35]:
y_pred

array([130011.44115214, 173369.89985078, 218763.20902759, ...,
       194534.99848049, 215540.16608728, 202861.31670313])

In [36]:

#EVALUATE THE MODEL



#R-SQUARE METHOD
from sklearn.metrics import r2_score

r2=r2_score(Y_test, y_pred)
print("R-Square: ", r2)


ValueError: Input contains NaN.