# **Importing Required Libraries**

In [1]:
import pandas as pd
import numpy as np
import zipfile
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# **Unzip the dataset**

In [2]:
# Unpack the competition archive into a local working directory.
# NOTE(review): the archive path is an absolute '/content/...' location,
# so this cell only works where the zip has been placed there.
archive_path = '/content/house-prices-advanced-regression-techniques.zip'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall('house_prices_data')

# **Load dataset**

In [3]:
# Read the training CSV from the directory the archive was extracted into.
train_csv_path = os.path.join('house_prices_data', 'train.csv')
df = pd.read_csv(train_csv_path)

# **Display the first few rows of the dataset**

In [4]:
# Text preview of the first five rows (5 is the default for head()).
print(df.head(n=5))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

# **Select relevant features**

In [5]:
# Column the model predicts.
target = 'SalePrice'

# Predictor columns fed to the regression, in the order the coefficients
# are reported later.
features = [
    'GrLivArea',
    'BedroomAbvGr',
    'FullBath',
]

# **Handle missing values (if any)**

In [6]:
# Keep only the predictor columns plus the target, then drop every row that
# has a missing value in any of them. This rebinds `df`, so the other
# columns of the loaded frame are no longer reachable through this name.
df = df.loc[:, [*features, target]].dropna()

# **Separate features and target variable**

In [7]:
# Split the cleaned frame into the predictor matrix and the target series.
X, y = df[features], df[target]


# **Split data into training and testing sets**

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Initialize and train the linear regression model**

In [9]:
# Ordinary least-squares regression with default settings, fitted on the
# training split only.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

# **Make predictions**

In [10]:
# Predict on both splits so training and held-out error can be compared.
y_pred_train, y_pred_test = (
    model.predict(X_train),
    model.predict(X_test),
)

# **Evaluate the model**

In [11]:
# Mean squared error and R^2 for the training split ...
mse_train, r2_train = (
    mean_squared_error(y_train, y_pred_train),
    r2_score(y_train, y_pred_train),
)
# ... and for the held-out split.
mse_test, r2_test = (
    mean_squared_error(y_test, y_pred_test),
    r2_score(y_test, y_pred_test),
)

# Report all four metrics rounded to two decimals.
print(f'Training MSE: {mse_train:.2f}')
print(f'Training R2: {r2_train:.2f}')
print(f'Testing MSE: {mse_test:.2f}')
print(f'Testing R2: {r2_test:.2f}')

Training MSE: 2593336031.32
Training R2: 0.57
Testing MSE: 2806426667.25
Testing R2: 0.63


# **Output the model coefficients**

In [12]:
# Show the fitted intercept, then one coefficient per feature in a
# single-column table indexed by feature name.
print('Model Coefficients:')
print(f'Intercept: {model.intercept_}')
coeff_df = pd.DataFrame({'Coefficient': model.coef_}, index=features)
print(coeff_df)

Model Coefficients:
Intercept: 52261.74862694461
               Coefficient
GrLivArea       104.026307
BedroomAbvGr -26655.165357
FullBath      30014.324109


# **Load test dataset**

In [13]:
# Read the test CSV from the extracted archive directory.
test_csv_path = os.path.join('house_prices_data', 'test.csv')
test_df = pd.read_csv(test_csv_path)

# **Display the first few rows of the test dataset**

In [14]:
# Heading line followed by a text preview of the first five test rows.
print("\nTest data preview:", test_df.head(n=5), sep="\n")


Test data preview:
     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   
4  1465         120       RL         43.0     5005   Pave   NaN      IR1   

  LandContour Utilities  ... ScreenPorch PoolArea PoolQC  Fence MiscFeature  \
0         Lvl    AllPub  ...         120        0    NaN  MnPrv         NaN   
1         Lvl    AllPub  ...           0        0    NaN    NaN        Gar2   
2         Lvl    AllPub  ...           0        0    NaN  MnPrv         NaN   
3         Lvl    AllPub  ...           0        0    NaN    NaN         NaN   
4         HLS    AllPub  ...         144        0    NaN    NaN         NaN   

  MiscVal MoSold  YrSold  SaleType  SaleConditio

# **Handle missing values (if any) in the test set**

In [15]:
# Restrict the test frame to the model's predictor columns and drop rows
# with a missing value in any of them.
# NOTE: this selection discards every other column, including 'Id', and any
# row removed here gets no prediction. The surviving rows keep their
# original index labels.
test_df = test_df.loc[:, features].dropna(axis=0)

# **Separate features in the test set**

In [16]:
# Predictor matrix for the test CSV.
# NOTE(review): this rebinds `X_test`, which until now held the held-out part
# of the train/test split. Re-running the earlier prediction/evaluation cells
# after this point would pair these rows with `y_test` from that split.
X_test = test_df.loc[:, features]

# **Make predictions on the test set**

In [17]:
# Apply the fitted model to the test-file predictors.
test_predictions = model.predict(X=X_test)

# **Create a DataFrame to hold the predictions**

In [18]:
# Build the Id/SalePrice table for the predictions.
#
# `test_df` was reduced to the feature columns only, so it no longer carries
# the dataset's 'Id' column, and its index is just the 0-based row position
# assigned by read_csv (the test preview shows Ids starting at 1461, not 0).
# Using `test_df.index` as the Id therefore labels every prediction wrongly.
# Re-read the 'Id' column from the test CSV and pick the entries whose row
# labels survived in `test_df`, so each prediction is paired with the real
# Id even if dropna() removed rows.
test_ids = pd.read_csv(
    os.path.join('house_prices_data', 'test.csv'),
    usecols=['Id'],
)['Id']

predictions_df = pd.DataFrame({
    # .to_numpy() keeps the result on a plain 0..n-1 index.
    'Id': test_ids.loc[test_df.index].to_numpy(),
    'SalePrice': test_predictions
})

# **Save the predictions to a CSV file**

In [19]:
# Write the predictions table to disk without the DataFrame index column,
# then confirm on stdout.
output_csv = 'house_price_predictions.csv'
predictions_df.to_csv(output_csv, index=False)

print("\nPredictions saved to 'house_price_predictions.csv'")


Predictions saved to 'house_price_predictions.csv'
