In [63]:
# Import required libraries
import os
import pandas as pd
import joblib # for saving or loading models
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Absolute path to the training CSV. It is machine-specific; adjust it when
# running on another computer.
data_path = r"C:\Users\Sahil Raj\Downloads\house-prices-advanced-regression-techniques\train.csv"

# Nothing below can run without the file, so a missing file is only reported
# and the rest of the cell is skipped.
if not os.path.exists(data_path):
    print(f"The dataset file`{data_path}`was not found Please ensure the data file is uploaded")
else:
    train_df = pd.read_csv(data_path)

    # Fail early, naming the absent columns, rather than hitting a KeyError
    # part-way through feature selection.
    required_columns = {'GrLivArea', 'BedroomAbvGr', 'FullBath', 'HalfBath', 'LotArea', 'YearBuilt', 'OverallQual', 'SalePrice'}
    missing_columns = required_columns - set(train_df.columns)
    if missing_columns:
        raise ValueError(f"Missing Required Columns in dataset: {missing_columns}")

    # Engineered feature: total bathroom count. A half bath is counted the
    # same as a full bath here (plain sum of the two columns).
    train_df['TotalBath'] = train_df['FullBath'] + train_df['HalfBath']
    X = train_df[['GrLivArea', 'BedroomAbvGr', 'TotalBath', 'LotArea', 'YearBuilt', 'OverallQual']]
    y = train_df['SalePrice']

    # Hold out 20% of the rows for evaluation; the fixed random_state keeps
    # the split (and therefore the reported metrics) reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit an ordinary least-squares model on the training portion.
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score on the held-out rows only.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"mean_squared_error: {mse}")
    print(f"r2_score: {r2}")

    # One coefficient per feature, in the same order as the columns of X.
    print("coefficient:", model.coef_)
    # NOTE: the label reads "intersect"; the value printed is the fitted
    # intercept (model.intercept_).
    print("intersect:", model.intercept_)

    # Persist the fitted model to the current working directory.
    joblib.dump(model, "house_price_model.pkl")
    print("Model saved as house_price_model.pkl")


    # print(f"Price of the LivinArea is:", '$',model.coef_[0].round(2),"persqfoot")
    # print(f"")



mean_squared_error: 1663130345.08791
r2_score: 0.7831734531017012
coefficient: [ 7.21466493e+01 -8.31421464e+03 -8.46263361e+03  7.82433144e-01
  5.85690848e+02  2.33735808e+04]
intersect: -1193970.0454682729
Model saved as house_price_model.pkl
