In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import joblib

In [2]:
# Load the datasets
file_path =  "../Res/train.csv"
df = pd.read_csv(file_path)
accepted_values = ["OverallQual","GrLivArea","TotalBsmtSF","GarageCars","BedroomAbvGr","FullBath","YearBuilt","Neighborhood","SalePrice"]
df = df[accepted_values]

In [41]:
print("NaN counts in selected columns:")
df.head()


NaN counts in selected columns:


Unnamed: 0,OverallQual,GrLivArea,TotalBsmtSF,GarageCars,BedroomAbvGr,FullBath,YearBuilt,Neighborhood,SalePrice
0,7,1710,856,2,3,2,2003,CollgCr,208500
1,6,1262,1262,2,3,2,1976,Veenker,181500
2,7,1786,920,2,3,2,2001,CollgCr,223500
3,7,1717,756,3,3,1,1915,Crawfor,140000
4,8,2198,1145,3,4,2,2000,NoRidge,250000


In [4]:
X = df.drop('SalePrice',axis =1)
y = df['SalePrice']

In [5]:
numerical_cols = ["OverallQual","GrLivArea","TotalBsmtSF","GarageCars",
                  "BedroomAbvGr","FullBath","YearBuilt"]
categorical_cols = ["Neighborhood"]

In [6]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()) #Scaling happens here.
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, numerical_cols)
    ])


In [8]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42,)

In [9]:
# Train and test the three specified models:
# Log reg
# Random Forest classifier
# Support vector Regressor
models = {"LogReg": LogisticRegression(max_iter=5),"SVM":SVC(),"Random Forest":RandomForestRegressor(n_estimators=300, random_state=42)}

results = {}

for name, model in models.items():
    print(f"\n=== {name} ===")

    
    clf = Pipeline(steps =[("preprocessor",preprocessor),("model",model)])
    clf.fit(X_train,y_train)
    predictions = clf.predict(X_test)

    

    print("MAE:", mean_absolute_error(y_test, predictions))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, predictions)))
    print("R²:", r2_score(y_test, predictions))

    joblib.dump(clf, "./model.pkl")

    # Storing the results from the model evaluation:
    results[name] = {
        'MAE': mean_absolute_error(y_test, predictions),
        'RMSE': np.sqrt(mean_squared_error(y_test, predictions)),
        'R2': r2_score(y_test, predictions)
    }


=== LogReg ===


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


MAE: 27887.445205479453
RMSE: 41669.24819212917
R²: 0.7736307168696013

=== SVM ===
MAE: 34474.599315068495
RMSE: 64224.377794673994
R²: 0.46224311825902686

=== Random Forest ===
MAE: 19348.92373488802
RMSE: 31979.280040905942
R²: 0.866671335180447


In [28]:

test_data = pd.DataFrame({'OverallQual':[9],"GrLivArea":[1080],"TotalBsmtSF":[656],"GarageCars":[1],"BedroomAbvGr":[2],"FullBath":[2],"YearBuilt":[2000],"Neighborhood":['NAmes']})
df["YearBuilt"].value_counts(sort=True,ascending=True)

YearBuilt
1872     1
1905     1
1917     1
1911     1
1906     1
        ..
2003    45
2007    49
2004    54
2005    64
2006    67
Name: count, Length: 112, dtype: int64

In [11]:
model = joblib.load('model.pkl')

prediction = model.predict(test_data)

In [12]:
print(f"Predicted House Price: ${prediction[0]}")

Predicted House Price: $244108.89666666667
