In [2]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Load the training set from the working directory; `data` is the frame the
# following cells build on.
data = pd.read_csv('train.csv')
# Bare trailing expression so the notebook renders the first five rows.
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Stamp every row with a synthetic date by cycling through a 10-day window
# that starts on 2024-04-01, then keep the rows dated on or before 2024-04-08.
date_range = pd.date_range(start='2024-04-01', periods=10, freq='D')

n_rows = len(data)
n_repeats = int(np.ceil(n_rows / len(date_range)))
cycled_dates = np.tile(date_range, n_repeats)[:n_rows]  # trim the overshoot of the last cycle

data['Date'] = cycled_dates
data['Date'] = data['Date'].astype(str)

# The column holds strings at this point, so the cutoff is a plain string comparison.
on_or_before_cutoff = data['Date'] <= '2024-04-08'
filtered_data = data.loc[on_or_before_cutoff]

(len(filtered_data), len(data))


(1168, 1460)

In [5]:
# Write one CSV per remaining day (2024-04-09 and 2024-04-10), restricted to
# the columns listed in `cols`.
cols = [
    'OverallQual',
    'GrLivArea',
    '2ndFlrSF',
    'TotalBsmtSF',
    'GarageCars',
    'Date',
    'SalePrice',
]

is_apr_09 = data['Date'] == '2024-04-09'
is_apr_10 = data['Date'] == '2024-04-10'

example_data1 = data.loc[is_apr_09, cols]
example_data2 = data.loc[is_apr_10, cols]

# index=False keeps the DataFrame index out of the files.
example_data1.to_csv('2024-04-09.csv', index=False)
example_data2.to_csv('2024-04-10.csv', index=False)


In [11]:
import sqlite3


def create_table(db_path):
    """Create the ``house_data`` table in a SQLite database if it is missing.

    Parameters
    ----------
    db_path : str or path-like
        Database location handed to ``sqlite3.connect``.

    Notes
    -----
    The statement is ``CREATE TABLE IF NOT EXISTS``, so calling this again on
    a database that already has the table leaves the table and its rows
    untouched.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("""
        CREATE TABLE IF NOT EXISTS house_data (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            overall_qual INTEGER,
            gr_liv_area INTEGER,
            second_flr_sf INTEGER,
            total_bsmt_sf INTEGER,
            garage_cars INTEGER,
            price FLOAT,
            date DATE
        );
    """)
        conn.commit()
    finally:
        # Always release the connection, even when the DDL statement fails;
        # previously an exception left it open.
        conn.close()

def insert_data_to_db(df, db_path):
    """Append the rows of ``df`` to the ``house_data`` table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``OverallQual``, ``GrLivArea``, ``2ndFlrSF``,
        ``TotalBsmtSF``, ``GarageCars``, ``Date`` and ``SalePrice``; any other
        columns are ignored.
    db_path : str or path-like
        Database location handed to ``sqlite3.connect``. The ``house_data``
        table must already exist.

    Raises
    ------
    KeyError
        If ``df`` has rows but lacks one of the required columns.
    sqlite3.Error
        If the insert fails (for example because the table is missing). The
        batch is not committed in that case.
    """
    # Order matches the column list of the INSERT statement below.
    source_columns = ['OverallQual', 'GrLivArea', '2ndFlrSF', 'TotalBsmtSF',
                      'GarageCars', 'Date', 'SalePrice']

    if len(df) == 0:
        # Nothing to insert; skip column selection so a row-less frame is
        # accepted regardless of which columns it carries.
        records = []
    else:
        # itertuples iterates column-wise and hands back plain Python scalars
        # (per the pandas docs), which sqlite3 binds as INTEGER/REAL/TEXT.
        records = list(df[source_columns].itertuples(index=False, name=None))

    conn = sqlite3.connect(db_path)
    try:
        # One batched statement instead of a Python-level loop of execute() calls.
        conn.executemany("""
            INSERT INTO house_data (overall_qual, gr_liv_area, second_flr_sf, total_bsmt_sf, garage_cars, date, price)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        """, records)
        conn.commit()
    finally:
        # Close on every path; closing without a commit discards a failed batch.
        conn.close()

# SQLite file that receives the rows; the path is relative to the working directory.
db_path = 'project.db'

# Make sure the table exists, then append the rows held in `filtered_data`.
# NOTE(review): insert_data_to_db only issues INSERTs, so re-running this cell
# looks like it would store the same rows a second time - confirm that is intended.
create_table(db_path)
insert_data_to_db(filtered_data, db_path)


In [6]:
# Keep only numeric columns and drop every row that still has a missing value.
# NOTE: this rebinds `data`, so the string 'Date' column is no longer available
# on `data` after this cell has run.
numeric_only = data.select_dtypes(include=[np.number])
data = numeric_only.dropna()

# Target is the sale price; every remaining numeric column is used as a feature.
y = data['SalePrice']
X = data.drop(columns='SalePrice')

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Fit a forest on all features and rank the features by their importance.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
forest.fit(X_train, y_train)

feature_importances = pd.DataFrame(
    {'importance': forest.feature_importances_},
    index=X_train.columns,
)
feature_importances = feature_importances.sort_values('importance', ascending=False)

# Column labels of the five highest-ranked features, and the matching
# narrowed views of the train and test matrices.
important_features = feature_importances.index[:5]
X_train_imp = X_train.loc[:, important_features]
X_test_imp = X_test.loc[:, important_features]
print(important_features)


Index(['OverallQual', 'GrLivArea', '2ndFlrSF', 'TotalBsmtSF', 'GarageCars'], dtype='object')


In [34]:
# Train the regressor on the full feature matrix and score it on the hold-out set.
# NOTE(review): X_train_imp / X_test_imp from the feature-selection cell are
# not used here - confirm whether the model was meant to be fit on them.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Root of the mean squared error, i.e. the error in the target's own units.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

# For a regressor, .score() reports the coefficient of determination (R^2).
r2_score = model.score(X_test, y_test)
print("R^2 score:", r2_score)

RMSE: 31597.127520378705
R^2 score: 0.873419825240489


In [41]:
import pickle

# Serialise the fitted regressor to ./model.pkl so it can be reloaded
# without retraining.
with open('./model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [42]:
# IPython shell escape: list the working directory to see which files are present.
!ls

application             infrastructure          service_layer
domain                  model.pkl               train.csv
house_price.ipynb       random_forest_model.pkl


In [37]:
# Round-trip check: reload the regressor from './model.pkl', the file the save
# cell writes, and predict with it. Loading 'random_forest_model.pkl' instead
# would exercise an artifact this notebook never produces.
# Only unpickle files you created yourself - pickle.load can run arbitrary code.
with open('./model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

y_pred_loaded = loaded_model.predict(X_test)