In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [None]:
# Read the training CSV into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/train.csv')
df.head(n=5)


In [None]:
# Print a per-column summary of `df` (dtype and non-null count for each column).
df.info()




In [None]:
# Show summary statistics for `df`; with default arguments pandas reports
# only the numeric columns of a mixed-dtype frame.
df.describe()


In [None]:
# Count the missing entries in every column, then list the ten columns
# with the highest counts (largest first).
missing_counts = df.isna().sum()
missing_counts.sort_values(ascending=False).head(10)


In [None]:
# Histogram of the 'SalePrice' column with a kernel-density overlay.
# The Axes returned by seaborn is titled directly instead of going through
# the pyplot "current axes" state.
ax = sns.histplot(df['SalePrice'], kde=True)
ax.set_title("Distribution of House Prices")


In [None]:
# Keep only the int64/float64 columns of `df` (as an independent copy) and
# discard the 'Id' column.
numeric = (
    df.select_dtypes(include=['int64', 'float64'])
    .copy()
    .drop(columns=['Id'])
)

# 'SalePrice' is the regression target; every other numeric column is a predictor.
target = numeric['SalePrice']
predictors = numeric.drop(columns=['SalePrice'])

# Replace each predictor's missing entries with that column's median.
# NOTE(review): the medians here are computed over every row of `predictors`,
# i.e. before any train/test split is made.
features = predictors.fillna(predictors.median())


In [None]:
# Split the *un-imputed* predictors taken from `numeric` rather than `features`:
# `features` was filled with medians computed over every row, so splitting it
# would let statistics of the held-out rows shape the training inputs
# (train/test leakage). The rows and their order are the same as in `features`,
# so with the same `random_state` each row lands in the same part as before.
raw_features = numeric.drop(columns=['SalePrice'])
X_train, X_test, y_train, y_test = train_test_split(
    raw_features, target, test_size=0.2, random_state=42
)

# Learn the fill values from the training rows only, then apply those same
# values to both parts so the test set is imputed exactly like unseen data.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)


In [None]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score on the held-out split with root-mean-squared error.
# The `squared=False` argument of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the square root is taken explicitly;
# this form works on every scikit-learn version.
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
rmse


In [None]:
# Label the fitted model's importance scores with the feature column names.
importances = pd.Series(model.feature_importances_, index=features.columns)

# Keep the 20 highest-scoring features and draw them as horizontal bars.
top_importances = importances.sort_values(ascending=False).head(20)
ax = top_importances.plot.barh(figsize=(8, 6))
ax.set_title("Top 20 Important Features")


In [None]:
# Persist the fitted model to disk with joblib.
# The target directory is created first so the dump does not fail on a fresh
# checkout where '../models' does not exist yet.
# NOTE(review): "hose" in the file name looks like a typo for "house"; it is
# left unchanged because other code may load the artifact by this exact name.
model_path = Path('../models/model_firstdata_hose_prices.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)
