# 1. Create model & train

In [None]:
from sklearn.linear_model import SGDRegressor
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error


import sys
print(sys.executable)

# Cleaned training features; rows are indexed by the 'Id' column of the CSV.
X_train = pd.read_csv(
    'data/X_train_cleaned.csv',
    index_col='Id',
)

# Training target. squeeze() collapses a single-column frame into a Series,
# which is the 1-D shape the regressor's fit() is given below.
# NOTE(review): this only yields a Series if the CSV holds exactly one
# column -- confirm how data/y_train.csv was written.
y_train_frame = pd.read_csv('data/y_train.csv')
y_train = y_train_frame.squeeze()

# Quick sanity check of what was actually loaded.
print(f"type(y_train): {type(y_train)}")
print(f"y_train.shape: {y_train.shape}")
print(f"y_train values:\n{y_train.head()}")


# Hyperparameters for the linear model trained with stochastic gradient descent.
sgd_hyperparams = dict(
    loss='squared_error',  # ordinary least-squares objective
    alpha=0.001,           # regularization strength
    shuffle=True,          # reshuffle the training rows between epochs
    max_iter=20000,        # upper bound on the number of epochs
    verbose=1,             # print per-epoch progress while fitting
    tol=1e-5,              # stopping tolerance on the loss improvement
    random_state=42,       # fixed seed so the shuffling is reproducible
)
sgd_model = SGDRegressor(**sgd_hyperparams)

# Fit on the full training set.
sgd_model.fit(X_train, y_train)

# score() is evaluated on the same data the model was fitted on, so this
# describes fit quality rather than performance on unseen data.
r2_score = sgd_model.score(X_train, y_train)
print(f"R² score: {r2_score:.4f}")

# Pair every training column with its learned linear coefficient.
learned_coefficients = sgd_model.coef_.flatten()
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': learned_coefficients,
})

# Rank by magnitude so strongly negative and strongly positive weights are
# treated alike.
# NOTE(review): magnitudes are only comparable across features if the inputs
# share a common scale -- confirm against the cleaning step.
feature_importance['Abs_Coefficient'] = feature_importance['Coefficient'].abs()
ranked_features = feature_importance.sort_values(
    by='Abs_Coefficient',
    ascending=False,
)
top_features = ranked_features.iloc[:10]

# Uncomment to inspect the ranking or the estimator class:
# print(top_features)
# print(f"Model type {sgd_model.__class__.__name__}")


# 2. Evaluate model on training data


In [None]:
# Predict on the same rows the model was fitted on. These numbers measure how
# well the model fits the training data, not how well it generalizes.
y_pred_log = sgd_model.predict(X_train)
rmse_log = np.sqrt(mean_squared_error(y_train, y_pred_log))
print(f"RMSE (log scale): {rmse_log:.4f}")

# Map targets and predictions back to the original price scale so the
# remaining metrics are expressed in dollars.
# NOTE(review): expm1 assumes the stored target is log1p(price) -- confirm
# against the preprocessing that produced data/y_train.csv.
y_train_actual = np.expm1(y_train)
y_pred_actual = np.expm1(y_pred_log)

rmse_actual = np.sqrt(mean_squared_error(y_train_actual, y_pred_actual))
print(f"RMSE (actual $): ${rmse_actual:.2f}")

# Mean Absolute Percentage Error: the average relative miss, which is easier
# to interpret than a dollar RMSE because it does not depend on price level.
mape = np.mean(np.abs((y_train_actual - y_pred_actual) / y_train_actual)) * 100
print(f"Mean Absolute Percentage Error: {mape:.2f}%")

import matplotlib.pyplot as plt

# --- Predicted vs. actual prices -------------------------------------------
# Points on the dashed red diagonal are perfect predictions; the diagonal
# spans the observed range of actual prices.
price_lo = y_train_actual.min()
price_hi = y_train_actual.max()

fit_fig, fit_ax = plt.subplots(figsize=(10, 6))
fit_ax.scatter(y_train_actual, y_pred_actual, alpha=0.5)
fit_ax.plot([price_lo, price_hi], [price_lo, price_hi], 'r--', lw=2)
fit_ax.set_xlabel('Actual Prices')
fit_ax.set_ylabel('Predicted Prices')
fit_ax.set_title('Predicted vs Actual House Prices')
plt.show()

# --- Residuals vs. predicted prices ----------------------------------------
# Optional check: structure around the zero line (a funnel, a curve) would
# point at something the linear model is not capturing.
residuals = y_train_actual - y_pred_actual

resid_fig, resid_ax = plt.subplots(figsize=(10, 6))
resid_ax.scatter(y_pred_actual, residuals, alpha=0.5)
resid_ax.axhline(y=0, color='r', linestyle='--')
resid_ax.set_xlabel('Predicted Prices')
resid_ax.set_ylabel('Residuals')
resid_ax.set_title('Residual Plot')
plt.show()

# 3. Use model & make predictions

In [None]:
# matplotlib.pyplot is imported here as well: the histogram at the end of this
# cell uses `plt`, and it should not depend on the optional evaluation cell
# having been run first.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter


# Cleaned test features, indexed by 'Id' like the training features.
test_df = pd.read_csv('data/X_test_cleaned.csv', index_col='Id')

# Predictions come out on the scale the model was trained on.
predict_prices = sgd_model.predict(test_df).flatten()
print(f"predict_prices.shape: {predict_prices.shape}")

# Convert back to prices with the inverse transform used for the training
# targets in the evaluation cell.
predict_prices_actual = np.expm1(predict_prices)
print(f"Example of prices: \n{predict_prices_actual}")

# Range check: a wildly large or near-zero value here usually means the test
# features do not match what the model was trained on.
print("Min price:", predict_prices_actual.min())
print("Max price:", predict_prices_actual.max())
print("Mean price:", predict_prices_actual.mean())

# print 

# Format function to convert to $xxx,xxx format
def currency_formatter(x, pos):
    """Return tick value *x* as a whole-dollar label such as ``$123,456``.

    The two-argument signature is what ``FuncFormatter`` calls; *pos* (the
    tick position) is accepted but not used.
    """
    whole_dollars = format(x, ',.0f')
    return '$' + whole_dollars


# Histogram (with a KDE overlay) of the predicted prices, drawn on an
# explicit figure/axes pair.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(predict_prices_actual, kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of Predicted House Prices')
hist_ax.set_xlabel('Price')
hist_ax.set_ylabel('Frequency')

# Label x-axis ticks as $xxx,xxx instead of raw floats.
hist_ax.xaxis.set_major_formatter(FuncFormatter(currency_formatter))
hist_fig.tight_layout()
plt.show()





# 4. Create submission file

In [183]:
# Two-column submission table: the test-set Id next to its predicted price
# on the original (dollar) scale.
submission_columns = {
    'Id': test_df.index,
    'SalePrice': predict_prices_actual,
}
submission_df = pd.DataFrame(submission_columns)

# index=False keeps pandas' row index out of the file, so the CSV contains
# only the Id and SalePrice columns.
submission_df.to_csv('data/submission.csv', index=False)