In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load and prepare data
SQFT_PER_ACRE = 43560  # conversion factor applied to rows whose unit is 'acre'

# Drop incomplete rows by assignment rather than in place, so the cell can be
# re-run without depending on previously mutated state.
data = pd.read_csv('train (1).csv').dropna()

# Convert acre-denominated lot sizes to square feet in one vectorized step
# (instead of a Python-level call per row); rows with any other unit keep
# their value unchanged.
is_acre = data["lot_size_units"] == 'acre'
data["lot_size"] = data["lot_size"].mask(is_acre, data["lot_size"] * SQFT_PER_ACRE)
data["lot_size_units"] = "sqft"

# Remove outliers with the IQR rule, then drop the columns the model does not use
def _drop_iqr_outliers(frame, column, whisker=1.5):
    """Return the rows of `frame` whose `column` value lies inside the IQR fences.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    whisker : float, default 1.5
        Multiple of the IQR added above Q3 / subtracted below Q1.

    Returns
    -------
    pandas.DataFrame
        Rows with Q1 - whisker * IQR <= value <= Q3 + whisker * IQR
        (both ends inclusive).
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    # `between` is inclusive on both ends by default, matching `>=` / `<=`.
    return frame[frame[column].between(q1 - whisker * iqr, q3 + whisker * iqr)]


# Order matters: the price fences are computed only on the rows that survived
# the lot_size filter.
df_no_outliers = _drop_iqr_outliers(data, "lot_size")
df_no_outliers = _drop_iqr_outliers(df_no_outliers, "price")

# Drop unnecessary columns. Assigning the result (instead of `inplace=True` on a
# filtered slice) avoids pandas' SettingWithCopyWarning.
df_no_outliers = df_no_outliers.drop(columns=["size_units", "lot_size_units", 'zip_code', 'size'])

# Split, then standardize
# Select the features by name rather than by position, so this does not rely on
# 'price' being the last column of the frame.
X = df_no_outliers.drop(columns="price")
y = df_no_outliers['price']

# Split the data into training and testing sets BEFORE scaling: fitting the
# scalers on the full dataset would leak the held-out rows' mean/variance into
# the training pipeline.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler_X = StandardScaler()
scaler_y = StandardScaler()

# Fit on the training split only; the test split is merely transformed.
X_train = scaler_X.fit_transform(X_train_raw)
X_test = scaler_X.transform(X_test_raw)

# StandardScaler expects 2-D input, so the target is reshaped to one column and
# flattened back to 1-D afterwards.
y_train = scaler_y.fit_transform(y_train_raw.to_numpy().reshape(-1, 1)).ravel()
y_test = scaler_y.transform(y_test_raw.to_numpy().reshape(-1, 1)).ravel()

# Fit a linear regression on the standardized training split
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split, then undo the target scaling so that both the
# predictions and the reference values are back on the original price scale
y_pred_standardized = model.predict(X_test)
y_pred = scaler_y.inverse_transform(y_pred_standardized[:, np.newaxis]).ravel()
y_test_original = scaler_y.inverse_transform(y_test[:, np.newaxis]).ravel()

# Score the de-standardized predictions
r2 = r2_score(y_test_original, y_pred)
mse = mean_squared_error(y_test_original, y_pred)

print(f"Mean Squared Error: {mse:.2f}", f"R-squared: {r2:.2f}", sep="\n")

# Take user input for predictions
# Column names the scaler recorded at fit time (available when it was fitted on
# a DataFrame). Reusing them avoids scikit-learn's "X does not have valid
# feature names" warning when transforming the user's values.
feature_names = getattr(scaler_X, "feature_names_in_", None)

# The interrupt handler wraps the whole loop so Ctrl-C (or end of input) exits
# cleanly from ANY prompt. Previously the continue-prompt lived in a `finally:`
# block, so it was still shown after an interrupt and a second interrupt there
# escaped uncaught.
try:
    while True:
        try:
            # Take user input
            beds = int(input("Enter the number of beds: "))
            baths = int(input("Enter the number of baths: "))
            lot_area = int(input("Enter the lot area (in sqft): "))
        except ValueError:
            # Only the int() conversions are guarded here, so this message is
            # never shown for a ValueError raised by the scaler or the model.
            print("Invalid input. Please enter numerical values.")
        else:
            try:
                # Create the input array for prediction and standardize it.
                # NOTE(review): assumes the model's features are, in order,
                # beds, baths and lot size — confirm against the training columns.
                user_input = np.array([[beds, baths, lot_area]])
                if feature_names is not None:
                    user_input = pd.DataFrame(user_input, columns=feature_names)
                user_input_standardized = scaler_X.transform(user_input)

                # Predict and de-standardize the price
                predicted_price_standardized = model.predict(user_input_standardized)
                predicted_price = scaler_y.inverse_transform(predicted_price_standardized.reshape(-1, 1)).ravel()

                print(f'Predicted price for {beds} beds, {baths} baths, {lot_area} sq ft lot area: ${predicted_price[0]:,.2f}')
            except Exception as e:
                print(f"An unexpected error occurred: {e}")

        # Any answer other than 'no' starts another round.
        more_input = input("Do you want to enter another set of values? (yes/no): ").strip().lower()
        if more_input == 'no':
            break
except (KeyboardInterrupt, EOFError):
    print("\nExiting...")


Mean Squared Error: 80204814579.60
R-squared: 0.31
Enter the number of beds: 3
Enter the number of baths: 3
Enter the lot area (in sqft): 6000




Predicted price for 3 beds, 3 baths, 6000 sq ft lot area: $1,021,432.92
Do you want to enter another set of values? (yes/no): no
