In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the dataset from 'kc_house_data.csv' (relative path, so the file must
# be in the current working directory).
df = pd.read_csv('kc_house_data.csv')

# Display basic info
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()

# Predictor columns fed to the model, and the column to be predicted.
features = [
    'sqft_living', 'bedrooms', 'bathrooms', 'floors', 'waterfront',
    'view', 'condition', 'grade', 'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
]
target = 'price'

# Discard every row that has a missing value in any column.
df = df.dropna()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# XGBoost regressor configured with a squared-error objective and a fixed seed.
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)

# Fit on the scaled training rows, then predict the held-out rows.
y_pred = xgb_reg.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions with each metric, in reporting order.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared Score: {r2}')

# Function to predict house price
def predict_house_price(input_data):
    """Predict the price of a single house with the trained model.

    The input is turned into a one-row DataFrame with the columns listed in
    the module-level ``features``, scaled with the fitted ``scaler`` and
    passed to ``xgb_reg``.

    Args:
        input_data: A dict mapping each name in ``features`` to its value.
            Extra keys are ignored. Non-dict inputs are handed to
            ``pd.DataFrame`` unchanged, exactly as before.

    Returns:
        The first (and only) element of the model's prediction array.

    Raises:
        ValueError: If ``input_data`` is a dict that lacks one or more of
            the required feature names.
    """
    if isinstance(input_data, dict):
        # Without this check pandas fills absent keys with NaN, and both the
        # scaler and the model accept NaN, so an incomplete record would
        # silently produce a prediction instead of an error.
        missing = [name for name in features if name not in input_data]
        if missing:
            raise ValueError(
                f'input_data is missing required feature(s): {", ".join(missing)}'
            )

    input_df = pd.DataFrame([input_data], columns=features)
    input_scaled = scaler.transform(input_df)
    prediction = xgb_reg.predict(input_scaled)
    return prediction[0]

# Example: one hand-written house record, keyed by feature name.
sample_input = dict(
    sqft_living=2500,
    bedrooms=3,
    bathrooms=2,
    floors=1,
    waterfront=0,
    view=1,
    condition=3,
    grade=7,
    sqft_above=1800,
    sqft_basement=700,
    yr_built=1995,
    yr_renovated=0,
    zipcode=98052,
    lat=47.6219,
    long=-122.3190,
)

# Run the record through the prediction helper and report the result.
predicted_price = predict_house_price(sample_input)
print(f'Predicted House Price: {predicted_price}')


           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0         3       1.00         1180   
1  6414100192  20141209T000000  538000.0         3       2.25         2570   
2  5631500400  20150225T000000  180000.0         2       1.00          770   
3  2487200875  20141209T000000  604000.0         4       3.00         1960   
4  1954400510  20150218T000000  510000.0         3       2.00         1680   

   sqft_lot  floors  waterfront  view  ...  grade  sqft_above  sqft_basement  \
0      5650     1.0           0     0  ...      7        1180              0   
1      7242     2.0           0     0  ...      7        2170            400   
2     10000     1.0           0     0  ...      6         770              0   
3      5000     1.0           0     0  ...      7        1050            910   
4      8080     1.0           0     0  ...      8        1680              0   

   yr_built  yr_renovated  zipcode      lat     lo