In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the house-price CSV into a DataFrame
file_path = 'kc_house_data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Last expression of the cell: summary statistics shown as the cell output
data.describe()

In [None]:
# Count missing values per column.
null_counts = data.isna().sum()

# Print the label on its own line so the first row of the Series stays aligned
# with the rest of the per-column counts.
print("Number of Null items in each column:")
print(null_counts)

In [None]:
# Collapse repeated ids: every id keeps a single row whose price is the mean
# price over all rows sharing that id.

# Mean price per id, as a two-column frame (id, price)
average_prices = data.groupby('id', as_index=False)['price'].mean()

# Attach the per-id mean to every original row as 'price_avg'
merged_data = data.merge(average_prices, on='id', suffixes=('', '_avg'))

# Overwrite 'price' with the mean and discard the helper column
merged_data = merged_data.assign(price=merged_data['price_avg']).drop(columns=['price_avg'])

# Keep only the first row for each id
final_data = merged_data.drop_duplicates(subset='id')

In [None]:
# Report, column by column, whether any value occurs more than once
for column in list(final_data.columns):
    column_values = final_data[column]
    duplicates = column_values.duplicated().any()
    print(f"Column '{column}' has duplicates: {duplicates}")

In [None]:
# Candidate bin counts to compare visually: 10, 20, ..., 100
possible_bins = list(range(10, 101, 10))

# Draw one price histogram (with KDE overlay) per candidate bin count
prices = final_data['price']
for bins in possible_bins:
    bin_edges = np.histogram_bin_edges(prices, bins=bins)
    sns.histplot(prices, bins=bin_edges, kde=True)
    plt.title(f'Histogram with {bins} bins')
    plt.show()

In [None]:
# Label every row with the index (1..selected_bins) of the equal-width price
# bin it falls into.
selected_bins = 60  # Example value
bin_edges = np.histogram_bin_edges(final_data['price'], bins=selected_bins)

# Digitize against the interior edges only. The outermost edges equal the
# min/max price, so digitizing against the full edge array would put the
# maximum price into an extra bin (selected_bins + 1) of its own. Adding 1
# keeps the labels 1-based, matching what the full-edge call produced for
# every other row.
bin_labels = np.digitize(final_data['price'], bins=bin_edges[1:-1]) + 1

# assign() builds a new frame rather than writing into the result of
# drop_duplicates(), which avoids SettingWithCopyWarning and makes the cell
# safe to re-run.
final_data = final_data.assign(bin=bin_labels)

In [None]:
# Histogram (50 equal-width bins, KDE overlay) of the de-duplicated prices
price_data = final_data['price'].values
hist_bins = np.histogram_bin_edges(price_data, bins=50)

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(price_data, bins=hist_bins, kde=True, ax=ax)
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Home Prices')
plt.show()

In [None]:
# One-hot encode the categorical columns (first level dropped as the baseline).
categorical_columns = ['waterfront', 'condition']

# Encode the de-duplicated frame, not the raw `data`: encoding `data` would
# silently throw away the per-id price averaging / duplicate removal done
# earlier. The helper 'bin' column is derived from the target price, so it is
# removed here (if present) to keep it out of the model features.
# NOTE: this cell rebinds `final_data`; run it once after the de-duplication
# cell (re-running it alone fails because the categorical columns are gone).
encoding_input = final_data.drop(columns=['bin'], errors='ignore')

# Create dummy variables
final_data = pd.get_dummies(encoding_input, columns=categorical_columns, drop_first=True)

In [None]:
# Random ~75% / ~25% split of the rows into set1 (train) and set2 (test).
# A single random draw per row decides membership, so the two sets are
# disjoint and together cover every row. Drawing separately for each set
# would let rows land in both sets (train/test leakage) or in neither.
# The generator is seeded so that Restart & Run All reproduces the same split.
split_rng = np.random.default_rng(42)
in_set1 = split_rng.random(len(final_data)) < 0.75

set1 = final_data[in_set1]
set2 = final_data[~in_set1]

# Sanity check: the split is a true partition of final_data
assert len(set1) + len(set2) == len(final_data)

print(f'Set 1 size: {len(set1)} records')
print(f'Set 2 size: {len(set2)} records')

In [None]:
# Columns to standardize
numeric_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'view',
    'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'lat', 'long',
]

# Fit the scaler on set1 only, so set2 is transformed with set1's statistics
scaler = StandardScaler()
scaler.fit(set1[numeric_columns])

# Work on copies so set1 / set2 themselves stay unscaled
set1_scaled, set2_scaled = set1.copy(), set2.copy()
set1_scaled[numeric_columns] = scaler.transform(set1[numeric_columns])
set2_scaled[numeric_columns] = scaler.transform(set2[numeric_columns])


In [None]:
# Columns that cannot be fed to the regressor as-is
non_numeric_columns = ['date']  # Add any other non-numeric columns as needed

# Features are everything except the target and the non-numeric columns
X_train = set1.drop(columns=['price'] + non_numeric_columns)
X_test = set2.drop(columns=['price'] + non_numeric_columns)
y_train = set1['price']
y_test = set2['price']

# Fit ordinary least squares on set1, then predict set2
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE: square root of the mean squared error on set2
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# MAPE: mean of |true - predicted| / true, expressed in percent
relative_error = (y_test - y_pred) / y_test
mape = np.mean(np.abs(relative_error)) * 100

# Scatter of predicted vs. true prices for set2
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel('True Prices (Set2)')
ax.set_ylabel('Predicted Prices (Set2)')
ax.set_title('Predicted Prices vs. True Prices (Set2)')
plt.show()

# Report both error metrics
print(f'RMSE: {rmse:.2f}')
print(f'MAPE: {mape:.2f}%')

In [None]:
# Hyperparameter search for an SGD-based linear regressor: train on set1,
# hold out set2 for the final evaluation.

# Identify and exclude non-numeric columns
non_numeric_columns = ['date']  # Add any other non-numeric columns as needed

X_train = set1.drop(columns=['price'] + non_numeric_columns)
y_train = set1['price']

X_test = set2.drop(columns=['price'] + non_numeric_columns)
y_test = set2['price']

# Standardize every feature. The scaler is fitted on the training set only and
# then applied unchanged to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Grid of hyperparameters to search.
# 'squared_error' is the current name of the least-squares loss: the old
# spelling 'squared_loss' was deprecated in scikit-learn 1.0 and removed in
# 1.2, where it makes every fit in the grid search fail.
# (On scikit-learn < 1.0 the old spelling is still required.)
param_grid = {
    'loss': ['squared_error', 'huber'],
    'alpha': [0.001, 0.01],
    'epsilon': [0.1, 0.2, 0.3],
}

# Base estimator; random_state fixes the shuffling so runs are repeatable
sgd_regressor = SGDRegressor(max_iter=1000, random_state=42)

# 3-fold cross-validated grid search, scored by negative MSE, using all cores
grid_search = GridSearchCV(estimator=sgd_regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best hyperparameter combination found by the search
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

In [None]:
# Refit an SGD regressor with the tuned hyperparameters on set1
# (note: max_iter is 5000 here)
best_sgd_regressor = SGDRegressor(max_iter=5000, random_state=42, **best_params)
best_sgd_regressor.fit(X_train_scaled, y_train)

# Predictions for the held-out set2
y_pred = best_sgd_regressor.predict(X_test_scaled)

# RMSE: square root of the mean squared error on set2
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# MAPE: mean of |true - predicted| / true, expressed in percent
relative_error = (y_test - y_pred) / y_test
mape = np.mean(np.abs(relative_error)) * 100

# Scatter of predicted vs. true prices for set2
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel('True Prices (Set2)')
ax.set_ylabel('Predicted Prices (Set2)')
ax.set_title('Predicted Prices vs. True Prices (Set2)')
plt.show()

# Report the chosen hyperparameters and the error metrics
print(f'Best Hyperparameters: {best_params}')
print(f'RMSE: {rmse:.2f}')
print(f'MAPE: {mape:.2f}%