In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ------------------------------
# Step 1: Load and Explore the Data
# ------------------------------
# Load the dataset from ifAffordable.csv.
# NOTE: the path is relative, so it is resolved against the current working
# directory; run the notebook from the directory that contains `cleaned/`.
# A leading slash ('/cleaned/...') would instead point at the filesystem root.
df = pd.read_csv('cleaned/ifAffordable.csv')

# Display basic information and the first few rows of the dataset.
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own; wrapping it in print() would emit a stray "None".
print("Dataframe Info:")
df.info()
print("\nFirst 5 rows:")
print(df.head())

# Remove duplicate rows if any (only rows identical in every column are dropped)
df.drop_duplicates(inplace=True)

# Check for missing values: report the number of nulls in each column
print("\nMissing values per column:")
print(df.isnull().sum())

# ------------------------------
# Step 2: One-Hot Encode Categorical Features
# ------------------------------
# RealmType, Enclave, ExteriorCharm and ParcelSizeUnit are the categorical
# columns; each one is expanded into indicator columns. With drop_first=True
# the first level of every feature is omitted, leaving k-1 indicators for a
# column with k levels.
categorical_features = ['RealmType', 'Enclave', 'ExteriorCharm', 'ParcelSizeUnit']
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_features,
    drop_first=True,
)

# Show the resulting column index so the generated indicator names are visible.
print("\nColumns after one-hot encoding:", df_encoded.columns, sep="\n")

# ------------------------------
# Step 3: Define Predictors and Target Variable
# ------------------------------
# Per the instructions, TradeValue is predicted from:
#   RealmType, Enclave, LivingQuarters, ParcelSize, ParcelSizeUnit,
#   StructuralIntegrity, ExteriorCharm
# (the categorical ones are present as their one-hot indicator columns).

# Target: the TradeValue column.
y = df_encoded.loc[:, 'TradeValue']

# Predictors: everything that is left once the identifier ('DI'), the target
# itself and 'AffordableDwelling' have been dropped.
X = df_encoded.drop(['DI', 'TradeValue', 'AffordableDwelling'], axis=1)

print("\nSelected Predictor Columns (first few rows):")
print(X.head())

# ------------------------------
# Step 4: Split the Data and Build the Linear Regression Model
# ------------------------------
# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create the Linear Regression model and fit it on the training portion
# (fit() returns the estimator itself, so both steps can be chained).
reg_model = LinearRegression().fit(X_train, y_train)

# Predict TradeValue for the held-out test rows
y_pred = reg_model.predict(X_test)

# ------------------------------
# Step 5: Evaluate the Model Performance using MAE and MSE
# ------------------------------
# Both metrics compare the held-out targets with the model's predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

# Report MAE first, then MSE.
print("\nRegression Model Evaluation:")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)

# ------------------------------
# Step 6: Review Model Coefficients
# ------------------------------
# Pair every predictor column with its fitted coefficient in one table so the
# influence of each feature can be read off row by row.
coef_df = pd.DataFrame(dict(Feature=X.columns, Coefficient=reg_model.coef_))
print("\nModel Coefficients:", coef_df, sep="\n")

# ------------------------------
# Step 7: Visualize Actual vs. Predicted TradeValue
# ------------------------------
# Scatter the held-out actual values (x) against the model's predictions (y).
# scatterplot() draws on the current figure's axes and returns them, so the
# axis labels and title are set directly on the returned Axes object.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred).set(
    xlabel="Actual TradeValue",
    ylabel="Predicted TradeValue",
    title="Actual vs. Predicted TradeValue",
)
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '/cleaned/ifAffordable.csv'