In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Import Libraries

In [2]:
df= pd.read_csv('Dataset.csv')  # Replace with your actual data file

In [3]:
df.head()

Unnamed: 0,listing_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_identity_verified,...,has_indoor_fireplace,has_breakfast,has_essentials,has_shampoo,has_smoke_detector,has_carbon_monoxide_detector,has_first_aid_kit,has_fire_extinguisher,has_laptop_friendly_workspace,has_wheelchair_accessible
0,241032,2011-08-11,3.0,3.0,96.0,100.0,0,3.0,3.0,1,...,0,0,0,0,0,0,0,0,0,0
1,953595,2013-02-21,3.0,4.0,98.0,100.0,1,6.0,6.0,1,...,0,0,1,0,1,1,1,1,0,0
2,3308979,2014-06-12,3.0,3.0,67.0,100.0,0,2.0,2.0,1,...,1,0,1,1,1,1,0,0,0,0
3,7421966,2013-11-06,3.0,0.0,0.0,0.0,0,1.0,1.0,1,...,1,0,1,1,1,1,0,1,0,0
4,278830,2011-11-29,3.0,4.0,100.0,0.0,0,2.0,2.0,1,...,0,0,1,1,1,1,1,1,0,0


<h1>Linear Regression</h1>

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Load dataset
df = pd.read_csv("Dataset.csv")

# Select numeric features for prediction (replace with actual column names from your dataset)
feature_cols = ['bedrooms', 'bathrooms', 'number_of_reviews', 'availability_365']  
X = df[feature_cols]
y = df['price']  # Target column

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation metrics
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

Mean Absolute Error (MAE): 45.724654333113584
R² Score: 0.46920440021799936


<h1>Decision Tree Regression</h1>

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Load dataset
df = pd.read_csv("Dataset.csv")

# Select numeric features automatically
X = df.select_dtypes(include=['int64', 'float64']).drop(columns=['price'])
y = df['price']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train Decision Tree model
model = DecisionTreeRegressor(random_state=42, max_depth=10)  # max_depth can be tuned
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

Mean Absolute Error (MAE): 34.96764647207685
R² Score: 0.17287413357942716


<h1>Random Forest Regressor</h1>

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# 1. Load the dataset
df = pd.read_csv("Dataset.csv")

# 2. Define features (X) and target (y)
# Replace 'price' with your target column name
X = df.drop(columns=['price'])
y = df['price']

# 3. If categorical columns exist, convert them to numerical
X = pd.get_dummies(X, drop_first=True)

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Initialize Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# 6. Train the model
rf_model.fit(X_train, y_train)

# 7. Predictions
y_pred = rf_model.predict(X_test)

# 8. Evaluation
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R² Score: {r2}")

Mean Absolute Error (MAE): 24.599237844940866
R² Score: 0.6944673483163232


<h1>XGBRegressor</h1>

In [7]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np


# 1. Load dataset
df = pd.read_csv("Dataset.csv") 


# 2. Safe feature extraction
if "host_since" in df.columns:
    df["host_since"] = pd.to_datetime(df["host_since"], errors="coerce")
    df["host_since_year"] = df["host_since"].dt.year.astype("Int64")
    df["host_since_month"] = df["host_since"].dt.month.astype("Int64")
    df.drop(columns=["host_since"], inplace=True)
else:
    print("⚠ 'host_since' column not found — skipping date features.")


# 3. Select numeric + categorical features
target = "price"  # change this to your target column
if target not in df.columns:
    raise ValueError(f"Target column '{target}' not found in dataset.")

X = df.drop(columns=[target])
y = df[target]

# Identify categorical features (object or category)
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# Convert object columns to pandas category
for col in cat_cols:
    X[col] = X[col].astype("category")


# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# 5. Train XGBoost with native categorical support
xgb_model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    enable_categorical=True  # IMPORTANT for category dtype
)

xgb_model.fit(X_train, y_train)


# 6. Predictions & evaluation
y_pred = xgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"✅ Model trained successfully.")
print(f"📊 RMSE: {rmse:.2f}")

from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

✅ Model trained successfully.
📊 RMSE: 51.82
R² Score: 0.6673
