In [None]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import ast
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
input_file = "../data/raw/listings_detailed.csv"
output_file = "../data/processed/listings_fixed.csv"

# Step 1: Read the raw CSV.
# newline="" is what the csv module documents as required for reader input so
# that newlines embedded inside quoted fields are interpreted correctly.
# errors="replace" substitutes undecodable bytes instead of aborting the read.
with open(input_file, "r", newline="", encoding="utf-8", errors="replace") as f:
    rows = list(csv.reader(f))

# Without a header row there is nothing to infer the column count from.
if not rows:
    raise ValueError(f"{input_file} is empty: no header row found")

# Step 2: The header row defines the expected number of columns
expected_cols = len(rows[0])
print(f"Expected columns: {expected_cols}")

# Step 3: Fix rows with the wrong number of columns
fixed_rows = []
for i, row in enumerate(rows):
    n_fields = len(row)
    if n_fields < expected_cols:
        print(f"Fixing row {i+1}: had {n_fields} columns")
        # Pad missing trailing columns with empty strings
        row = row + [""] * (expected_cols - n_fields)
    elif n_fields > expected_cols:
        print(f"Fixing row {i+1}: had {n_fields} columns")
        # Fold every surplus field back into the last expected column
        row = row[:expected_cols - 1] + [",".join(row[expected_cols - 1:])]
    fixed_rows.append(row)

# Step 4: Save the repaired CSV (newline="" lets csv.writer control line endings)
with open(output_file, "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerows(fixed_rows)

# Step 5: Load the repaired file into pandas
data = pd.read_csv(output_file)
print("DataFrame shape:", data.shape)
print(data.head())

In [None]:
# Lift pandas' row/column display limits so frames are printed without truncation.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

# Write the number of missing values per column to disk.
missing_per_column = data.isna().sum()
missing_per_column.to_csv("../data/processed/missing_summary.csv")

In [None]:
# Strip the "$" sign and thousands separators, then cast price to float.
# The pattern is a raw string: "\$" inside a normal string literal is an
# invalid escape sequence, which Python reports with a warning.
data['price'] = data['price'].replace(r'[\$,]', '', regex=True).astype(float)

In [None]:
# Keep only listings whose price lies within 1.5 * IQR of the quartiles.
# Rows with a missing price fail both comparisons and are removed as well.
Q1 = data["price"].quantile(0.25)
Q3 = data["price"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# .copy() detaches the filtered frame from the original one, so the column
# assignments made in later cells write to a real frame rather than to a slice.
within_fences = (data['price'] >= lower_bound) & (data['price'] <= upper_bound)
data = data[within_fences].copy()

In [None]:
# data["price_log"] = np.log1p(data["price"])

In [None]:
drop_cols = ['id','listing_url','scrape_id','name','description',
             'picture_url','license',
             'host_url','host_thumbnail_url','host_picture_url']

data = data.drop(columns=drop_cols, errors='ignore')

In [None]:
# Remove the "%" sign from the host rate columns, map the literal "Unknown"
# to 0, and store the result as floats.
rate_columns = ['host_response_rate', 'host_acceptance_rate']
for rate_col in rate_columns:
    without_percent = data[rate_col].replace('%', '', regex=True)
    data[rate_col] = without_percent.replace('Unknown', 0).astype(float)

In [None]:
# Numeric columns (float64/int64): fill gaps with each column's median.
num_cols = data.select_dtypes(include=['float64','int64']).columns
column_medians = data[num_cols].median()
data[num_cols] = data[num_cols].fillna(column_medians)

# Object columns: fill gaps with the placeholder string "Unknown".
cat_cols = data.select_dtypes(include='object').columns
filled_categoricals = data[cat_cols].fillna("Unknown")
data[cat_cols] = filled_categoricals

In [None]:
# plt.hist(data["price_log"], bins=50)
# plt.title("Distribution of Log-Transformed Prices")
# plt.show()

In [None]:
# plt.boxplot(data["price_log"], vert=False)
# plt.title("Boxplot of Log-Transformed Prices")
# plt.show()

In [None]:
    # stats.probplot(data["price_log"], dist="norm", plot=plt)
    # plt.show()

In [None]:
# Percentile-based winsorization
lower_cap = data["price"].quantile(0.01)
upper_cap = data["price"].quantile(0.99)

data["price_capped"] = data["price"].clip(lower=lower_cap, upper=upper_cap)
data["price_log"] = np.log1p(data["price_capped"])

In [None]:
# Histogram of the log-transformed (capped) price, 50 bins.
fig, ax = plt.subplots()
ax.hist(data["price_log"], bins=50)
ax.set_title("Distribution of Log-Transformed Prices")
plt.show()

In [None]:
# Probability plot of price_log against a normal distribution.
log_prices = data["price_log"]
stats.probplot(log_prices, dist="norm", plot=plt)
plt.show()

In [None]:
# Histogram of price_log with a kernel density estimate overlaid (kde=True).
sns.histplot(data["price_log"], kde=True)

In [None]:
# Sample the clock once so both age features are measured against the same
# reference instant (two separate calls could straddle midnight).
reference_time = pd.to_datetime("today")

# Unparseable dates become NaT (errors='coerce').
data['host_since'] = pd.to_datetime(data['host_since'], errors='coerce')
# Days since the host joined; rows without a valid date get 0.
data['host_days'] = (reference_time - data['host_since']).dt.days.fillna(0)

data['last_review'] = pd.to_datetime(data['last_review'], errors='coerce')
# Days since the most recent review; rows without a valid date get the sentinel 9999.
data['days_since_last_review'] = (reference_time - data['last_review']).dt.days.fillna(9999)

In [None]:
# Snapshot the intermediate frame next to the other processed outputs.
# A project-relative path (as used by the other cells) replaces the previous
# machine-specific absolute path, so the notebook runs on any checkout.
data.to_csv("../data/processed/new.csv", index=False)

In [None]:
# Handle missing values
data['amenities'] = data['amenities'].fillna("")

# Count total number of amenities
data['amenities_count'] = data['amenities'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)

# Find most frequent amenities
top_n = 15
all_amenities = data['amenities'].str.split(',').explode().str.strip()
top_amenities = all_amenities.value_counts().head(top_n).index

# Create binary columns for top amenities (safe version)
for a in top_amenities:
    col_name = 'has_' + a.replace(" ", "_").replace("-", "_").replace("/", "_").lower()
    data[col_name] = data['amenities'].str.contains(a, case=False, regex=False).fillna(False).astype(int)

In [None]:
# Columns removed from the working frame in this step.
columns_to_remove = [
    'amenities',
    'last_scraped', 'source', 'host_id',
    'host_name', 'host_about',
    'calendar_last_scraped',
    'bathrooms',
    'calendar_updated', 'last_review',
]

# Drop the columns, then discard rows that have no host_since value.
data = (
    data
    .drop(columns=columns_to_remove)
    .dropna(subset=['host_since'])
)

In [None]:
# List of columns to drop

# Candidate columns to remove, grouped by kind.
price_columns = ['price', 'price_capped']
review_date_columns = ['first_review', 'last_review']
free_text_columns = [
    'name', 'summary', 'description', 'neighborhood_overview',
    'notes', 'transit', 'access', 'interaction', 'house_rules',
]
drop_cols = price_columns + review_date_columns + free_text_columns

# Only columns actually present in the frame are dropped.
present_drop_cols = [name for name in drop_cols if name in data.columns]
data = data.drop(columns=present_drop_cols, errors="ignore")

In [None]:
# Outlier handling for minimum_nights and maximum_nights

# Cap minimum_nights at 30 (anything above becomes 30)
# Upper caps applied to every column whose lower-cased name contains the key:
# minimum_nights* -> 30, maximum_nights* -> 365. This covers the plain
# minimum_nights / maximum_nights columns as well as the related variants.
night_caps = {'minimum_nights': 30, 'maximum_nights': 365}

for col in data.columns:
    col_lower = col.lower()
    for name_fragment, cap in night_caps.items():
        if name_fragment in col_lower:
            data[col] = data[col].clip(upper=cap)

In [None]:
# -----------------------------
# 1. Define categorical columns
# -----------------------------
# -----------------------------
# 1. Define categorical columns
# -----------------------------
# Low-cardinality categoricals: one-hot encoded by the ColumnTransformer below.
categorical_low = [
    'host_response_time', 'host_is_superhost', 'host_verifications',
    'host_has_profile_pic', 'host_identity_verified',
    'neighbourhood_group_cleansed', 'room_type',
    'has_availability', 'instant_bookable'
]

# High-cardinality categoricals: label encoded in place in step 3.
categorical_high = [
    'host_location', 'host_neighbourhood', 'neighbourhood',
    'neighbourhood_cleansed', 'property_type', 'bathrooms_text'
]

# -----------------------------
# 2. Identify numeric columns
# -----------------------------
# The target must never be listed as a feature, otherwise the preprocessor
# would impute/scale it and hand it to the model as an input.
target_col = "price_log"
numeric_cols = [
    col for col in data.select_dtypes(include=[np.number]).columns
    if col != target_col
]

# Detect binary flags (only 0/1 values)
binary_flags = [col for col in numeric_cols if set(data[col].dropna().unique()) <= {0,1}]

# Continuous numeric = numeric but not binary
continuous_nums = [col for col in numeric_cols if col not in binary_flags]

print("Binary flags:", binary_flags[:10])
print("Continuous numeric:", continuous_nums[:10])

# -----------------------------
# 3. Label Encode high-cardinality categoricals
# -----------------------------
# One fitted encoder per column is kept so the integer codes can be mapped back.
label_encoders = {}
for col in categorical_high:
    le = LabelEncoder()
    # Fill BEFORE casting: astype(str) turns NaN into the string "nan", after
    # which fillna() has nothing left to fill.
    data[col] = data[col].fillna("Unknown").astype(str)
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# -----------------------------
# 4. Build preprocessing pipeline
# -----------------------------
preprocessor = ColumnTransformer(
    transformers=[
        # Continuous numeric: median impute + standardize
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), continuous_nums),

        # Binary flags: keep as is
        ("binary", "passthrough", binary_flags),

        # Low-cardinality categoricals: one-hot encode, ignoring unseen categories
        ("low_cat", OneHotEncoder(handle_unknown="ignore"), categorical_low),

        # High-cardinality categoricals: already label encoded above
        ("high_cat", "passthrough", categorical_high)
    ]
)

In [None]:
# Inspect all unique values in each column
# Print up to 20 distinct non-null values for every column.
for column_name in data.columns:
    preview = data[column_name].dropna().unique()[:20]
    print(f"{column_name}: {preview}")

In [None]:
# Function to detect t/f columns allowing unknowns
def detect_tf_columns_with_unknown(df, unknown_vals=('Unknown',)):
    """Return the names of columns that hold only 't'/'f' flags.

    Values are compared after casting to ``str``, stripping whitespace and
    lower-casing. Missing values and any value listed in ``unknown_vals``
    (matched the same case-insensitive way) are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    unknown_vals : iterable of str, optional
        Placeholder values to disregard when deciding whether a column is a
        t/f column.

    Returns
    -------
    list
        Column labels, in frame order, that contain at least one 't'/'f'
        value and nothing else besides missing/unknown entries.
    """
    # Normalise the placeholders once instead of once per column.
    unknown_lower = [str(val).strip().lower() for val in unknown_vals]

    tf_cols = []
    for col in df.columns:
        col_vals = df[col].dropna().astype(str).str.strip().str.lower()
        # Exclude unknown values
        col_vals = col_vals[~col_vals.isin(unknown_lower)]
        # .all() on an empty Series is True, so a column that is entirely
        # missing/unknown must be rejected explicitly.
        if not col_vals.empty and col_vals.isin(['t', 'f']).all():
            tf_cols.append(col)
    return tf_cols

# Detect t/f columns
# Find the columns that hold t/f flags
tf_cols = detect_tf_columns_with_unknown(data)
print("t/f columns found:", tf_cols)

# Map 't' -> 1.0 and 'f' -> 0.0; every other value (including 'unknown') ends up NaN
flag_to_number = {'t':1,'f':0}
for col in tf_cols:
    normalised = data[col].astype(str).str.strip().str.lower()
    without_unknown = normalised.replace({'unknown': np.nan})
    data[col] = without_unknown.map(flag_to_number).astype(float)

# Preview results
print(data[tf_cols].head())

In [None]:
# Convert string to list
def _parse_verifications(raw):
    """Parse one raw ``host_verifications`` cell into a list of method names.

    Returns [] for non-string cells, for text that is not a valid Python
    literal (e.g. the "Unknown" placeholder), and for literals that are not a
    list/tuple/set (e.g. the string "None").
    """
    if not isinstance(raw, str):
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return []
    if isinstance(parsed, (list, tuple, set)):
        return [str(method) for method in parsed]
    return []


# Convert string to list
data['host_verifications_list'] = data['host_verifications'].apply(_parse_verifications)

# All verification methods seen in the dataset. Sorted so the generated
# columns appear in the same order on every run (set order is not stable).
all_verifications = sorted({v for lst in data['host_verifications_list'] for v in lst})

# Create one 0/1 column per verification method
for v in all_verifications:
    data[f'verified_{v}'] = data['host_verifications_list'].apply(lambda lst: 1 if v in lst else 0)

In [None]:
# The raw string column and its parsed helper list are removed here.
verification_source_cols = ['host_verifications_list', 'host_verifications']
data = data.drop(verification_source_cols, axis=1)

In [None]:
# Convert 'host_since' to datetime
# Convert 'host_since' to datetime (unparseable values become NaT)
data['host_since'] = pd.to_datetime(data['host_since'], errors='coerce')

# Sample the clock once so both tenure features use the same reference instant.
now = pd.Timestamp.now()
tenure_days = (now - data['host_since']).dt.days

# Feature 1: Host tenure in years (365-day years)
data['host_tenure_years'] = tenure_days / 365

# Feature 2 (optional): Host tenure in whole 30-day months
data['host_tenure_months'] = tenure_days // 30

# Feature 3 (optional): Extract year and month separately
data['host_since_year'] = data['host_since'].dt.year
data['host_since_month'] = data['host_since'].dt.month

# Drop the original 'host_since' column
data = data.drop(columns=['host_since'])

# Preview the new features
print(data[['host_tenure_years', 'host_tenure_months', 'host_since_year', 'host_since_month']].head())

In [None]:
# One-hot encode neighbourhood_group_cleansed
# One-hot encode both columns in a single pass; the dict maps each source
# column to the prefix of its indicator columns.
dummy_prefixes = {
    'neighbourhood_group_cleansed': 'neighbourhood',
    'room_type': 'room',
}
data = pd.get_dummies(data, columns=list(dummy_prefixes), prefix=dummy_prefixes)

# Preview the new columns
print(data.head())

In [None]:
# Select columns that are boolean (True/False)
# Columns currently stored as bool
bool_cols = data.select_dtypes(include='bool').columns

# True -> 1, False -> 0
as_integers = data[bool_cols].astype(int)
data[bool_cols] = as_integers

# Preview results
print(data[bool_cols].head())

In [None]:
# One-hot encode 'host_response_time'
# Indicator columns for 'host_response_time'; dummy_na=True adds an extra
# column that flags missing values.
host_response_dummies = pd.get_dummies(
    data['host_response_time'],
    prefix='host_response',
    dummy_na=True,
)

# Replace the original column with its indicator columns (appended at the end).
data = pd.concat(
    [data.drop(columns='host_response_time'), host_response_dummies],
    axis=1,
)

In [None]:
# Indicator columns created from host_response_time share the
# 'host_response_' prefix with the float column 'host_response_rate'.
# Float columns are skipped so that rate is not truncated to whole numbers
# by the integer cast below.
one_hot_cols = [
    col for col in data.columns
    if col.startswith('host_response_')
    and not pd.api.types.is_float_dtype(data[col])
]

# Convert bool to int
data[one_hot_cols] = data[one_hot_cols].astype(int)

In [None]:
# Keep only the rows that have a host_is_superhost value.
has_superhost_flag = data["host_is_superhost"].notna()
data = data[has_superhost_flag]

In [None]:
# -------------------------------
# 1. Features and target
# -------------------------------
# -------------------------------
# 1. Features and target
# -------------------------------
target_name = "price_log"
y = data[target_name]   # winsorized + log transformed price
X = data.drop(columns=target_name)

# -------------------------------
# 2. Train/test split (80/20, fixed seed)
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Strip '[', ']' and '<' from the column names before handing them to XGBoost
_strip_table = str.maketrans('', '', '[]<')
X_train = X_train.rename(columns=lambda c: str(c).translate(_strip_table))
X_test = X_test.rename(columns=lambda c: str(c).translate(_strip_table))

# -------------------------------
# 3. Initialize XGBoost Regressor
# -------------------------------
baseline_params = dict(
    n_estimators=500,      # number of boosting rounds
    learning_rate=0.05,    # step size shrinkage
    max_depth=6,           # tree depth
    subsample=0.8,         # row sampling
    colsample_bytree=0.8,  # feature sampling
    random_state=42,
    n_jobs=-1,
)
xgb_model = XGBRegressor(**baseline_params)

# -------------------------------
# 4. Train, then predict log-price on the held-out set
# -------------------------------
xgb_model.fit(X_train, y_train)
y_pred_log = xgb_model.predict(X_test)

# -------------------------------
# 5. Undo the log1p transform (expm1 is its inverse)
# -------------------------------
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

# -------------------------------
# 6. Evaluate model performance on the price scale
# -------------------------------
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
medae = median_absolute_error(y_true, y_pred)

print("XGBoost with log(price):")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.3f}")
print(f"MAE: {mae:.2f}")
print(f"Median AE: {medae:.2f}")

In [None]:
# Search space sampled by the randomized search.
param_dist = dict(
    learning_rate=np.linspace(0.01, 0.1, 10),
    max_depth=[4, 6, 8, 10],
    n_estimators=[300, 400, 500, 600],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
)

search_estimator = XGBRegressor(random_state=42, n_jobs=-1)

# 30 sampled combinations, 3-fold CV, scored by negative RMSE on the log target.
rand_search = RandomizedSearchCV(
    estimator=search_estimator,
    param_distributions=param_dist,
    n_iter=30,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=2,
    random_state=42,
)

rand_search.fit(X_train, y_train)
print("Best Parameters:", rand_search.best_params_)

In [None]:
# Hyper-parameters are fixed here rather than read from a search result.
tuned_params = dict(
    n_estimators=600,
    max_depth=10,
    learning_rate=0.03,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=42,
    n_jobs=-1,
)
xgb_tuned = XGBRegressor(**tuned_params)
xgb_tuned.fit(X_train, y_train)

# Predict on the log scale, then invert log1p to get prices back
y_pred_log = xgb_tuned.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

# Evaluate on the price scale
squared_error = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(squared_error)
r2 = r2_score(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
medae = median_absolute_error(y_true, y_pred)

print("Tuned XGBoost Results:")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.3f}")
print(f"MAE: {mae:.2f}")
print(f"Median AE: {medae:.2f}")

In [None]:
# -------------------------------
# Feature Importance
# -------------------------------
# Pair every training column with its importance from the tuned model,
# most important first.
importances = xgb_tuned.feature_importances_
features = X_train.columns

feat_imp = pd.DataFrame({"Feature": features, "Importance": importances})
feat_imp = feat_imp.sort_values(by="Importance", ascending=False)

# Horizontal bar chart of the 20 most important features
top_features = feat_imp.head(20)
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x="Importance", y="Feature", data=top_features, palette="viridis", ax=ax)
ax.set_title("Top 20 Feature Importances - XGBoost", fontsize=14)
ax.set_xlabel("Importance Score")
ax.set_ylabel("Features")
plt.show()