In [1]:
# ==============================================
# 1. Imports
# ==============================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib, gzip

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Notebook display settings: show every column and allow wider console output.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.width', 120),
):
    pd.set_option(_option_name, _option_value)

print("✅ Imports ready")

# ==============================================
# 2. Load Data
# ==============================================
csv_path = "Airbnb_Data_Uncleaned.csv"
df = pd.read_csv(csv_path, low_memory=False)
print(f"Shape before cleaning: {df.shape}")

# Clean currency columns: strip "$" and thousands separators, then parse to float.
for money_col in ['price', 'service fee']:
    # Skip columns that are absent or already numeric. Testing for "not numeric"
    # (instead of dtype == 'object') also covers pandas' dedicated string dtypes,
    # which would otherwise leave the column as text.
    if money_col not in df.columns or pd.api.types.is_numeric_dtype(df[money_col]):
        continue

    money_text = (
        df[money_col].astype(str)
        .str.replace(r'[\$,]', '', regex=True)
        .str.strip()
    )
    # errors='coerce' turns blanks, 'nan' and any malformed entry into NaN
    # instead of aborting the whole load on a single bad value.
    df[money_col] = pd.to_numeric(money_text, errors='coerce').astype(float)

# Drop duplicates
df = df.drop_duplicates()
print(f"Shape after cleaning: {df.shape}")
df.info()

# ==============================================
# 3. Feature Engineering
# ==============================================
# Snapshot of the columns present before any engineered column is added.
_available_cols = set(df.columns)
_has_reviews = 'number of reviews' in _available_cols

# Price per review (the +1 avoids division by zero for listings with no reviews).
if _has_reviews and 'price' in _available_cols:
    df['price_per_review'] = df['price'] / (df['number of reviews'] + 1)

# Host listing count relative to review count.
if _has_reviews and 'calculated host listings count' in _available_cols:
    df['host_listings_ratio'] = (
        df['calculated host listings count'] / (df['number of reviews'] + 1)
    )

# Binary demand label: 1 when a listing has more reviews than the median listing.
if _has_reviews:
    demand_threshold = df['number of reviews'].median()
    _above_median = df['number of reviews'] > demand_threshold
    df['demand'] = _above_median.astype(int)

# Keep only listings priced within [0, 1000] and with a known price.
if 'price' in _available_cols:
    _price_in_range = df['price'].between(0, 1000)  # remove extreme outliers
    df = df[_price_in_range].dropna(subset=['price'])

_engineered = [
    c for c in ['price_per_review', 'host_listings_ratio', 'demand']
    if c in df.columns
]
print("✅ Engineered columns:", _engineered)

# ==============================================
# 4. ML Dataset Preparation
# ==============================================
# Categorical columns to one-hot encode (only those actually present).
cat_cols = [c for c in ['room type', 'neighbourhood group', 'instant_bookable',
                        'cancellation_policy', 'host_identity_verified'] if c in df.columns]

# Force an integer dummy dtype. Since pandas 2.0 get_dummies returns bool columns
# by default, and a numeric-only selection such as
# select_dtypes(include=[np.number]) silently discards bool columns, which would
# remove every one-hot feature from the model matrices.
df_ml = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=np.uint8)

# Identifiers and free-text / constant columns that are not used as features.
drop_cols = [c for c in ['id', 'NAME', 'host name', 'country', 'country code',
                         'neighbourhood', 'last review'] if c in df_ml.columns]
df_ml = df_ml.drop(columns=drop_cols, errors='ignore')

# Normalise column names so later substring filters can match on lowercase names.
df_ml.columns = [c.lower().strip() for c in df_ml.columns]

print("✅ ML dataset ready:", df_ml.shape)

# ==============================================
# 5. Regression Pipeline (Price Prediction)
# ==============================================
# Substrings identifying columns that must not be used to predict price.
# 'service fee' is included because it is a price-denominated charge.
# NOTE(review): in this dataset the fee appears to be derived from the listing
# price, which would let the model read the target back - confirm against the data.
leakage_patterns_reg = ('price', 'review', 'avail', 'demand', 'service fee')
leakage_cols_reg = [col for col in df_ml.columns
                    if any(pattern in col for pattern in leakage_patterns_reg)]

# Numeric predictors only; missing values are filled with 0.
X_reg = df_ml.drop(columns=leakage_cols_reg, errors='ignore').select_dtypes(include=[np.number]).fillna(0)
# The model is trained on log1p(price); predictions are mapped back with expm1 below.
y_reg = np.log1p(df_ml['price'])

X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

reg_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))
])
reg_pipeline.fit(X_train, y_train)

y_pred = reg_pipeline.predict(X_test)
# Report metrics on the original price scale rather than the log scale.
y_test_orig, y_pred_orig = np.expm1(y_test), np.expm1(y_pred)

print("Regression MAE:", round(mean_absolute_error(y_test_orig, y_pred_orig), 2))
print("Regression R²:", round(r2_score(y_test_orig, y_pred_orig), 3))

# ==============================================
# 6. Classification Pipeline (Demand Prediction)
# ==============================================
# Substrings identifying columns that must not be used to predict demand.
# 'host_listings_ratio' is excluded because it is computed as
# calculated host listings count / (number of reviews + 1): combined with the
# listings-count column it lets the model recover the review count, which is
# exactly what the demand label is thresholded on.
leakage_patterns_clf = ('price', 'review', 'avail', 'demand', 'host_listings_ratio')
leakage_cols_clf = [col for col in df_ml.columns
                    if any(pattern in col for pattern in leakage_patterns_clf)]

# Numeric predictors only; missing values are filled with 0.
X_clf = df_ml.drop(columns=leakage_cols_clf, errors='ignore').select_dtypes(include=[np.number]).fillna(0)
# Take the label from the same frame as the features so rows are guaranteed to align.
y_clf = df_ml['demand']

clf_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced", n_jobs=-1))
])
# Fit on the full data for the persisted model; cross_val_score below refits
# fresh clones per fold, so the reported accuracy is still out-of-fold.
clf_pipeline.fit(X_clf, y_clf)

scores = cross_val_score(clf_pipeline, X_clf, y_clf, cv=5, scoring="accuracy", n_jobs=-1)
print("Classification Cross-validated accuracy:", round(scores.mean(), 3))

# ==============================================
# 7. Save Pipelines + Features 
# ==============================================
# Persist both fitted pipelines together with the feature column order each
# one was trained on, written in the order listed here.
_artifacts = (
    ("reg_pipeline.pkl", reg_pipeline),
    ("clf_pipeline.pkl", clf_pipeline),
    ("reg_features.pkl", list(X_reg.columns)),
    ("clf_features.pkl", list(X_clf.columns)),
)
for _artifact_path, _artifact in _artifacts:
    joblib.dump(_artifact, _artifact_path)

print("✅ Pipelines and features saved successfully")

# ==============================================
# 8. Save Cleaned Dataset
# ==============================================
# Write the cleaned frame (engineered columns included) as a gzip-compressed
# CSV without the index column.
_cleaned_csv_path = "Airbnb_Cleaned_Ready.csv.gz"
df.to_csv(_cleaned_csv_path, compression="gzip", index=False)
print("✅ Cleaned dataset saved as Airbnb_Cleaned_Ready.csv.gz")


✅ Imports ready
Shape before cleaning: (102599, 26)
Shape after cleaning: (102058, 26)
<class 'pandas.core.frame.DataFrame'>
Index: 102058 entries, 0 to 102057
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              102058 non-null  int64  
 1   NAME                            101808 non-null  object 
 2   host id                         102058 non-null  int64  
 3   host_identity_verified          101769 non-null  object 
 4   host name                       101654 non-null  object 
 5   neighbourhood group             102029 non-null  object 
 6   neighbourhood                   102042 non-null  object 
 7   lat                             102050 non-null  float64
 8   long                            102050 non-null  float64
 9   country                         101526 non-null  object 
 10  country code                    101927 non-null  object 
 