# Importing Libararies

In [2]:
import os
import json
import pandas as pd
import zipfile
import logging
from pathlib import Path
import subprocess
import sys


# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def check_kaggle_installation():
    """Verify Kaggle package is installed and credentials exist."""
    try:
        import kaggle
        logger.info("Kaggle package is installed")
        
        # Check if credentials file exists in the default location
        kaggle_path = Path.home() / '.kaggle' / 'kaggle.json'
        if not kaggle_path.exists():
            logger.error("Kaggle credentials not found in ~/.kaggle/kaggle.json")
            raise FileNotFoundError("Kaggle credentials not found")
        
        # Check if credentials have correct permissions
        if oct(kaggle_path.stat().st_mode)[-3:] != '600':
            logger.warning("Kaggle credentials file permissions are not set to 600")
            logger.info("Setting correct permissions...")
            kaggle_path.chmod(0o600)
            
        return True
    except ImportError:
        logger.error("Kaggle package is not installed. Please run: pip install kaggle")
        return False

def download_and_load_dataset(dataset_name, zip_filename):
    """Download dataset from Kaggle and load it into a DataFrame."""
    zip_path = Path(zip_filename)
    
    # Download the dataset if necessary
    if not zip_path.exists() or zip_path.stat().st_size == 0:
        logger.info("Downloading dataset from Kaggle...")
        try:
            import kaggle
            kaggle.api.authenticate()
            kaggle.api.dataset_download_files(dataset_name, path='.', unzip=False)
            logger.info("Dataset downloaded successfully")
        except Exception as e:
            logger.error(f"Failed to download dataset: {str(e)}")
            raise RuntimeError(f"Dataset download failed: {str(e)}")
    else:
        logger.info("Using existing zip file")

    # Load the dataset
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            csv_files = [f for f in zip_ref.namelist() if f.endswith('.csv')]
            if not csv_files:
                logger.error("No CSV files found in the zip archive")
                raise ValueError("No CSV files in archive")
                
            logger.info(f"Found CSV file: {csv_files[0]}")
            with zip_ref.open(csv_files[0]) as csv_file:
                df = pd.read_csv(csv_file)
                logger.info(f"Dataset loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns")
                return df
                
    except zipfile.BadZipFile:
        logger.error("Corrupt zip file detected")
        if zip_path.exists():
            zip_path.unlink()  # Delete corrupt zip file
        logger.info("Deleted corrupt zip file. Please run the script again to re-download")
        raise

def main():
    # Check Kaggle installation first
    if not check_kaggle_installation():
        sys.exit(1)
        
    # Dataset details
    DATASET_NAME = 'ahmedshahriarsakib/usa-real-estate-dataset'
    ZIP_FILENAME = 'usa-real-estate-dataset.zip'
    
    try:
        # Download and load the dataset
        df = download_and_load_dataset(DATASET_NAME, ZIP_FILENAME)
        
        # Basic data exploration
        print("\nDataset Preview:")
        print(df.head())
        print("\nDataset Info:")
        print(df.info())
        
        return df
        
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise

if __name__ == "__main__":
    df = main()

INFO:__main__:Kaggle package is installed
INFO:__main__:Using existing zip file
INFO:__main__:Found CSV file: realtor-data.zip.csv
INFO:__main__:Dataset loaded successfully: 2226382 rows, 12 columns



Dataset Preview:
   brokered_by    status     price  bed  bath  acre_lot     street  \
0     103378.0  for_sale  105000.0  3.0   2.0      0.12  1962661.0   
1      52707.0  for_sale   80000.0  4.0   2.0      0.08  1902874.0   
2     103379.0  for_sale   67000.0  2.0   1.0      0.15  1404990.0   
3      31239.0  for_sale  145000.0  4.0   2.0      0.10  1947675.0   
4      34632.0  for_sale   65000.0  6.0   2.0      0.05   331151.0   

         city        state  zip_code  house_size prev_sold_date  
0    Adjuntas  Puerto Rico     601.0       920.0            NaN  
1    Adjuntas  Puerto Rico     601.0      1527.0            NaN  
2  Juana Diaz  Puerto Rico     795.0       748.0            NaN  
3       Ponce  Puerto Rico     731.0      1800.0            NaN  
4    Mayaguez  Puerto Rico     680.0         NaN            NaN  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226382 entries, 0 to 2226381
Data columns (total 12 columns):
 #   Column          Dtype  
---  ---

In [3]:
df_training_sets = df[['acre_lot', 'bed', 'bath', 'city', 'state', 'house_size', 'price']]

In [4]:
null_sum = df_training_sets.isnull().sum();
print('-------Null sums------\n',null_sum,'\n\n\n-------Data types------');
data_types = df_training_sets.dtypes;
print(data_types);


-------Null sums------
 acre_lot      325589
bed           481317
bath          511771
city            1407
state              8
house_size    568484
price           1541
dtype: int64 


-------Data types------
acre_lot      float64
bed           float64
bath          float64
city           object
state          object
house_size    float64
price         float64
dtype: object


In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
from scipy import stats

def prepare_data(df):
    """
    Prepare the data with improved cleaning and feature engineering
    """
    # Create a copy to avoid modifying the original
    df_prep = df.copy()
    
    # 1. Remove outliers using z-score for numeric columns
    numeric_cols = ['price', 'house_size', 'bed', 'bath', 'acre_lot']
    for col in numeric_cols:
        z_scores = stats.zscore(df_prep[col], nan_policy='omit')
        df_prep = df_prep[np.abs(z_scores) < 3]
    
    # 2. Log transform price (since house prices are usually log-normally distributed)
    df_prep['price'] = np.log1p(df_prep['price'])
    
    # 3. Create meaningful feature interactions
    df_prep['price_per_sqft'] = df_prep['price'] / df_prep['house_size']
    df_prep['rooms'] = df_prep['bed'] + df_prep['bath']
    df_prep['avg_room_size'] = df_prep['house_size'] / df_prep['rooms']
    
    # 4. Handle missing values more carefully
    # For house_size, impute based on bedrooms and bathrooms
    size_medians = df_prep.groupby(['bed', 'bath'])['house_size'].transform('median')
    df_prep['house_size'] = df_prep['house_size'].fillna(size_medians)
    
    # For remaining numeric columns, use median
    for col in ['bed', 'bath', 'acre_lot']:
        df_prep[col] = df_prep[col].fillna(df_prep[col].median())
    
    # 5. Scale numeric features using RobustScaler (less sensitive to outliers)
    scaler = RobustScaler()
    df_prep[numeric_cols] = scaler.fit_transform(df_prep[numeric_cols])
    
    # 6. Encode categorical variables
    for col in ['city', 'state']:
        # Calculate frequency encoding
        freq_enc = df_prep[col].value_counts(normalize=True)
        # Apply encoding and handle missing values
        df_prep[f'{col}_freq'] = df_prep[col].map(freq_enc).fillna(0)
    
    # Drop original categorical columns after encoding
    df_prep = df_prep.drop(['city', 'state'], axis=1)
    
    return df_prep

# Prepare the data
df_cleaned = prepare_data(df_training_sets)

# Split the data
X = df_cleaned.drop('price', axis=1)
y = df_cleaned['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Run baseline models again with improved data
print("Running baseline models with improved data preprocessing...")

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Simple linear regression with all features
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

print("\nImproved Linear Regression Performance:")
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

Running baseline models with improved data preprocessing...

Improved Linear Regression Performance:
R² Score: 0.3602
RMSE: 0.6857


In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# 1. Sample the data for faster processing
def create_sample(X, y, sample_size=100000, random_state=42):
    """Create a random sample of the data"""
    if len(X) > sample_size:
        X_sample, _, y_sample, _ = train_test_split(
            X, y, 
            train_size=sample_size, 
            random_state=random_state
        )
        return X_sample, y_sample
    return X, y

# 2. Create training sample
print("Creating training sample...")
X_sample, y_sample = create_sample(X_train, y_train, sample_size=100000)

# 3. Train Random Forest on sample
print("Training Random Forest on sample...")
rf_model = RandomForestRegressor(
    n_estimators=50,  # Reduced number of trees
    max_depth=8,      # Limited depth
    min_samples_split=10,
    n_jobs=-1,        # Use all CPU cores
    random_state=42
)

rf_model.fit(X_sample, y_sample)

# 4. Evaluate on test set
print("Evaluating model...")
y_pred = rf_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\nRandom Forest Performance:")
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

# 5. Feature importance analysis
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(importances.head())

Creating training sample...
Training Random Forest on sample...
Evaluating model...

Random Forest Performance:
R² Score: 0.6422
RMSE: 0.5127

Top 5 Most Important Features:
          feature  importance
2            bath    0.425979
8      state_freq    0.239833
3      house_size    0.150503
4  price_per_sqft    0.139558
6   avg_room_size    0.021998
