In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [4]:
# Step 1: Data Collection
url = 'calendar.csv'  # Replace with the actual dataset path or URL
try:
    df = pd.read_csv(url)
    print("Dataset loaded successfully.")
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()

# Step 2: Data Exploration
print("First few rows:")
print(df.head())

print("\nData types and missing values:")
print(df.info())

print("\nBasic statistics:")
print(df.describe())

# Step 3: Data Preprocessing
try:
    # Handle missing values
    df.fillna(method='ffill', inplace=True)
    
    # Convert categorical variables into numerical format (e.g., one-hot encoding)
    df = pd.get_dummies(df, drop_first=True)
    
    # Normalize the data
    scaler = StandardScaler()
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
    
except Exception as e:
    print(f"Error during data preprocessing: {e}")

# Step 4: Feature Selection
# Here, we assume that the dataset already has appropriate features selected
# You can also add feature selection techniques if needed

# Step 5: Model Selection
# We will try Linear Regression, Decision Tree Regression, and Random Forest Regression

# Step 6: Data Split
try:
    X = df.drop('price', axis=1)  # Features (assuming 'price' is the target variable)
    y = df['price']  # Target variable
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("Data split into training and testing sets.")
except Exception as e:
    print(f"Error during data split: {e}")

# Step 7: Model Training
try:
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(n_estimators=100)
    }
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        print(f"{name} model trained.")
except Exception as e:
    print(f"Error during model training: {e}")

# Step 8: Model Evaluation
try:
    for name, model in models.items():
        y_pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        
        print(f"\n{name} Model Evaluation:")
        print(f"Mean Absolute Error (MAE): {mae}")
        print(f"Mean Squared Error (MSE): {mse}")
        print(f"Root Mean Squared Error (RMSE): {rmse}")
except Exception as e:
    print(f"Error during model evaluation: {e}")

# Step 9: Prediction
# Use one of the trained models to make predictions (e.g., Random Forest)
try:
    rf_model = models['Random Forest']
    sample_data = X_test.iloc[0:5]
    predictions = rf_model.predict(sample_data)
    print("\nPredictions for sample data:")
    print(predictions)
except Exception as e:
    print(f"Error during prediction: {e}")

# Step 10: Visualization
try:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=y_test, y=models['Random Forest'].predict(X_test))
    plt.xlabel('Actual Prices')
    plt.ylabel('Predicted Prices')
    plt.title('Actual vs Predicted Prices')
    plt.show()
except Exception as e:
    print(f"Error during visualization: {e}")

# Step 11: Model Tuning
# (This part can be extended based on hyperparameter tuning techniques like GridSearchCV)

# Step 12: Documentation
# (Add detailed documentation and save results as needed)


Dataset loaded successfully.
First few rows:
   listing_id        date available   price
0      241032  2016-01-04         t  $85.00
1      241032  2016-01-05         t  $85.00
2      241032  2016-01-06         f     NaN
3      241032  2016-01-07         f     NaN
4      241032  2016-01-08         f     NaN

Data types and missing values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1393570 entries, 0 to 1393569
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   listing_id  1393570 non-null  int64 
 1   date        1393570 non-null  object
 2   available   1393570 non-null  object
 3   price       934542 non-null   object
dtypes: int64(1), object(3)
memory usage: 42.5+ MB
None

Basic statistics:
         listing_id
count  1.393570e+06
mean   5.550111e+06
std    2.962274e+06
min    3.335000e+03
25%    3.258213e+06
50%    6.118244e+06
75%    8.035212e+06
max    1.034016e+07


  df.fillna(method='ffill', inplace=True)


Error during data split: "['price'] not found in axis"
Error during model training: name 'X_train' is not defined
Error during model evaluation: name 'X_test' is not defined
Error during prediction: name 'X_test' is not defined
Error during visualization: name 'y_test' is not defined


<Figure size 1000x600 with 0 Axes>