In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [19]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [20]:
# Custom Dataset
class AirbnbDataset(Dataset):
    def __init__(self, features, targets):
        self.features = torch.FloatTensor(features)
        self.targets = torch.FloatTensor(targets)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

In [22]:
# CNN Model
class ROIPredictionCNN(nn.Module):
    def __init__(self, input_dim):
        super(ROIPredictionCNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * (input_dim // 4), 128)
        self.fc2 = nn.Linear(128, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # Flatten
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [23]:
def clean_price(x):
    if isinstance(x, str):
        return x.replace('$', '').replace(',', '')
    return x

In [24]:
def clean_numeric(x):
    if pd.isna(x):
        return np.nan
    try:
        return float(x)
    except:
        return np.nan

In [25]:
def preprocess_data(df):
    # Handle categorical variables
    categorical_cols = ['neighbourhood_cleansed', 'room_type', 'property_type', 'host_response_time', 'bathrooms_text']
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
    
    # Handle boolean columns
    boolean_cols = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'has_availability', 'instant_bookable']
    for col in boolean_cols:
        df[col] = df[col].map({'t': 1, 'f': 0, True: 1, False: 0}).fillna(0).astype(int)
    
    # Select relevant numerical columns and clean them
    numerical_cols = ['accommodates', 'host_listings_count', 'latitude', 'longitude', 'minimum_nights', 'availability_365', 'number_of_reviews', 'review_scores_rating']
    for col in numerical_cols:
        df[col] = df[col].apply(clean_numeric)
    
    # Handle price separately
    df['price'] = df['price'].apply(clean_price).apply(clean_numeric)
    
    # Combine features
    features = df[categorical_cols + boolean_cols + numerical_cols + ['price']]
    
    # Remove rows with NaN values
    features = features.dropna()
    
    # Calculate ROI (you may need to adjust this based on your specific ROI calculation)
    df['roi'] = df['price'] * df['number_of_reviews'] / 365  # This is a simplified ROI calculation
    
    # Normalize features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    
    return features_scaled, df.loc[features.index, 'roi'].values

In [26]:
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        for batch_features, batch_targets in train_loader:
            batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)
            
            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs.squeeze(), batch_targets)
            loss.backward()
            optimizer.step()
        
        # Print progress
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [27]:
def evaluate_model(model, test_loader):
    model.eval()
    predictions = []
    actual = []
    with torch.no_grad():
        for batch_features, batch_targets in test_loader:
            batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)
            outputs = model(batch_features)
            predictions.extend(outputs.cpu().numpy())
            actual.extend(batch_targets.cpu().numpy())

    predictions = np.array(predictions).squeeze()
    actual = np.array(actual)

    mse = np.mean((predictions - actual) ** 2)
    print(f'Mean Squared Error: {mse:.4f}')
    return predictions, actual

In [28]:
def main():
    # Load data
    df = pd.read_csv("final_data.csv", delimiter=';', on_bad_lines='skip')
    
    # Print data types and first few rows
    print("Data Types:")
    print(df.dtypes)
    print("\nFirst few rows:")
    print(df.head())

    # Preprocess data
    features, targets = preprocess_data(df)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42)

    # Create DataLoaders
    train_dataset = AirbnbDataset(X_train, y_train)
    test_dataset = AirbnbDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Initialize the model
    input_dim = X_train.shape[1]
    model = ROIPredictionCNN(input_dim).to(device)

    # Loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train the model
    num_epochs = 100
    train_model(model, train_loader, criterion, optimizer, num_epochs)

    # Evaluate the model
    predictions, actual = evaluate_model(model, test_loader)

    # Find the best neighborhoods
    df['predicted_roi'] = model(torch.FloatTensor(features).to(device)).cpu().detach().numpy()
    best_neighborhoods = df.groupby('neighbourhood_cleansed')['predicted_roi'].mean().sort_values(ascending=False)
    print("\nTop 5 Neighborhoods for ROI:")
    print(best_neighborhoods.head())

if __name__ == "__main__":
    main()

  df = pd.read_csv("final_data.csv", delimiter=';', on_bad_lines='skip')


Data Types:
neighbourhood_cleansed                           object
room_type                                        object
accommodates                                      int64
property_type                                    object
listing_url                                      object
                                                 ...   
calculated_host_listings_count_entire_homes     float64
calculated_host_listings_count_private_rooms    float64
calculated_host_listings_count_shared_rooms     float64
reviews_per_month                               float64
Month                                            object
Length: 64, dtype: object

First few rows:
   neighbourhood_cleansed        room_type  accommodates       property_type  \
0                  Weston  Entire home/apt             8         Entire home   
1                  Weston  Entire home/apt             8         Entire home   
2                  Weston  Entire home/apt             5  Entire rental unit   
3  Centen

ValueError: Length of values (248294) does not match length of index (248341)