In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [19]:
# Load the data
df = pd.read_csv(r"C:\Users\User\Desktop\expedia_clean_dfs\expedia_clean_df_25_06.csv")

In [20]:
# Function to fill NaN values with the mean of the group
def fill_na_with_group_mean(group):
    numeric_cols = group.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        group[col] = group[col].fillna(group[col].mean())
    return group

# Split the data into five data frames based on the star rating
groups = {
    1: fill_na_with_group_mean(df[df['star_rating'] == 1]),
    2: fill_na_with_group_mean(df[df['star_rating'] == 2]),
    3: fill_na_with_group_mean(df[df['star_rating'] == 3]),
    4: fill_na_with_group_mean(df[df['star_rating'] == 4]),
    5: fill_na_with_group_mean(df[df['star_rating'] == 5])
}

# Function to normalize specified columns
def normalize_columns(group, columns_to_normalize):
    if not group.empty:
        scaler = StandardScaler()
        group[columns_to_normalize] = scaler.fit_transform(group[columns_to_normalize])
    return group

# Columns to be normalized
columns_to_normalize = ['reviews', 'price_per_night', 'original_price', 'km_from_center']

# Apply normalization to each group
normalized_groups = {}
for star_rating, group in groups.items():
    normalized_groups[star_rating] = normalize_columns(group, columns_to_normalize)

# Combine normalized groups back into a single dataframe
normalized_df = pd.concat(normalized_groups.values())

# Save the normalized dataframe to a new CSV file
normalized_df.to_csv(r"C:\Users\User\Desktop\expedia_clean_dfs\expedia_normalized_data_25_06.csv", index=False)

print("Data normalization complete. Normalized data saved to 'normalized_hotels_data.csv'.")

Data normalization complete. Normalized data saved to 'normalized_hotels_data.csv'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group[col] = group[col].fillna(group[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group[columns_to_normalize] = scaler.fit_transform(group[columns_to_normalize])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group[columns_to_normalize] = scaler.fit_transform(group[columns_to_normali

In [21]:
normalized_df

Unnamed: 0,Snapshot,start_date,end_date,name,TTT,LOS,score,reviews,price_per_night,original_price,...,Free_cancellation,No_payment,Breakfast,Option Member,Index,star_rating,location_rating,neighborhood,km_from_center,neighborhood_category
28,55:17.6,26/06/2024,27/06/2024,31 Street Broadway Hotel,1,1,8.770833,-1.504810,-2.729275,-1.417225,...,0,0,0,1,51,2.0,9.445409,NoMad,3.077322,Downtown Manhattan
41,55:18.0,26/06/2024,27/06/2024,Hotel St. James,1,1,8.680000,0.645451,3.842184,2.421259,...,0,0,0,1,83,2.0,9.700000,Manhattan,-0.572525,Other Manhattan
50,55:18.3,26/06/2024,27/06/2024,Doxie Hotel,1,1,8.770833,-1.560150,1.687028,-1.417225,...,0,0,0,0,107,2.0,9.445409,Manhattan,0.948245,Other Manhattan
180,55:50.7,26/06/2024,28/06/2024,31 Street Broadway Hotel,1,2,8.770833,-1.504810,-2.499628,-1.417225,...,0,0,0,1,104,2.0,9.445409,NoMad,3.077322,Downtown Manhattan
181,55:50.8,26/06/2024,28/06/2024,Hotel St. James,1,2,8.680000,0.645451,2.923593,-1.417225,...,0,0,0,1,106,2.0,9.700000,Manhattan,-0.572525,Other Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9973,40:33.2,25/07/2024,28/07/2024,"Kimpton Hotel Theta, an IHG Hotel",30,3,9.090000,-0.483494,0.085056,-0.671597,...,0,0,0,0,110,5.0,9.300000,Manhattan,-0.131157,Other Manhattan
10041,40:39.7,25/07/2024,28/07/2024,"Le Meridien New York, Central Park by Marriott",30,3,8.770833,2.070616,0.172176,-0.671597,...,0,0,0,0,243,5.0,9.445409,Manhattan,-1.911138,Other Manhattan
10046,40:40.3,25/07/2024,28/07/2024,The Benjamin Royal Sonesta New York,30,3,8.680000,2.066408,0.259295,1.920169,...,0,0,0,0,256,5.0,9.445409,Midtown East,2.538816,Midtown Manhattan
10125,41:23.9,25/07/2024,29/07/2024,"Kimpton Hotel Theta, an IHG Hotel",30,4,9.090000,-0.483494,-0.089183,-0.671597,...,0,0,0,0,118,5.0,9.300000,Manhattan,-0.131157,Other Manhattan


# Run Models to see results improvements:

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Function to fill NaN values with the mean of the group
def fill_na_with_group_mean(group):
    numeric_cols = group.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        group[col].fillna(group[col].mean(), inplace=True)
    return group

# Split the data into five data frames based on the star rating
groups = {
    1: fill_na_with_group_mean(df[df['star_rating'] == 1]),
    2: fill_na_with_group_mean(df[df['star_rating'] == 2]),
    3: fill_na_with_group_mean(df[df['star_rating'] == 3]),
    4: fill_na_with_group_mean(df[df['star_rating'] == 4]),
    5: fill_na_with_group_mean(df[df['star_rating'] == 5])
}

# Function to normalize specified columns
def normalize_columns(group, columns_to_normalize):
    scaler = StandardScaler()
    group[columns_to_normalize] = scaler.fit_transform(group[columns_to_normalize])
    return group

# Columns to be normalized
columns_to_normalize = ['reviews', 'price_per_night', 'original_price', 'km_from_center']

# Function to run models and calculate metrics
def run_models(group):
    # Fill NaN values with the mean of the group
    group = fill_na_with_group_mean(group)
    
    # Normalize specified columns
    group = normalize_columns(group, columns_to_normalize)
    
    # Keep only numeric columns
    group = group.select_dtypes(include=[np.number])
    
    # Split the data into train and test sets
    X = group.drop(columns=['price_per_night'])
    y = group['price_per_night']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Initialize models
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree Regressor (max_depth=6)': DecisionTreeRegressor(max_depth=6),
        'Decision Tree Regressor (max_depth=8)': DecisionTreeRegressor(max_depth=8),
        'Decision Tree Regressor (max_depth=10)': DecisionTreeRegressor(max_depth=10),
        'Decision Tree Regressor (max_depth=12)': DecisionTreeRegressor(max_depth=12),
        'Decision Tree Regressor (max_depth=14)': DecisionTreeRegressor(max_depth=14),
        'Gaussian Process Regressor': GaussianProcessRegressor()
    }
    
    results = []
    
    # Fit models and calculate metrics for both train and test sets
    for name, model in models.items():
        model.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        # Training metrics
        train_r2 = r2_score(y_train, y_train_pred)
        train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_mae = mean_absolute_error(y_train, y_train_pred)
        
        # Test metrics
        test_r2 = r2_score(y_test, y_test_pred)
        test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        
        results.append({
            'Model': name,
            'Train R2': train_r2,
            'Train RMSE': train_rmse,
            'Train MSE': train_mse,
            'Train MAE': train_mae,
            'Test R2': test_r2,
            'Test RMSE': test_rmse,
            'Test MSE': test_mse,
            'Test MAE': test_mae,
            'Train Size': len(X_train),
            'Test Size': len(X_test)
        })
    
    return pd.DataFrame(results)

# Run models for each group and store the results
results = {}
for star_rating, group in groups.items():
    results[star_rating] = run_models(group)

# Display results
for star_rating, result in results.items():
    print(f"Results for {star_rating}-star hotels:")
    print(result)
    print("\n")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group[col].fillna(group[col].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group[columns_to_normalize] = scaler.fit_transform(group[columns_to_normalize])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group[col].fillna(group[col].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

Results for 1-star hotels:
                                    Model  Train R2  Train RMSE  Train MSE  \
0                       Linear Regression       NaN         0.0        0.0   
1   Decision Tree Regressor (max_depth=6)       NaN         0.0        0.0   
2   Decision Tree Regressor (max_depth=8)       NaN         0.0        0.0   
3  Decision Tree Regressor (max_depth=10)       NaN         0.0        0.0   
4  Decision Tree Regressor (max_depth=12)       NaN         0.0        0.0   
5  Decision Tree Regressor (max_depth=14)       NaN         0.0        0.0   
6              Gaussian Process Regressor       NaN         0.0        0.0   

   Train MAE  Test R2  Test RMSE  Test MSE  Test MAE  Train Size  Test Size  
0        0.0      NaN        0.0       0.0       0.0           1          1  
1        0.0      NaN        0.0       0.0       0.0           1          1  
2        0.0      NaN        0.0       0.0       0.0           1          1  
3        0.0      NaN        0.0    