In [None]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import  LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# The Amsterdam listings ship as two CSVs (weekday and weekend); read both.
df1, df2 = map(
    pd.read_csv,
    [
        "/kaggle/input/airbnb-prices-in-european-cities/amsterdam_weekdays.csv",
        "/kaggle/input/airbnb-prices-in-european-cities/amsterdam_weekends.csv",
    ],
)

# Stack the two frames row-wise; ignore_index=True renumbers the rows 0..n-1.
frames = [df1, df2]
amsterdam_df = pd.concat(frames, ignore_index=True)

# Persist the combined table (without the index column) next to the notebook.
amsterdam_df.to_csv('amsterdam_data.csv', index=False)

In [None]:
# (rows, columns) of the combined weekday + weekend frame.
amsterdam_df.shape

In [None]:
# Column labels available in the combined frame.
amsterdam_df.columns

In [None]:
# Per-column dtype and non-null count summary.
amsterdam_df.info()

In [None]:
# Descriptive statistics for the combined frame.
amsterdam_df.describe()

In [None]:
# Inspect how the values of each listed feature are distributed.
index = [
    "room_shared", "realSum", "room_type", "room_private", "person_capacity",
    "host_is_superhost", "multi", "biz", "cleanliness_rating",
    "guest_satisfaction_overall", "bedrooms", "dist", "metro_dist",
    "attr_index_norm", "attr_index", "rest_index", "rest_index_norm",
    "lng", "lat",
]

# Visual divider printed after every feature's frequency table.
separator = "---------------------------------------------------------------"

for feature in index:
    counts = amsterdam_df[feature].value_counts()
    print(counts, "\n")
    print(separator)

In [None]:
# Columns whose dtype is `object` are treated as the categorical ones.
categorical_cols = [
    name for name, dtype in amsterdam_df.dtypes.items() if dtype == 'object'
]

# Lay the plots out on a grid that is three panels wide; the row count is the
# number of categorical columns divided by three, rounded up.
num_col = len(categorical_cols)
num_row = (num_col + 2) // 3

# Each grid row gets five inches of figure height.
plt.figure(figsize=(15, 5 * num_row))

for position, col in enumerate(categorical_cols, start=1):
    ax = plt.subplot(num_row, 3, position)
    sns.countplot(data=amsterdam_df, x=col, ax=ax)
    ax.set_title(f'Count Plot of {col}')

plt.tight_layout()
plt.show()

In [None]:
# Columns that are removed from the working frame.
columns_to_drop = ['Unnamed: 0', 'attr_index', 'attr_index_norm', 'rest_index', 'rest_index_norm']

# Keep a handle on each column before it is removed (same order as the list above).
unnamed, attr_index, attr_index_norm, rest_index, rest_index_norm = (
    amsterdam_df[name] for name in columns_to_drop
)

# Rebind the frame to a copy without those columns.
amsterdam_df = amsterdam_df.drop(columns=columns_to_drop)

# Preview the reduced frame.
amsterdam_df.head()

In [None]:
# One-hot encode the frame using pandas' default column selection and rebind
# amsterdam_df to the encoded result, then preview it.
# NOTE(review): the frame is rebound under the same name, so re-running this cell
# operates on the already-encoded frame.
amsterdam_df = pd.get_dummies(amsterdam_df)
amsterdam_df.head()

In [None]:
# Convert every boolean column to 0/1 integers.
#
# A direct dtype cast is used rather than LabelEncoder: LabelEncoder assigns codes by
# the sorted unique values it sees, so a column that contains only True would be
# encoded as 0. Casting always maps False -> 0 and True -> 1.
bool_cols = [col for col in amsterdam_df.columns if amsterdam_df[col].dtype == 'bool']

for col in bool_cols:
    print(f"Column '{col}' is boolean and will be converted to binary (0/1).")

    # Assigning the cast Series back replaces the column, so the dtype becomes int.
    amsterdam_df[col] = amsterdam_df[col].astype(int)

amsterdam_df.head()

In [None]:
# Independent variables: every column except the price.
X = amsterdam_df.drop(columns=['realSum'])

# Dependent variable / target: the listing price.
y = amsterdam_df['realSum']

# Correlation of each feature column with the target.
correlation = X.corrwith(y)
print(correlation)

In [None]:
# Horizontal bars: one per feature, bar length = correlation with the price.
plt.figure(figsize=(12, 6))
ax = sns.barplot(x=correlation.values, y=correlation.index, palette="viridis")

ax.set_title('Correlation between Features and Booking Prices')
ax.set_xlabel('Correlation Coefficient')
ax.set_ylabel('Features')

# Dashed vertical guide lines make the bar lengths easier to read off.
ax.grid(axis='x', linestyle='--', alpha=0.6)
plt.show()

In [None]:
# Pairwise correlations among the feature columns held in X.
corr_matrix = X.corr()

# Font size for the labels (set through matplotlib's global rc settings).
plt.rc('font', size=8)

# One tick per feature, labelled with the feature's name on both axes.
feature_labels = corr_matrix.columns
tick_positions = np.arange(len(feature_labels))

# Draw the matrix as a colour grid on a fixed [-1, 1] scale.
plt.figure(figsize=(12, 12))
plt.imshow(corr_matrix, vmin=-1, vmax=1, cmap='coolwarm', interpolation='nearest')
plt.colorbar()
plt.xticks(tick_positions, feature_labels, rotation=45)
plt.yticks(tick_positions, feature_labels)
plt.title('Correlation Matrix between features')
plt.tight_layout()
plt.show()

In [None]:
# Plain-text dump of the feature-to-feature correlation matrix.
print(corr_matrix)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor  # Handling multicollinearity
from statsmodels.tools.tools import add_constant

# Variance inflation factor (VIF) for every feature in X.
#
# variance_inflation_factor regresses one column on all the others with plain OLS.
# Without an intercept column that regression uses the uncentred R^2, which inflates
# the VIF of any feature whose mean is far from zero. An explicit constant is
# therefore prepended to the design matrix; has_constant='add' forces the column to
# be added even if X already holds a constant-valued column, so the feature
# positions below are always offset by exactly one.
design_matrix = add_constant(X, prepend=True, has_constant='add').to_numpy(dtype=float)

# VIF dataframe: one row per feature (the intercept at position 0 is not reported).
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(design_matrix, position)
                   for position in range(1, design_matrix.shape[1])]

print(vif_data)

In [None]:
# Feature columns removed from X before modelling.
collinear_features = [
    'room_shared',
    'room_private',
    'room_type_Private room',
    'cleanliness_rating',
    'room_type_Shared room',
]
X = X.drop(columns=collinear_features)
X.head()

In [None]:
# Rebind X to a single predictor (person_capacity) and y to the target (realSum),
# both as NumPy arrays, and print them.
# NOTE(review): this replaces the X built in the preceding cells (the feature frame
# left after the column drop), so everything below is fitted on person_capacity
# alone -- confirm this is intended.
X = amsterdam_df['person_capacity'].values
y = amsterdam_df['realSum'].values
print(X)
print(y)

In [None]:
# Reshape X into a single-column 2-D array (one row per sample); -1 lets NumPy
# infer the number of rows.
X = X.reshape(-1,1)
print(X)

In [None]:
# Hold out 20% of the samples for testing (80% for training); random_state is fixed
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, test_size=.2, random_state=100)

In [None]:
# Report the dimensions of each split. The labels say "shape", so print the
# arrays' .shape tuples rather than the full array contents.
print(f'X_train shape {X_train.shape}')
print(f'X_test shape {X_test.shape}')
print(f'y_train shape {y_train.shape}')
print(f'y_test shape {y_test.shape}')

In [None]:
# Scatter of the training split: the feature on the x-axis, the price on the y-axis.
plt.scatter(x=X_train, y=y_train)
plt.title('Amsterdam Airbnb Training Data')
plt.xlabel('Person Capacity')
plt.ylabel('Price of Airbnb')
plt.show()

In [None]:
# Fit an ordinary least-squares baseline on the training split and predict the
# held-out test split.
lm = LinearRegression()
lm.fit(X_train, y_train)
y_predict = lm.predict(X_test)

# LinearRegression.score returns the coefficient of determination (R^2), not a
# classification accuracy: it can be negative and is not a percentage of correct
# predictions, so it is reported as R^2.
print(f'Train R^2: {lm.score(X_train, y_train):.4f}')
print(f'Test R^2: {lm.score(X_test, y_test):.4f}')

# Report the test-set error with the same metric used for the Random Forest model
# so the two models can be compared directly.
print(f'Test Mean Squared Error: {metrics.mean_squared_error(y_test, y_predict)}')

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Build a 100-tree Random Forest regressor (seeded for repeatability) and fit it on
# the training split; fit() returns the fitted estimator itself.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the held-out test split.
y_pred = rf_model.predict(X_test)

# Score the predictions against the true test prices.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Optional visual check: actual price (x) against predicted price (y).
plt.scatter(y_test, y_pred, alpha=0.5)
plt.title('Actual vs. Predicted Booking Prices')
plt.xlabel('Actual Booking Price')
plt.ylabel('Predicted Booking Price')
plt.show()

In [None]:
In this notebook, the models overfit considerably; I should have standardized the features before getting to this point. I will revisit the notebook and identify the best model by comparing the Linear Regression and Random Forest Regression models.