# Import Essential Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Check out the Data

## Training data

In [None]:
# Load the training set (expects train_data.csv in the working directory)
# and preview its first rows.
train_data_raw = pd.read_csv('train_data.csv')
train_data_raw.head()

In [None]:
# Column dtypes and non-null counts, to spot columns with missing values.
train_data_raw.info()

In [None]:
# Summary statistics of the training columns.
train_data_raw.describe()

In [None]:
# Show the row(s) holding the largest minimum_nights value.
min_nights_col = train_data_raw['minimum_nights']
train_data_raw[min_nights_col == min_nights_col.max()]

In [None]:
# Show the row(s) holding the largest number_of_reviews value.
review_count_col = train_data_raw['number_of_reviews']
train_data_raw[review_count_col == review_count_col.max()]

In [None]:
# Show the row(s) holding the largest reviews_per_month value.
monthly_reviews_col = train_data_raw['reviews_per_month']
train_data_raw[monthly_reviews_col == monthly_reviews_col.max()]

In [None]:
# Show the row(s) holding the largest calculated_host_listings_count value.
host_listings_col = train_data_raw['calculated_host_listings_count']
train_data_raw[host_listings_col == host_listings_col.max()]

## Testing data

In [None]:
# Load the test set (expects test_data.csv in the working directory)
# and preview its first rows.
test_data_raw = pd.read_csv('test_data.csv')
test_data_raw.head()

In [None]:
# Column dtypes and non-null counts for the test set.
test_data_raw.info()

In [None]:
# Summary statistics of the test columns.
test_data_raw.describe()

# PreProcessing

In [None]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw listings frame and turn it into model-ready columns.

    Steps, in order:
      1. drop every row that contains a missing value;
      2. drop the ``id``, ``name``, ``host_id`` and ``host_name`` columns;
      3. replace ``last_review`` with the whole number of days between each
         row's review date and the most recent review date in this frame;
      4. one-hot encode ``neighbourhood_group`` and ``room_type`` (the
         original columns are removed, the dummy columns are appended).

    Args:
        df: Raw frame containing at least the columns named above.

    Returns:
        A new frame; the frame passed in is left untouched.
    """
    df = df.dropna(axis="rows")
    df = df.drop(columns=['id', 'name', 'host_id', 'host_name'])

    review_dates = pd.to_datetime(df['last_review'])
    # Series.max() keeps the reduction inside pandas instead of iterating the
    # column element by element with the builtin max().
    days_since_review = (review_dates.max() - review_dates).dt.days
    # assign() builds a new frame, so no chained-assignment warning is raised.
    df = df.assign(last_review=days_since_review)

    # The columns are given as a list, not a set: pandas 2.0 rejects set
    # indexers, and a set gives no guaranteed order for the dummy columns.
    df = pd.get_dummies(df, columns=['neighbourhood_group', 'room_type'])

    return df

In [None]:
# Apply the same cleaning/encoding to both raw frames.
# Note: preprocess() measures last_review against the latest review date of the
# frame it is given, so the train and test frames each use their own reference date.
train_data = preprocess(train_data_raw)
test_data = preprocess(test_data_raw)

# Exploratory Data Analysis (EDA)

In [None]:
# Plot listings by longitude/latitude, coloured by neighbourhood_group.
plt.figure(figsize=(15, 6))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=train_data_raw.longitude, y=train_data_raw.latitude,
                hue=train_data_raw.neighbourhood_group)
# NOTE(review): plt.ioff() only switches interactive mode off; presumably it is
# here to suppress the cell's text output — confirm whether plt.show() was meant.
plt.ioff()

In [None]:
# Pairwise plots across the preprocessed training columns.
sns.pairplot(train_data)

In [None]:
# Box plot of price to expose outliers. `x=` is explicit because the meaning of
# a positional vector differs between seaborn 0.11 (x) and 0.12+ (data).
sns.boxplot(x=train_data['price'])

In [None]:
# Box plot of minimum_nights to expose outliers. `x=` is explicit because the
# meaning of a positional vector differs between seaborn 0.11 (x) and 0.12+ (data).
sns.boxplot(x=train_data['minimum_nights'])

In [None]:
# Box plot of number_of_reviews to expose outliers. `x=` is explicit because the
# meaning of a positional vector differs between seaborn 0.11 (x) and 0.12+ (data).
sns.boxplot(x=train_data['number_of_reviews'])

In [None]:
# Box plot of reviews_per_month to expose outliers. `x=` is explicit because the
# meaning of a positional vector differs between seaborn 0.11 (x) and 0.12+ (data).
sns.boxplot(x=train_data['reviews_per_month'])

In [None]:
# Box plot of calculated_host_listings_count to expose outliers. `x=` is explicit
# because the meaning of a positional vector differs between seaborn 0.11 (x)
# and 0.12+ (data).
sns.boxplot(x=train_data['calculated_host_listings_count'])

In [None]:
# Kendall rank correlation between the numeric (and boolean dummy) columns.
# The column selection is explicit because pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr and raises instead.
# NOTE(review): assumes train_data may still hold text columns — confirm.
numeric_train_data = train_data.select_dtypes(include=['number', 'bool'])
corr = numeric_train_data.corr(method='kendall')
plt.figure(figsize=(15, 10))
# Colour scale is centred on 0 and pinned to the full [-1, 1] correlation range.
sns.heatmap(corr, annot=True, center=0, vmin=-1, vmax=1)

# Model Training

## Preparing

In [None]:
# List the column names available after preprocessing.
train_data.columns

### Select features

In [None]:
# Predictor columns: coordinates, host listing count, and a subset of the
# one-hot neighbourhood_group / room_type dummies. Target is the price column.
feature_columns = [
    'latitude',
    'longitude',
    'calculated_host_listings_count',
    'neighbourhood_group_Bronx',
    'neighbourhood_group_Brooklyn',
    'neighbourhood_group_Manhattan',
    'neighbourhood_group_Queens',
    'room_type_Entire home/apt',
    'room_type_Private room',
]
X = train_data[feature_columns]
Y = train_data['price']

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

### Metrics

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def cross_val(model):
    """Return the mean 10-fold cross-validation score of ``model``.

    The folds are drawn from the module-level ``X`` and ``Y`` (the full,
    unscaled feature frame and target), not from the train/test split.
    No ``scoring`` is passed, so the estimator's default scorer is used.
    """
    return cross_val_score(model, X, Y, cv=10).mean()

def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R2 for the given predictions, then a separator line.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions aligned with ``true``.
    """
    squared_error = metrics.mean_squared_error(true, predicted)
    report = (
        ('MAE:', metrics.mean_absolute_error(true, predicted)),
        ('MSE:', squared_error),
        # RMSE is the square root of the MSE computed above.
        ('RMSE:', np.sqrt(squared_error)),
        ('R2 Square', metrics.r2_score(true, predicted)),
    )
    for label, value in report:
        print(label, value)
    print('__________________________________')
    
def evaluate(true, predicted):
    """Compute regression metrics for the given predictions.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions aligned with ``true``.

    Returns:
        Tuple ``(mae, mse, rmse, r2_square)``.
    """
    squared_error = metrics.mean_squared_error(true, predicted)
    return (
        metrics.mean_absolute_error(true, predicted),
        squared_error,
        np.sqrt(squared_error),
        metrics.r2_score(true, predicted),
    )

### PreProcessing

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardize the features. The scaler is fitted on the training split only and
# then reused for the hold-out split.
# X_train / X_test are overwritten in place, so re-running this cell would scale
# the already-scaled arrays again.
pipeline = Pipeline(steps=[('std_scalar', StandardScaler())])

X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the standardized features.
# `normalize=True` is dropped: the argument was removed from LinearRegression in
# scikit-learn 1.2, and the inputs were already scaled by the StandardScaler pipeline.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
test_pred = lin_reg.predict(X_test)
train_pred = lin_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

# Start the model-comparison table with this model's hold-out metrics.
# NOTE(review): the "Cross Validation" value comes from cross_val(), which fits a
# fresh default LinearRegression on the unscaled X/Y, not on X_train.
results_df = pd.DataFrame(
    data=[["Linear Regression", *evaluate(y_test, test_pred), cross_val(LinearRegression())]],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"],
)

In [None]:
# Predicted vs. true target values on the hold-out split.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=y_test, y=test_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")

In [None]:
# Density of the hold-out residuals (true minus predicted).
residuals = y_test - test_pred
sns.kdeplot(residuals, fill=True)
plt.xlabel("Residual")

## Robust Regression - Random Sample Consensus - RANSAC

In [None]:
from sklearn.linear_model import RANSACRegressor

# RANSAC wrapped around a linear model.
# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` (old name removed in scikit-learn 1.3), and the
# positional form works with both spellings.
model = RANSACRegressor(LinearRegression(), max_trials=100)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

# NOTE(review): the "Cross Validation" value is computed by cross_val() with a
# default RANSACRegressor on the unscaled X/Y, not with the model fitted above.
results_df_2 = pd.DataFrame(
    data=[["Robust Regression", *evaluate(y_test, test_pred), cross_val(RANSACRegressor())]],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"],
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)

In [None]:
# Predicted vs. true target values on the hold-out split.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=y_test, y=test_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")

In [None]:
# Density of the hold-out residuals (true minus predicted).
residuals = y_test - test_pred
sns.kdeplot(residuals, fill=True)
plt.xlabel("Residual")

## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression on the standardized features.
model = Ridge(alpha=100, solver='cholesky', tol=0.0001, random_state=0)
model.fit(X_train, y_train)

# The extra `pred = model.predict(X_test)` call was removed: it duplicated
# test_pred and its result was never used.
test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

# NOTE(review): the "Cross Validation" value is computed by cross_val() with a
# default Ridge on the unscaled X/Y, not with the alpha=100 model fitted above.
results_df_2 = pd.DataFrame(
    data=[["Ridge Regression", *evaluate(y_test, test_pred), cross_val(Ridge())]],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"],
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)

In [None]:
# Predicted vs. true target values on the hold-out split.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=y_test, y=test_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")

In [None]:
# Density of the hold-out residuals (true minus predicted).
residuals = y_test - test_pred
sns.kdeplot(residuals, fill=True)
plt.xlabel("Residual")

## LASSO Regression

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression on the standardized features.
# NOTE(review): positive=True constrains every coefficient to be non-negative —
# confirm this restriction is intended for standardized inputs.
model = Lasso(alpha=0.1,
              precompute=True,
              positive=True,
              selection='random',
              random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

# NOTE(review): the "Cross Validation" value is computed by cross_val() with a
# default Lasso on the unscaled X/Y, not with the model fitted above.
results_df_2 = pd.DataFrame(
    data=[["Lasso Regression", *evaluate(y_test, test_pred), cross_val(Lasso())]],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"],
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)

In [None]:
# Predicted vs. true target values on the hold-out split.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=y_test, y=test_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")

In [None]:
# Density of the hold-out residuals (true minus predicted).
residuals = y_test - test_pred
sns.kdeplot(residuals, fill=True)
plt.xlabel("Residual")

## Elastic Net

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic Net (l1_ratio=0.9) on the standardized features.
model = ElasticNet(alpha=0.1, l1_ratio=0.9, selection='random', random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

# NOTE(review): the "Cross Validation" value is computed by cross_val() with a
# default ElasticNet on the unscaled X/Y, not with the model fitted above.
results_df_2 = pd.DataFrame(
    data=[["Elastic Net Regression", *evaluate(y_test, test_pred), cross_val(ElasticNet())]],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"],
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)

In [None]:
# Predicted vs. true target values on the hold-out split.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=y_test, y=test_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")

In [None]:
# Density of the hold-out residuals (true minus predicted).
residuals = y_test - test_pred
sns.kdeplot(residuals, fill=True)
plt.xlabel("Residual")

## Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the standardized features, followed by OLS.
poly_reg = PolynomialFeatures(degree=2)

# The expansion is fitted on the training split and reused for the hold-out split.
X_train_2_d = poly_reg.fit_transform(X_train)
X_test_2_d = poly_reg.transform(X_test)

# `normalize=True` is dropped: the argument was removed from LinearRegression in
# scikit-learn 1.2. Note that this rebinds the earlier `lin_reg` name.
lin_reg = LinearRegression()
lin_reg.fit(X_train_2_d, y_train)

test_pred = lin_reg.predict(X_test_2_d)
train_pred = lin_reg.predict(X_train_2_d)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

# The trailing 0 is a placeholder: no cross-validation is run for this model.
# NOTE(review): the label is spelled "Polynomail"; left unchanged because it is
# a runtime string that ends up in the results table and chart.
results_df_2 = pd.DataFrame(
    data=[["Polynomail Regression", *evaluate(y_test, test_pred), 0]],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', 'Cross Validation'],
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)

In [None]:
# Predicted vs. true target values on the hold-out split.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=y_test, y=test_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")

In [None]:
# Density of the hold-out residuals (true minus predicted).
residuals = y_test - test_pred
sns.kdeplot(residuals, fill=True)
plt.xlabel("Residual")

## Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDRegressor

# Linear model fitted by stochastic gradient descent, without a penalty term.
# No random_state is set, so results can differ between runs.
sgd_reg = SGDRegressor(n_iter_no_change=250, penalty=None, eta0=0.0001, max_iter=100000)
sgd_reg.fit(X_train, y_train)

test_pred = sgd_reg.predict(X_test)
train_pred = sgd_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

# The trailing 0 is a placeholder: no cross-validation is run for this model.
results_df_2 = pd.DataFrame(
    data=[["Stochastic Gradient Descent", *evaluate(y_test, test_pred), 0]],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', 'Cross Validation'],
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)

In [None]:
# Predicted vs. true target values on the hold-out split.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=y_test, y=test_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")

In [None]:
# Density of the hold-out residuals (true minus predicted).
residuals = y_test - test_pred
sns.kdeplot(residuals, fill=True)
plt.xlabel("Residual")

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 1000 trees.
# No random_state is set, so results can differ between runs.
rf_reg = RandomForestRegressor(n_estimators=1000)
rf_reg.fit(X_train, y_train)

test_pred = rf_reg.predict(X_test)
train_pred = rf_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

# The trailing 0 is a placeholder: no cross-validation is run for this model.
results_df_2 = pd.DataFrame(
    data=[["Random Forest Regressor", *evaluate(y_test, test_pred), 0]],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', 'Cross Validation'],
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)

In [None]:
# Predicted vs. true target values on the hold-out split.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=y_test, y=test_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")

In [None]:
# Density of the hold-out residuals (true minus predicted).
residuals = y_test - test_pred
sns.kdeplot(residuals, fill=True)
plt.xlabel("Residual")

## Support Vector Machine

In [None]:
from sklearn.svm import SVR

# Support vector regression with an RBF kernel on the standardized features.
svm_reg = SVR(kernel='rbf', C=1000000, epsilon=0.001)
svm_reg.fit(X_train, y_train)

test_pred = svm_reg.predict(X_test)
train_pred = svm_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

# The trailing 0 is a placeholder: no cross-validation is run for this model.
results_df_2 = pd.DataFrame(
    data=[["SVM Regressor", *evaluate(y_test, test_pred), 0]],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', 'Cross Validation'],
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)

In [None]:
# Predicted vs. true target values on the hold-out split.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=y_test, y=test_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")

In [None]:
# Density of the hold-out residuals (true minus predicted).
residuals = y_test - test_pred
sns.kdeplot(residuals, fill=True)
plt.xlabel("Residual")

# Models Comparison

In [None]:
# Display the metrics table accumulated across the model cells.
results_df

In [None]:
# Key the comparison table by model name, then chart the hold-out R2 per model.
# Re-running this cell fails because 'Model' is no longer a column afterwards.
results_df = results_df.set_index('Model')
r2_by_model = results_df['R2 Square']
r2_by_model.plot(kind='barh', figsize=(12, 8))