In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from scipy.stats.mstats import winsorize
from scipy.stats import probplot


df = pd.read_csv('rentprediction_dataset_v5.csv')

y = df['rent']
X = df.drop(columns={'rent'})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

df.describe()

Unnamed: 0,rent,age,sqmtr,newbuild,rooms,elabel,parking,bedrooms,bathrooms,floor,...,eucl_dist_to_Station Amsterdam Lelylaan,eucl_dist_to_Station RAI Amsterdam,eucl_dist_to_Station Amsterdam Zuid/WTC,eucl_dist_to_Station Amsterdam Bijlmer,eucl_dist_to_Station Amsterdam Holendrecht,eucl_dist_to_Station Weesp,eucl_dist_to_Station Amsterdam Centraal,eucl_dist_to_nearest_park,eucl_dist_to_nearest_train_station,eucl_dist_to_nearest_tramsubway
count,881.0,881.0,881.0,881.0,881.0,881.0,881.0,881.0,881.0,881.0,...,881.0,881.0,881.0,881.0,881.0,881.0,881.0,881.0,881.0,881.0
mean,2652.91714,61.398603,91.702885,0.288309,3.154932,3.76492,0.090806,2.051294,1.636809,4.705616,...,0.060151,0.051768,0.050915,0.092516,0.109739,0.17155,0.045874,0.011267,0.016273,0.004452
std,1417.46715,72.777349,41.420398,0.453233,1.159004,1.367979,0.287496,0.893739,0.445677,4.713962,...,0.036419,0.025751,0.027222,0.031364,0.033585,0.04301,0.026433,0.005718,0.011168,0.003596
min,1045.0,0.0,24.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.002695,0.003135,0.002761,0.004402,0.0045,0.047223,0.003464,0.000494,0.002454,0.000212
25%,1791.0,0.0,65.0,0.0,2.0,3.0,0.0,1.222252,1.5,2.0,...,0.030588,0.030851,0.030333,0.072262,0.09014,0.149139,0.02532,0.006946,0.006441,0.002167
50%,2300.0,26.0,77.85932,0.0,3.0,4.0,0.0,2.0,1.5,3.0,...,0.051895,0.050054,0.047312,0.088812,0.107277,0.17125,0.040498,0.010667,0.015036,0.003692
75%,2950.0,118.0,107.0,1.0,4.0,5.0,0.0,2.0,1.677922,6.042212,...,0.076476,0.075391,0.063908,0.115485,0.133394,0.207197,0.067956,0.015586,0.021504,0.00599
max,12500.0,509.0,350.0,1.0,9.0,8.0,1.0,7.0,5.0,31.0,...,0.179307,0.133469,0.147381,0.179422,0.195561,0.274936,0.131884,0.042465,0.074855,0.043112


In [37]:
#logrithmic transformation rent and removing the outliers.
df['rent_log'] = np.log(df['rent'])

Q1 = df['rent_log'].quantile(0.25)
Q3 = df['rent_log'].quantile(0.75)
IQR = Q3 - Q1

# Determine bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
df['is_outlier'] = (df['rent_log'] < lower_bound) | (df['rent_log'] > upper_bound)

df_cleaned = df[~df['is_outlier']].copy()

# Drop the 'is_outlier' column from the cleaned DataFrame
df_cleaned.drop(columns=['is_outlier'], inplace=True)
df = df_cleaned
print(f"Number of outliers: {num_outliers}")

Number of outliers: 31


In [38]:
df.describe()

Unnamed: 0,rent,age,sqmtr,newbuild,rooms,elabel,parking,bedrooms,bathrooms,floor,...,eucl_dist_to_Station RAI Amsterdam,eucl_dist_to_Station Amsterdam Zuid/WTC,eucl_dist_to_Station Amsterdam Bijlmer,eucl_dist_to_Station Amsterdam Holendrecht,eucl_dist_to_Station Weesp,eucl_dist_to_Station Amsterdam Centraal,eucl_dist_to_nearest_park,eucl_dist_to_nearest_train_station,eucl_dist_to_nearest_tramsubway,rent_log
count,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,...,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0
mean,2448.729412,59.421376,87.241461,0.295294,3.067641,3.782629,0.089412,1.996694,1.59752,4.770872,...,0.052604,0.05181,0.09288,0.110043,0.171614,0.046497,0.011366,0.016286,0.004518,7.744303
std,898.781515,71.852881,32.590348,0.456443,1.022914,1.351196,0.285505,0.830249,0.378155,4.756608,...,0.025785,0.02719,0.031807,0.034094,0.04373,0.026604,0.005765,0.011327,0.003634,0.336672
min,1045.0,0.0,24.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.003135,0.002761,0.004402,0.0045,0.047223,0.003464,0.000494,0.002454,0.000212,6.951772
25%,1760.0,0.0,64.0,0.0,2.0,3.0,0.0,1.0,1.5,2.0,...,0.031841,0.031303,0.071707,0.089863,0.147668,0.025699,0.00698,0.006266,0.0022,7.473069
50%,2250.0,23.0,75.496533,0.0,3.0,4.0,0.0,2.0,1.5,3.0,...,0.050293,0.048912,0.089733,0.107595,0.171189,0.041639,0.010751,0.014919,0.003817,7.718685
75%,2850.0,118.0,103.659958,1.0,3.30589,5.0,0.0,2.0,1.677914,6.379518,...,0.07544,0.06391,0.117457,0.135829,0.208171,0.067956,0.016444,0.022119,0.00599,7.955074
max,6000.0,509.0,252.0,1.0,8.0,8.0,1.0,7.0,3.5,31.0,...,0.133469,0.147381,0.179422,0.195561,0.274936,0.131884,0.042465,0.074855,0.043112,8.699515


In [39]:
#logrithmic transformation rent and removing the outliers.
df['rent_log'] = np.log(df['rent'])

Q1 = df['rent_log'].quantile(0.25)
Q3 = df['rent_log'].quantile(0.75)
IQR = Q3 - Q1

# Determine bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
df['is_outlier'] = (df['rent_log'] < lower_bound) | (df['rent_log'] > upper_bound)

df_cleaned = df[~df['is_outlier']].copy()

# Drop the 'is_outlier' column from the cleaned DataFrame
df_cleaned.drop(columns=['is_outlier'], inplace=True)
df = df_cleaned
print(f"Number of outliers: {num_outliers}")

1044.9998282031518


In [23]:
import pandas as pd
from scipy.stats import shapiro

# Function to apply Shapiro-Wilk test
def apply_shapiro(df):
    results = {}
    for column in df.columns:
        stat, p_value = shapiro(df[column])
        results[column] = {'Statistic': stat, 'p-value': p_value}
    return pd.DataFrame(results).T

# Apply the test to each column
shapiro_results = apply_shapiro(df)
for row in shapiro_results.index:
    if df['p_value'] < 0.05:
        print (df.columns())


KeyError: 'p_value'

In [21]:
Q1 = df['rent'].quantile(0.25)
Q3 = df['rent'].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR

# Count the number of outliers above the upper limit
outliers_above_upper = df[df['rent'] > upper_bound]
count_above_upper = outliers_above_upper.shape[0]

# Print the upper limit and count of outliers above the upper limit
print("Upper Limit:", upper_bound)
print("Number of outliers above the upper limit:", count_above_upper)

Upper Limit: 4689.25
Number of outliers above the upper limit: 60


In [20]:
Q1 = df['rent_log'].quantile(0.25)
Q3 = df['rent_log'].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR

# Count the number of outliers above the upper limit
outliers_above_upper = df[df['rent_log'] > upper_bound]
count_above_upper = outliers_above_upper.shape[0]

# Print the upper limit and count of outliers above the upper limit
print("Upper Limit:", upper_bound)
print("Number of outliers above the upper limit:", count_above_upper)

Upper Limit: 8.73852589766303
Number of outliers above the upper limit: 32
