In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import numpy as np

In [10]:
# Locations of the two source CSVs: weekday listings and weekend listings.
week_days_path = "./data/paris_weekdays_district.csv"
week_ends_path = "./data/paris_weekends_district.csv"

# Read each file using its first column as the index, and tag every row with a
# 0/1 `is_weekend` flag so the file of origin survives the concatenation below.
paris_weekdays = pd.read_csv(week_days_path, index_col=0).assign(is_weekend=0)
paris_weekends = pd.read_csv(week_ends_path, index_col=0).assign(is_weekend=1)

# Stack weekday rows on top of weekend rows; ignore_index=True discards the
# per-file indices and renumbers the combined rows from 0.
paris_all = pd.concat(
    (paris_weekdays, paris_weekends),
    ignore_index=True,
)

print("Shape of combined dataset:", paris_all.shape)

Shape of combined dataset: (6688, 21)


In [11]:
# Find the 3 rows whose `label` value lies farthest from the column mean
# (either extreme), measured by absolute z-score.
label = 'metro_dist'

# NOTE: the *_realSum names are kept unchanged in case other cells refer to
# them; they hold the statistics of whichever column `label` selects.
mean_realSum = paris_all[label].mean()
median_realSum = paris_all[label].median()
std_realSum = paris_all[label].std()

# |value - mean| / std, stored on the frame so nlargest can rank rows by it.
paris_all['z_score'] = np.abs((paris_all[label] - mean_realSum) / std_realSum)

# nlargest already returns at most 3 rows, ordered by descending z-score,
# so no further slicing is needed before printing.
top_3_outliers = paris_all.nlargest(3, 'z_score')

# Build the header from `label` so it always names the column actually analysed.
print(f"Top 3 biggest outliers on {label} attribute:")

print()

print(f"Mean: {mean_realSum:.2f}, Median: {median_realSum:.2f}, Std: {std_realSum:.2f}\n")
print(top_3_outliers[['realSum', 'metro_dist', 'room_type', 'person_capacity', 'district', 'dist', 'bedrooms', "cleanliness_rating", "guest_satisfaction_overall"]])

Top 3 biggest outliers on realSum attribute:

Mean: 0.23, Median: 0.21, Std: 0.12

         realSum  metro_dist        room_type  person_capacity  district  \
2853  510.066176    1.154777  Entire home/apt              2.0         0   
6416  510.299189    1.154771  Entire home/apt              2.0         0   
1677  178.488210    1.045861     Private room              2.0         0   

          dist  bedrooms  cleanliness_rating  guest_satisfaction_overall  
2853  6.855428         1                10.0                       100.0  
6416  6.855421         1                10.0                       100.0  
1677  5.210635         1                10.0                        95.0  


In [14]:
# Narrow view of the current top-3 outlier rows: keep only four columns so the
# table fits on one line per row.
print(
    top_3_outliers
    .head(3)
    .loc[:, ['metro_dist', 'realSum', 'dist', 'guest_satisfaction_overall']]
)

      metro_dist     realSum      dist  guest_satisfaction_overall
2853    1.154777  510.066176  6.855428                       100.0
6416    1.154771  510.299189  6.855421                       100.0
1677    1.045861  178.488210  5.210635                        95.0


In [12]:
# Rebind top_3_outliers to the three rows with the largest raw `metro_dist`
# values (largest first), then dump every column of those rows.
top_3_outliers = paris_all.nlargest(n=3, columns='metro_dist')
print(top_3_outliers)

         realSum        room_type  room_shared  room_private  person_capacity  \
2853  510.066176  Entire home/apt        False         False              2.0   
6416  510.299189  Entire home/apt        False         False              2.0   
1677  178.488210     Private room        False          True              2.0   

      host_is_superhost  multi  biz  cleanliness_rating  \
2853              False      0    0                10.0   
6416              False      0    0                10.0   
1677              False      0    0                10.0   

      guest_satisfaction_overall  ...  metro_dist  attr_index  \
2853                       100.0  ...    1.154777  229.908990   
6416                       100.0  ...    1.154771  229.908650   
1677                        95.0  ...    1.045861  138.576975   

      attr_index_norm  rest_index  rest_index_norm      lng       lat  \
2853        11.191892  381.303571        22.405221  2.26551  48.87872   
6416        11.179325  381.3041

In [13]:
# Show every distinct value of guest_satisfaction_overall and how often each occurs.
label = "guest_satisfaction_overall"
print(f"Unique values for {label}:")
# Drop missing values before sorting: NaN has no defined position under
# sorted(), and nunique() / value_counts() below already ignore it by default,
# so this keeps all three summaries consistent. .tolist() converts NumPy
# scalars to plain Python numbers so the list prints as `20.0` instead of
# `np.float64(20.0)`.
print(sorted(paris_all[label].dropna().unique().tolist()))
print(f"\nTotal unique values: {paris_all[label].nunique()}")
print("\nValue counts:")
# sort_index() orders the counts by rating value rather than by frequency.
print(paris_all[label].value_counts().sort_index())

Unique values for guest_satisfaction_overall:
[np.float64(20.0), np.float64(40.0), np.float64(50.0), np.float64(53.0), np.float64(56.0), np.float64(57.0), np.float64(60.0), np.float64(63.0), np.float64(64.0), np.float64(65.0), np.float64(67.0), np.float64(68.0), np.float64(69.0), np.float64(70.0), np.float64(71.0), np.float64(72.0), np.float64(73.0), np.float64(74.0), np.float64(75.0), np.float64(76.0), np.float64(77.0), np.float64(78.0), np.float64(79.0), np.float64(80.0), np.float64(81.0), np.float64(82.0), np.float64(83.0), np.float64(84.0), np.float64(85.0), np.float64(86.0), np.float64(87.0), np.float64(88.0), np.float64(89.0), np.float64(90.0), np.float64(91.0), np.float64(92.0), np.float64(93.0), np.float64(94.0), np.float64(95.0), np.float64(96.0), np.float64(97.0), np.float64(98.0), np.float64(99.0), np.float64(100.0)]

Total unique values: 44

Value counts:
guest_satisfaction_overall
20.0       15
40.0       16
50.0        7
53.0        4
56.0        1
57.0        2
60.0     