In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Lift pandas' column-display limit so that frames rendered in this notebook
# show every column instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

In [None]:
# Read the training and test splits from the shared data directory, then show
# summary statistics of the training frame as the cell output.
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_location.csv', '../data/test_location.csv')
)
df.describe()

In [None]:
# Columns removed from both frames before any analysis.
drop_cols = ['pet-friendly']

# The training frame is expected to contain every listed column (a missing one
# raises), whereas the test frame only drops the ones it actually has.
df.drop(columns=drop_cols, inplace=True)
test_df.drop(
    columns=[name for name in drop_cols if name in test_df.columns],
    inplace=True,
)

In [None]:
# Absolute pairwise correlations between all columns of the training frame,
# rendered as an annotated heatmap on a very large canvas.
correlation_matrix = df.corr().abs()

fig, ax = plt.subplots(figsize=(200, 200))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Restrict the correlation analysis to columns whose name contains 'review'.
review_columns = list(filter(lambda name: 'review' in name, df.columns))

correlation_matrix = df[review_columns].corr().abs()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix for review features')
plt.show()

In [None]:
# Standardise the review-related features and run an exploratory PCA on them.
reviews_pca_variables = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_value', 'review_sentiment']

# The scaler is fitted on the training frame only and the standardised values
# are written back into `df`. The same fitted transform is then applied to
# `test_df`; otherwise the exported train and test files would carry these
# columns on different scales (standardised vs. raw).
scaler = StandardScaler()
df[reviews_pca_variables] = scaler.fit_transform(df[reviews_pca_variables])
test_df[reviews_pca_variables] = scaler.transform(test_df[reviews_pca_variables])

# Exploratory only: the projected components are kept in `pca_result` and are
# not written back into either frame by this cell.
pca = PCA(n_components=4)
pca_result = pca.fit_transform(df[reviews_pca_variables])

# Share of variance captured by each of the four retained components.
print(pca.explained_variance_ratio_)

In [None]:
# Author's rationale: host responsiveness and host response time are highly
# correlated and both describe the host's willingness to communicate with
# guests, so only one is kept. Remove the response-time column from both frames.
for frame in (df, test_df):
    frame.drop(columns=['host_response_time'], inplace=True)

In [None]:
# Restrict the correlation analysis to columns whose name contains 'listing'.
listing_column = list(filter(lambda name: 'listing' in name, df.columns))

correlation_matrix = df[listing_column].corr().abs()

fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix for listings features')
plt.show()

In [None]:
# Author's rationale: calculated_host_listings_count already carries the same
# information, so the two other host listing counters are removed from both
# frames.
for frame in (df, test_df):
    frame.drop(columns=['host_listings_count', 'host_total_listings_count'], inplace=True)

In [None]:
# Divide every availability_<N> column by its window length N in both frames
# (presumably turning day counts into a 0-1 fraction; confirm against the data).
availability_range = [30, 60, 90, 365]
for frame in (df, test_df):
    for window in availability_range:
        column = 'availability_' + str(window)
        frame[column] = frame[column] / window

In [None]:
# Restrict the correlation analysis to columns whose name contains 'availab'.
availability_cols = list(filter(lambda name: 'availab' in name, df.columns))

correlation_matrix = df[availability_cols].corr().abs()

fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix for availability features')
plt.show()

In [None]:
# Overlay the distributions of the three short-horizon availability features.
# NOTE: `availability_range` is rebound here from the integer window lengths to
# a list of column names; the availability PCA cell reads this list.
availability_range = ['availability_30', 'availability_60', 'availability_90']
plt.figure(figsize=(20, 10))
for availability in availability_range:
    sns.histplot(df[availability], kde=True, bins=100, label=availability)

# Each histogram is drawn with a `label`, but labels are only rendered once a
# legend is requested; without it the overlaid series cannot be told apart.
plt.legend()
# All three series share the axis, so use a neutral label instead of the name
# of whichever column happened to be plotted last.
plt.xlabel('availability')
plt.title('Distribution of availability features')
plt.show()

In [None]:
# Replace the availability columns listed in `availability_range` with the
# first principal component of their standardised values.
scaler = StandardScaler()
pca = PCA(n_components=2)

# Scaler and PCA are both fitted on the training frame only.
df[availability_range] = scaler.fit_transform(df[availability_range])
pca.fit(df[availability_range])

print(pca.explained_variance_ratio_)

# The test frame reuses the scaler fitted on the training data.
test_df[availability_range] = scaler.transform(test_df[availability_range])

# Keep only the first component and discard the source columns in both frames.
for frame in (df, test_df):
    frame['availability_pca1'] = pca.transform(frame[availability_range])[:, 0]
    frame.drop(columns=availability_range, inplace=True)

In [None]:
# Add up the individual amenity columns into a single 'amenities_count' column
# on both frames. The fold is seeded with the first column so the result is
# exactly the chained `+` of the columns, in the listed order.
amenity_columns = ['air conditioning', 'breakfast', 'dryer', 'gym', 'kitchen', 'parking', 'pool', 'tv', 'view', 'washer']
first_amenity, *other_amenities = amenity_columns

for frame in (test_df, df):
    frame['amenities_count'] = sum(
        (frame[name] for name in other_amenities),
        frame[first_amenity],
    )

In [None]:
# Recompute the absolute correlation matrix on the training frame as it stands
# at this point in the notebook and draw it as an annotated heatmap.
correlation_matrix = df.corr().abs()

fig, ax = plt.subplots(figsize=(200, 200))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Show only the strongly related column pairs of the training frame.
correlation_matrix = df.corr().abs()

# Keep entries strictly between 0.5 and 1 (the upper bound removes the
# diagonal); everything else becomes NaN. Rows and columns left without any
# surviving entry are then discarded.
highly_correlated = (
    correlation_matrix
    .where((correlation_matrix > 0.5) & (correlation_matrix < 1))
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(highly_correlated, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Highly Correlated Columns')
plt.show()

In [None]:
# Remove the same two columns from the training and the test frame.
for frame in (df, test_df):
    frame.drop(columns=['host_has_profile_pic', 'room_type_Hotel room'], inplace=True)

In [None]:
# Display summary statistics of the training frame in its current state, as a
# last visual check before it is written to disk.
df.describe()

In [None]:
# Persist the processed frames as CSV files, without the index column.
for frame, destination in (
    (df, '../data/train_final.csv'),
    (test_df, '../data/test_final.csv'),
):
    frame.to_csv(destination, index=False)