In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
# Lift the column cap on DataFrame reprs so wide frames are shown in full.
pd.options.display.max_columns = None

In [None]:
# Read the train and test CSVs (paths are relative to the notebook's
# working directory) into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_nlp2.csv', '../data/test_nlp2.csv')
)

In [None]:
# Keep only the first occurrence of each fully-identical training row.
# Only the training frame is deduplicated; test_df is left untouched.
train_df = train_df[~train_df.duplicated(keep='first')]

In [None]:
# Display summary statistics of the deduplicated training frame as the
# cell output.
train_df.describe()

In [None]:
# Per-column count of null entries in the training frame, restricted to the
# columns that have at least one null.
missing = train_df.isnull().sum().loc[lambda null_counts: null_counts > 0]
missing

In [None]:
# Histogram of each listed training column, one panel per column, drawn
# before any of the imputation cells below modify the frame.
cols = [
    'host_is_superhost',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
    'time_since_first_review',
    'time_since_last_review',
]
fig, ax = plt.subplots(3, 4, figsize=(20, 10))
ax = ax.flatten()
for panel, col in zip(ax, cols):
    train_df[col].hist(ax=panel)
    panel.set_title(col)
# The 3x4 grid provides 12 panels but only 11 columns are plotted; hide the
# leftover panel(s) so no empty axes frame is rendered.
for unused_panel in ax[len(cols):]:
    unused_panel.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Columns whose nulls are replaced by a constant 0 in both frames.
zero_filled_cols = ['host_is_superhost', 'has_availability']

# Columns whose nulls are replaced by that column's mean over the TRAINING
# frame; the same training mean is applied to the test frame.
mean_filled_cols = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
    'time_since_first_review',
    'time_since_last_review',
    'review_sentiment',
    'avg_review_length',
]

for frame in (train_df, test_df):
    # The flag records where review_scores_rating was null, so it has to be
    # derived before that column is filled further down.
    frame['no_reviews'] = frame['review_scores_rating'].isnull().astype(int)
    for col in zero_filled_cols:
        frame[col] = frame[col].fillna(0)

# Every mean is taken from the training frame before any mean-fill happens.
train_means = {col: train_df[col].mean() for col in mean_filled_cols}

for frame in (train_df, test_df):
    for col, train_mean in train_means.items():
        frame[col] = frame[col].fillna(train_mean)

In [None]:
# Re-check which training columns still contain nulls, with their counts.
missing = train_df.isnull().sum().loc[lambda null_counts: null_counts > 0]
missing

In [None]:
# Side-by-side bar charts of the value frequencies of the two host-response
# columns in the training frame, inspected before they are mean-filled below.
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
for panel, col in zip(ax, ['host_response_time', 'host_response_rate']):
    train_df[col].value_counts().plot.bar(ax=panel)
plt.show()

In [None]:
# Flag rows where host_response_rate is null; this must run before the
# column is filled below.
for frame in (train_df, test_df):
    frame['no_messages'] = frame['host_response_rate'].isnull().astype(int)

# Fill the host columns with their training-frame means in both frames.
# NOTE(review): .mean() needs numeric data — presumably host_response_time
# is already numerically encoded upstream; confirm.
host_cols = ['host_response_time', 'host_response_rate', 'host_acceptance_rate']
host_means = {col: train_df[col].mean() for col in host_cols}

for frame in (train_df, test_df):
    for col, train_mean in host_means.items():
        frame[col] = frame[col].fillna(train_mean)

In [None]:
# Fill nulls in bathrooms_shared with the first mode of the training column,
# using that same training value for the test frame.
most_common_shared = train_df['bathrooms_shared'].mode()[0]

for frame in (train_df, test_df):
    frame['bathrooms_shared'] = frame['bathrooms_shared'].fillna(most_common_shared)

In [None]:
# Replace null listing_length values with 0 in both frames.
for frame in (train_df, test_df):
    frame['listing_length'] = frame['listing_length'].fillna(0)

In [None]:
# Training columns that still contain nulls at this point, with their counts.
missing = train_df.isnull().sum().loc[lambda null_counts: null_counts > 0]
missing

In [None]:
# Impute the remaining nulls in the listed columns with a 5-neighbour KNN
# imputer. The imputer is fitted on the training frame only and then applied
# to both frames.
cols = ['accommodates', 'bathrooms', 'bathrooms_shared', 'bedrooms', 'beds']
imputer = KNNImputer(n_neighbors=5).fit(train_df[cols])

for frame in (train_df, test_df):
    frame[cols] = imputer.transform(frame[cols])

In [None]:
# Final check: list any training columns that still hold nulls, with counts.
missing = train_df.isnull().sum().loc[lambda null_counts: null_counts > 0]
missing

In [None]:
# Write both imputed frames to CSV, omitting the index column.
for frame, out_path in (
    (train_df, '../data/train_imputed.csv'),
    (test_df, '../data/test_imputed.csv'),
):
    frame.to_csv(out_path, index=False)