In [19]:
import pandas as pd
import re
from transformers import BertModel, BertTokenizer, DistilBertTokenizer, RobertaTokenizer
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# import stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xinyutian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Free-text / categorical columns that the cells below clean and vectorize.
text_cols = [
    'name',
    'host_name',
    'description',
    'host_verifications',
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'property_type',
    'room_type',
    'bathrooms_text',
    'amenities',
]

# Training split, read from the raw data directory.
all_data = pd.read_csv('../../raw_data/train.csv')

In [11]:
# Test split, read from the same raw data directory as the training file.
test_data = pd.read_csv('../../raw_data/test.csv')

In [21]:
def preprocess_text(text, placeholder_token="[NO_DESCRIPTION]", stop_words=None):
    """Normalize one raw text cell into lowercase, space-separated ASCII words.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example the
            float NaN pandas stores for a missing cell) is treated as missing.
        placeholder_token: Value returned when ``text`` holds nothing usable.
        stop_words: Optional iterable of lowercase words to drop from the
            cleaned text, e.g. ``stopwords.words('english')``. The default
            ``None`` removes nothing. The stop-word list is no longer loaded
            here: doing so on every call was expensive and the result was
            never applied.

    Returns:
        The cleaned string, or ``placeholder_token`` when the input is
        missing, blank, or has no ASCII letters left after cleaning (for
        example digits-only or punctuation-only text).
    """
    if not isinstance(text, str) or not text.strip():
        return placeholder_token

    cleaned = text.lower()
    # Punctuation is deleted outright, so "don't" becomes "dont" ...
    cleaned = re.sub(r"[^\w\s]", "", cleaned)
    # ... while any other run of non-letters (digits, underscores, whitespace,
    # non-ASCII characters) collapses to a single separating space.
    cleaned = re.sub(r"[^a-zA-Z]+", " ", cleaned).strip()

    if stop_words is not None:
        blocked = frozenset(stop_words)
        cleaned = " ".join(word for word in cleaned.split() if word not in blocked)

    # Text that cleaned down to nothing is as uninformative as a missing cell.
    return cleaned or placeholder_token

In [22]:
# Clean every column listed in text_cols on the training frame, in place.
for column in text_cols:
    all_data[column] = all_data[column].apply(preprocess_text)

In [12]:
# Apply the same cleaning to the text columns of the test frame, in place.
for column in text_cols:
    test_data[column] = test_data[column].apply(preprocess_text)

In [23]:
# Version 1: Tokenization with BERT
# Module-level tokenizer for the 'bert-base-uncased' checkpoint; tokenize_text
# reads it as a global.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize a copy so all_data keeps the cleaned text for the later cells.
tokenized_data = all_data.copy()

def tokenize_text(text, max_len=512):
    """Encode ``text`` into a list of at most ``max_len`` token ids.

    Truncation is delegated to the tokenizer rather than slicing the encoded
    list afterwards: slicing cuts off the closing special token of an
    over-long sequence and still encodes the full text first.

    Args:
        text: String to encode with the module-level ``tokenizer``.
        max_len: Maximum number of ids to return, special tokens included.
            ``None`` disables truncation.

    Returns:
        The list of token ids produced by ``tokenizer.encode``.
    """
    return tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=max_len is not None,
        max_length=max_len,
    )

# Replace each text column with the token-id list returned by tokenize_text.
for column in text_cols:
    tokenized_data[column] = tokenized_data[column].apply(tokenize_text)

# Write the tokenized frame to disk without the index column.
tokenized_data.to_csv('../df/texts/data_with_tokenization.csv', index=False)

In [6]:
# Version 2: CountVectorizer
count_vectorizer = CountVectorizer()
vectorized_data = all_data.copy()


def _count_features(column):
    """Fit the shared vectorizer on one training column and return its counts.

    Args:
        column: Name of a column of ``vectorized_data``.

    Returns:
        A ``(matrix, frame)`` pair: the result of ``fit_transform`` on the
        column, and the same counts as a dense DataFrame with one column per
        vocabulary term.
    """
    matrix = count_vectorizer.fit_transform(vectorized_data[column])
    frame = pd.DataFrame(matrix.toarray(), columns=count_vectorizer.get_feature_names_out())
    return matrix, frame


# The single vectorizer is refitted for each column, so every frame has its own
# vocabulary and the vectorizer ends up fitted on the last column processed.
name_vectorized, name_df = _count_features('name')
description_vectorized, description_df = _count_features('description')
host_verifications_vectorized, host_verifications_df = _count_features('host_verifications')
neighbourhood_cleansed_vectorized, neighbourhood_cleansed_df = _count_features('neighbourhood_cleansed')
neighbourhood_group_cleansed_vectorized, neighbourhood_group_cleansed_df = _count_features('neighbourhood_group_cleansed')
property_type_vectorized, property_type_df = _count_features('property_type')
room_type_vectorized, room_type_df = _count_features('room_type')
bathrooms_text_vectorized, bathrooms_text_df = _count_features('bathrooms_text')
amenities_vectorized, amenities_df = _count_features('amenities')


In [7]:
# Count features for host_name, built with the same shared vectorizer (refitted here).
host_name_vectorized = count_vectorizer.fit_transform(vectorized_data['host_name'])
host_name_terms = count_vectorizer.get_feature_names_out()
host_name_df = pd.DataFrame(host_name_vectorized.toarray(), columns=host_name_terms)

In [8]:
# Write each training count frame to its own CSV, without the index column.
train_feature_exports = [
    (name_df, '../df/texts/name_df.csv'),
    (host_verifications_df, '../df/texts/host_verifications_df.csv'),
    (neighbourhood_cleansed_df, '../df/texts/neighbourhood_cleansed_df.csv'),
    (neighbourhood_group_cleansed_df, '../df/texts/neighbourhood_group_cleansed_df.csv'),
    (property_type_df, '../df/texts/property_type_df.csv'),
    (room_type_df, '../df/texts/room_type_df.csv'),
    (bathrooms_text_df, '../df/texts/bathrooms_text_df.csv'),
    (amenities_df, '../df/texts/amenities_df.csv'),
    (description_df, '../df/texts/description_df.csv'),
]
for frame, csv_path in train_feature_exports:
    frame.to_csv(csv_path, index=False)

In [14]:
count_vectorizer = CountVectorizer()
vectorized_data = test_data.copy()


def _train_vocab_counts(column):
    """Count-vectorize one test column using the training column's vocabulary.

    The vectorizer is fitted on ``all_data[column]`` and only applied to the
    test column, so the returned frame has the same feature columns as the
    training features for that column. Calling ``fit_transform`` on the test
    data instead learns a separate, test-specific vocabulary whose columns do
    not line up with the training ones.

    Assumes ``all_data`` already holds the cleaned training text.

    Args:
        column: Column name present in both ``all_data`` and ``vectorized_data``.

    Returns:
        A ``(matrix, frame)`` pair: the result of ``transform`` on the test
        column, and the same counts as a dense DataFrame with one column per
        training vocabulary term.
    """
    count_vectorizer.fit(all_data[column])
    matrix = count_vectorizer.transform(vectorized_data[column])
    frame = pd.DataFrame(matrix.toarray(), columns=count_vectorizer.get_feature_names_out())
    return matrix, frame


name_vectorized_test, name_df_test = _train_vocab_counts('name')
description_vectorized_test, description_df_test = _train_vocab_counts('description')
host_verifications_vectorized_test, host_verifications_df_test = _train_vocab_counts('host_verifications')
neighbourhood_cleansed_vectorized_test, neighbourhood_cleansed_df_test = _train_vocab_counts('neighbourhood_cleansed')
neighbourhood_group_cleansed_vectorized_test, neighbourhood_group_cleansed_df_test = _train_vocab_counts('neighbourhood_group_cleansed')
property_type_vectorized_test, property_type_df_test = _train_vocab_counts('property_type')
room_type_vectorized_test, room_type_df_test = _train_vocab_counts('room_type')
bathrooms_text_vectorized_test, bathrooms_text_df_test = _train_vocab_counts('bathrooms_text')
amenities_vectorized_test, amenities_df_test = _train_vocab_counts('amenities')
host_name_vectorized_test, host_name_df_test = _train_vocab_counts('host_name')


In [15]:
# Write each test count frame to its own CSV, without the index column.
test_feature_exports = [
    (name_df_test, '../df/texts/name_df_test.csv'),
    (host_verifications_df_test, '../df/texts/host_verifications_df_test.csv'),
    (neighbourhood_cleansed_df_test, '../df/texts/neighbourhood_cleansed_df_test.csv'),
    (neighbourhood_group_cleansed_df_test, '../df/texts/neighbourhood_group_cleansed_df_test.csv'),
    (property_type_df_test, '../df/texts/property_type_df_test.csv'),
    (room_type_df_test, '../df/texts/room_type_df_test.csv'),
    (bathrooms_text_df_test, '../df/texts/bathrooms_text_df_test.csv'),
    (amenities_df_test, '../df/texts/amenities_df_test.csv'),
    (description_df_test, '../df/texts/description_df_test.csv'),
    (host_name_df_test, '../df/texts/host_name_df_test.csv'),
]
for frame, csv_path in test_feature_exports:
    frame.to_csv(csv_path, index=False)

In [18]:
# Display the vocabulary terms that became the columns of name_df.
name_df.columns

Index(['acton', 'ada', 'agoura', 'agua', 'aire', 'alhambra', 'altadena', 'and',
       'angeles', 'angles',
       ...
       'westchester', 'westlake', 'westwood', 'whittier', 'windsor',
       'winnetka', 'woodland', 'wrightwood', 'york', 'yurt'],
      dtype='object', length=245)

In [10]:
# host_name_df is written here, separately from the other training count frames.
host_name_df.to_csv('../df/texts/host_name_df.csv', index=False)

In [9]:
# save the vectorized data
# NOTE(review): in the cells shown, `vectorized_data` is only ever assigned a
# copy of a cleaned text frame (no count features are added to it), and when the
# cells run top to bottom its most recent assignment is the test-set copy.
# Confirm which frame is meant to be written here.
vectorized_data.to_csv('../df/texts/data_with_vectorization.csv', index=False)