In [7]:
import pandas as pd
import re
import unicodedata
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

df = pd.read_csv("datasets/listings.csv")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")
    text = re.sub("<.*?>", "", text) # Remove HTML tags
    return text

def get_sentiment_score(text):
    if not text:
        return 0
    processed_text = preprocess_text(text)
    
    # Truncate text if longer than max_sequence_length
    max_sequence_length = tokenizer.model_max_length
    tokens = tokenizer.tokenize(processed_text)
    if len(tokens) > max_sequence_length - 2:  # Reserve space for [CLS] and [SEP] tokens
        tokens = tokens[:max_sequence_length - 2]
    truncated_text = tokenizer.convert_tokens_to_string(tokens)
    
    sentiment = nlp(truncated_text)
    score = sentiment[0]["label"]
    
    return score

columns_to_score = ["name", "description", "neighborhood_overview", "host_about", "amenities"]
new_columns = [col + "_score" for col in columns_to_score]

for idx, row in df.iterrows():
    for col, new_col in zip(columns_to_score, new_columns):
        df.at[idx, new_col] = get_sentiment_score(row[col])
    
    if idx % 300 == 0:
        print(f"Processing row {idx}")

df.to_csv("datasets/listings_with_nlp.csv", index=False)


Processing row 0
Processing row 300
Processing row 600
Processing row 900
Processing row 1200
Processing row 1500
Processing row 1800
Processing row 2100
Processing row 2400
Processing row 2700
Processing row 3000
Processing row 3001
Processing row 3002
Processing row 3003
Processing row 3004
Processing row 3005
Processing row 3006
Processing row 3007
Processing row 3008
Processing row 3009
Processing row 3010
Processing row 3011
Processing row 3012
Processing row 3013
Processing row 3014
Processing row 3015
Processing row 3016
Processing row 3017
Processing row 3018
Processing row 3019
Processing row 3020
Processing row 3021
Processing row 3022
Processing row 3023
Processing row 3024
Processing row 3025
Processing row 3026
Processing row 3027
Processing row 3028
Processing row 3029
Processing row 3030
Processing row 3031
Processing row 3032
Processing row 3033
Processing row 3034
Processing row 3035
Processing row 3036
Processing row 3037
Processing row 3038
Processing row 3039
Proces

Token indices sequence length is longer than the specified maximum sequence length for this model (1480 > 512). Running this sequence through the model will result in indexing errors


Processing row 3059
Processing row 3060
Processing row 3061
Processing row 3062
Processing row 3063
Processing row 3064
Processing row 3065
Processing row 3066
Processing row 3067
Processing row 3068
Processing row 3069
Processing row 3070
Processing row 3071
Processing row 3072
Processing row 3073
Processing row 3074
Processing row 3075
Processing row 3076
Processing row 3077
Processing row 3078
Processing row 3079
Processing row 3080
Processing row 3081
Processing row 3082
Processing row 3083
Processing row 3084
Processing row 3085
Processing row 3086
Processing row 3087
Processing row 3088
Processing row 3089
Processing row 3090
Processing row 3091
Processing row 3092
Processing row 3093
Processing row 3094
Processing row 3095
Processing row 3096
Processing row 3097
Processing row 3098
Processing row 3099
Processing row 3100
Processing row 3101
Processing row 3102
Processing row 3103
Processing row 3104
Processing row 3105
Processing row 3106
Processing row 3107
Processing row 3108
