In [None]:
import os

import nltk
import openai
import pandas as pd
from langdetect import detect, LangDetectException
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
# Make sure the VADER lexicon is available locally before building the analyzer.
nltk.download('vader_lexicon')
# Single shared analyzer instance; get_sentiment_score reads it as a global.
sia = SentimentIntensityAnalyzer()
# NOTE(review): if the OpenAI translation step is ever re-enabled, load the key
# from an environment variable -- never commit a literal key to the notebook.

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Samu\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# --- Input locations -------------------------------------------------------
data_path = '../data/original/train.csv'
predictive_path = '../data/original/predictive.csv'

processed_train_csv_path = '../data/processed/nbhds/train.csv'
processed_test_csv_path = '../data/processed/nbhds/test.csv'
processed_final_train_csv_path = '../data/processed/nbhds/final_train.csv'
processed_predictive_csv_path = '../data/processed/nbhds/predictive.csv'

# --- Original data ---------------------------------------------------------
df = pd.read_csv(data_path)
predictive_df = pd.read_csv(predictive_path)

# 80/20 split with a fixed seed; final_train_df is an independent copy of the
# full training file.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=314)
final_train_df = df.copy()

# --- Previously processed frames, one per split ----------------------------
(
    processed_train_df,
    processed_test_df,
    processed_final_train_df,
    processed_predictive_df,
) = map(
    pd.read_csv,
    (
        processed_train_csv_path,
        processed_test_csv_path,
        processed_final_train_csv_path,
        processed_predictive_csv_path,
    ),
)

In [None]:
def is_english(text):
    """Tell whether langdetect classifies ``text`` as English.

    Returns:
        True when ``detect(text)`` yields ``'en'``; False for any other
        language code or when detection raises ``LangDetectException``.
    """
    try:
        language_code = detect(text)
    except LangDetectException:
        # Detection failed: treat the text as not English.
        return False
    return language_code == 'en'

# def translate_to_english(text):
#     try:
#         response = openai.ChatCompletion.create(
#             model="gpt-4",
#             messages=[
#                 {"role": "system", "content": "You are a helpful assistant that translates text to English."},
#                 {"role": "user", "content": f"Translate the following text to English:\n\n{text}"}
#             ],
#             max_tokens=1000,
#             temperature=0
#         )
#         print('Translated to English:', response['choices'][0]['message']['content'].strip())
#         return response['choices'][0]['message']['content'].strip()
#     except:
#         print('Failed to translate to English:', text)
#         return text

def get_sentiment_score(text):
    """Return the ``'compound'`` polarity score the shared analyzer gives ``text``."""
    polarity = sia.polarity_scores(text)
    return polarity['compound']


def process_dataframe(df, verbose=True):
    """Build sentiment features from a listings frame without modifying it.

    Reads the ``name``, ``description`` and ``reviews`` columns of ``df`` and
    returns a new frame carrying ``df``'s index with three columns:

    * ``description_sentiment`` -- ``get_sentiment_score`` of the name and
      description joined by a single space. Missing values are treated as
      empty text rather than being scored as the literal word "nan".
    * ``reviews_sentiment`` -- mean ``get_sentiment_score`` over the non-blank
      reviews packed into the ``reviews`` cell; 0 when there are none.
    * ``no_reviews`` -- 1 when the ``reviews`` cell is not a string, else 0.

    Args:
        df: DataFrame with ``name``, ``description`` and ``reviews`` columns.
            It is only read; no columns are added to it.
        verbose: When True (the default) the resulting frame is printed.

    Returns:
        pandas.DataFrame with the three feature columns listed above.
    """
    # Delimiter between individual reviews inside one ``reviews`` cell.
    review_separator = "\n---------------------------------\n"

    def process_reviews(reviews_str):
        """Average the sentiment of every non-blank review in one cell."""
        if not isinstance(reviews_str, str):
            # Missing reviews (e.g. NaN) contribute a neutral score.
            return 0
        scores = [
            get_sentiment_score(review.strip())
            for review in reviews_str.split(review_separator)
            if review.strip()
            # if not is_english(review): review = translate_to_english(review)
        ]
        return sum(scores) / len(scores) if scores else 0

    combined_text = (
        df['name'].fillna('').astype(str)
        + ' '
        + df['description'].fillna('').astype(str)
    )

    # Assemble the features in a fresh frame so the caller's data stays intact
    # (no hidden columns, no SettingWithCopyWarning on sliced inputs).
    features = pd.DataFrame(
        {
            'description_sentiment': combined_text.apply(get_sentiment_score),
            'reviews_sentiment': df['reviews'].apply(process_reviews),
            'no_reviews': df['reviews'].apply(lambda x: 0 if isinstance(x, str) else 1),
        }
    )

    if verbose:
        print(features)
    return features


# Score every split; only the returned feature frames are used below.
description_reviews_sentiment_train = process_dataframe(train_df)
description_reviews_sentiment_test = process_dataframe(test_df)
description_reviews_sentiment_final_train = process_dataframe(final_train_df)
description_reviews_sentiment_predictive = process_dataframe(predictive_df)


def _attach_sentiment(processed, sentiment):
    """Append sentiment columns to a processed frame, pairing rows by position.

    ``pd.concat(axis=1)`` aligns on index labels. The processed frames are read
    from CSV and carry a fresh 0..n-1 index, while the train/test features keep
    the shuffled labels produced by ``train_test_split``; concatenating them
    directly would pair unrelated rows and introduce NaN rows. Both indexes are
    therefore reset first.

    NOTE(review): this assumes each processed CSV lists its rows in the same
    order as the split it was derived from -- confirm against the notebook
    that wrote those files.

    Raises:
        ValueError: if the two frames do not have the same number of rows.
    """
    if len(processed) != len(sentiment):
        raise ValueError(
            f"Row count mismatch: processed frame has {len(processed)} rows, "
            f"sentiment features have {len(sentiment)}"
        )
    # Drop sentiment columns left by a previous run so re-executing this cell
    # does not produce duplicate columns.
    base = processed.drop(columns=list(sentiment.columns), errors='ignore')
    return pd.concat(
        [base.reset_index(drop=True), sentiment.reset_index(drop=True)],
        axis=1,
    )


processed_train_df = _attach_sentiment(processed_train_df, description_reviews_sentiment_train)
processed_test_df = _attach_sentiment(processed_test_df, description_reviews_sentiment_test)
processed_final_train_df = _attach_sentiment(processed_final_train_df, description_reviews_sentiment_final_train)
processed_predictive_df = _attach_sentiment(processed_predictive_df, description_reviews_sentiment_predictive)

In [55]:
# Fixed seed (same value as the train/test split) so the inspected rows are the
# same on every run; cap the size so small frames do not raise.
sample = train_df.sample(min(200, len(train_df)), random_state=314)
# Keep an untouched copy in case process_dataframe adds columns to its input.
original_sample = sample.copy()
sentiment_sample = process_dataframe(sample)
# train_df may already carry sentiment columns from an earlier scoring pass;
# drop them so the combined frame has no duplicate column names.
sample = pd.concat(
    [
        original_sample.drop(columns=list(sentiment_sample.columns), errors='ignore'),
        sentiment_sample,
    ],
    axis=1,
)

       description_sentiment  reviews_sentiment  no_reviews
6548                  0.9260           0.922700           0
4483                  0.9260           0.743174           0
1838                  0.8481           0.692000           0
9746                  0.7184           0.000000           1
11853                 0.9819           0.832413           0
...                      ...                ...         ...
11212                 0.9403           0.000000           1
1818                  0.3400           0.785122           0
7899                  0.5848           0.754911           0
12990                 0.8519           0.771450           0
3285                  0.0000           0.730759           0

[200 rows x 3 columns]


In [None]:
# Output locations. These rebind the *_csv_path names that pointed at the
# input files in the loading cell.
processed_train_csv_path = '../data/processed/sentiment/train.csv'
processed_test_csv_path = '../data/processed/sentiment/test.csv'
processed_final_train_csv_path = '../data/processed/sentiment/final_train.csv'
processed_predictive_csv_path = '../data/processed/sentiment/predictive.csv'

for _frame, _csv_path in (
    (processed_train_df, processed_train_csv_path),
    (processed_test_df, processed_test_csv_path),
    (processed_final_train_df, processed_final_train_csv_path),
    (processed_predictive_df, processed_predictive_csv_path),
):
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(_csv_path), exist_ok=True)
    _frame.to_csv(_csv_path, index=False)