## Counting Word Appearances

In [2]:
import json
from collections import Counter
import re

def calculate_word_frequency(input_file, output_file):
    """Count word occurrences in a JSON-lines review file and write them out.

    Each non-blank line of ``input_file`` is parsed as one JSON object and
    its ``'text'`` value is lower-cased and tokenised with the regex
    ``\\b\\w+\\b``. The totals are written to ``output_file`` as
    ``"word: count"`` lines, most frequent first.

    Args:
        input_file: Path to the UTF-8 JSON-lines input file.
        output_file: Path of the UTF-8 text file to create or overwrite.

    Returns:
        None. Any exception is caught and reported with ``print`` instead of
        being raised, so callers see a message rather than a traceback.
    """
    word_counter = Counter()

    try:
        with open(input_file, 'r', encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not JSON;
                    # skip them instead of aborting the whole run.
                    continue
                review = json.loads(line)  # Parse the JSON line
                # A missing key or an explicit null both count as "no text".
                text = review.get('text') or ''
                # Tokenise: lower-case, then keep runs of word characters
                words = re.findall(r'\b\w+\b', text.lower())
                word_counter.update(words)  # Update word frequency counter

        # Write word frequencies to the output file, most common first
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for word, count in word_counter.most_common():
                outfile.write(f"{word}: {count}\n")

        print(f"Word frequency analysis completed. Results written to {output_file}.")

    except Exception as e:
        print(f"An error occurred: {e}")

# File paths (relative to the notebook's working directory)
input_file = "datasets/filtered_hotel_reviews_cleaned_2.json"  # Input: one JSON review object per line
output_file = "datasets/word_frequencies.txt"  # Output: "word: count" lines, most frequent first

# Run the word count; errors are printed by the function rather than raised
calculate_word_frequency(input_file, output_file)


Word frequency analysis completed. Results written to datasets/word_frequencies.txt.


## Extracting Sensory Words from Hotel Reviews and Counting their Appearances

### 1. Exact Match

In [3]:
import os

def load_word_frequencies(freq_file):
    """Read a ``"word: count"`` text file into a ``{word: int}`` dictionary.

    Lines that do not split into exactly two fields on ``': '`` are ignored.
    If a word appears more than once, the last occurrence wins.
    """
    with open(freq_file, 'r', encoding='utf-8') as handle:
        split_lines = (raw.strip().split(': ') for raw in handle)
        return {
            fields[0]: int(fields[1])
            for fields in split_lines
            if len(fields) == 2
        }

def process_sensory_file(sensory_file, word_freq):
    """Return the corpus frequency of every sensory word found in ``word_freq``.

    Each line of ``sensory_file`` is stripped and lower-cased before the
    exact-match lookup; words absent from ``word_freq`` are left out.
    """
    with open(sensory_file, 'r', encoding='utf-8') as vocab:
        normalized = (entry.strip().lower() for entry in vocab)
        return {term: word_freq[term] for term in normalized if term in word_freq}

def write_output(sensory_file, sensory_word_counts, output_dir):
    """Write the matched word counts to ``<output_dir>/<stem>_frequencies.txt``.

    ``<stem>`` is the file name of ``sensory_file`` without directory or
    extension. The output directory is created when missing, and lines are
    written as ``"word: freq"`` in descending order of frequency.
    """
    # Derive "<stem>_frequencies.txt" from the sensory file's own name
    stem, _extension = os.path.splitext(os.path.basename(sensory_file))
    output_file = os.path.join(output_dir, f"{stem}_frequencies.txt")

    # Make sure the destination directory is there before opening the file
    os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as sink:
        ranked = sorted(sensory_word_counts.items(), key=lambda pair: pair[1], reverse=True)
        sink.writelines(f"{word}: {freq}\n" for word, freq in ranked)
    print(f"Generated {output_file}")

def main():
    """Build the exact-match lexicon file for each sensory vocabulary file."""
    freq_file = "datasets/word_frequencies.txt"
    output_dir = "sense_hotel_lexicons/exact_match"
    sensory_files = [
        "sense_vocab/sight.txt",
        "sense_vocab/taste.txt",
        "sense_vocab/touch.txt",
        "sense_vocab/smell.txt",
        "sense_vocab/sound.txt",
    ]

    word_freq = load_word_frequencies(freq_file)

    for sensory_file in sensory_files:
        # Report a missing vocabulary file and move on to the next one
        if not os.path.exists(sensory_file):
            print(f"File {sensory_file} not found!")
            continue
        sensory_word_counts = process_sensory_file(sensory_file, word_freq)
        write_output(sensory_file, sensory_word_counts, output_dir)


if __name__ == "__main__":
    main()

Generated sense_hotel_lexicons/exact_match\sight_frequencies.txt
Generated sense_hotel_lexicons/exact_match\taste_frequencies.txt
Generated sense_hotel_lexicons/exact_match\touch_frequencies.txt
Generated sense_hotel_lexicons/exact_match\smell_frequencies.txt
Generated sense_hotel_lexicons/exact_match\sound_frequencies.txt


### 2. Semantic Similarity with GloVe

In [4]:
import os
import numpy as np
import gensim.downloader as api
from tqdm import tqdm  # For progress bars

def load_word_frequencies(freq_file):
    """Load word frequencies from a ``"word: count"`` file into a dict.

    Lines that do not split into exactly two fields on ``': '`` are skipped;
    a repeated word keeps the count from its last line.
    """
    with open(freq_file, 'r', encoding='utf-8') as handle:
        split_lines = (raw.strip().split(': ') for raw in handle)
        return {
            fields[0]: int(fields[1])
            for fields in split_lines
            if len(fields) == 2
        }

def load_word_embeddings(model_name="glove-wiki-gigaword-300"):
    """Announce and load the named pre-trained embeddings via ``api.load``."""
    print(f"Loading {model_name} word embeddings...")
    embeddings = api.load(model_name)
    return embeddings

def process_sensory_file_semantic(sensory_file, word_freq, word_vectors, similarity_threshold=0.75):
    """Find corpus words that are semantically close to a sensory vocabulary.

    Args:
        sensory_file: Path to a UTF-8 text file with one sensory word per line.
        word_freq: Mapping of corpus word -> frequency.
        word_vectors: Embedding object supporting ``word in word_vectors`` and
            ``word_vectors.similarity(a, b)`` (presumably the gensim vectors
            returned by ``load_word_embeddings``).
        similarity_threshold: Minimum similarity for a corpus word to count
            as a match for a sensory word.

    Returns:
        A ``(sensory_word_counts, matches)`` tuple where
        ``sensory_word_counts`` maps each matched corpus word to its
        frequency and ``matches`` maps it to the list of
        ``(sensory_word, similarity)`` pairs that met the threshold.
    """
    # Load sensory words. dict.fromkeys de-duplicates while keeping file
    # order, so a word listed twice (e.g. "Bright" and "bright") is compared
    # only once and cannot appear twice in a corpus word's match list.
    with open(sensory_file, 'r', encoding='utf-8') as f:
        candidates = dict.fromkeys(line.strip().lower() for line in f)
    # Keep only non-blank words that the embedding vocabulary knows
    sensory_words = [word for word in candidates if word and word in word_vectors]

    print(f"Processing {len(sensory_words)} sensory words from {sensory_file}")

    sensory_word_counts = {}
    # Tracks which sensory words each corpus word matched, with similarities
    matches = {}

    # Check each word in the frequency dictionary
    for freq_word in tqdm(word_freq.keys()):
        if freq_word not in word_vectors:
            continue

        for sensory_word in sensory_words:
            try:
                similarity = word_vectors.similarity(freq_word, sensory_word)
            except KeyError:
                continue  # Skip if either word is not in the vocabulary

            # If similarity is at or above the threshold, record the match
            if similarity >= similarity_threshold:
                matches.setdefault(freq_word, []).append((sensory_word, similarity))

        # The stored count does not depend on which sensory word matched, so
        # a single assignment per matched corpus word is sufficient.
        if freq_word in matches:
            sensory_word_counts[freq_word] = word_freq[freq_word]

    return sensory_word_counts, matches

def write_output(sensory_file, sensory_word_counts, matches, output_dir):
    """Write the frequency and match-provenance files for one sensory file.

    Two files are created in ``output_dir`` (which is created if missing),
    named after the sensory file's stem:

    * ``<stem>_frequencies.txt`` - ``"word: freq"`` lines, highest frequency first.
    * ``<stem>_matches.txt`` - ``"word -> sensory (sim), ..."`` lines, sorted by word.
    """
    stem, _extension = os.path.splitext(os.path.basename(sensory_file))

    # Both outputs share the sensory file's stem
    output_file = os.path.join(output_dir, f"{stem}_frequencies.txt")
    matches_file = os.path.join(output_dir, f"{stem}_matches.txt")

    os.makedirs(output_dir, exist_ok=True)

    # Frequencies, most frequent first
    with open(output_file, 'w', encoding='utf-8') as freq_out:
        ranked = sorted(sensory_word_counts.items(), key=lambda pair: pair[1], reverse=True)
        freq_out.writelines(f"{word}: {freq}\n" for word, freq in ranked)

    # Which sensory words each corpus word matched, with similarity scores
    with open(matches_file, 'w', encoding='utf-8') as match_out:
        for freq_word, sensory_matches in sorted(matches.items()):
            rendered = [f"{s_word} ({s:.2f})" for s_word, s in sensory_matches]
            match_out.write(f"{freq_word} -> {', '.join(rendered)}\n")

    print(f"Generated {output_file} and {matches_file}")

def main():
    """Build the GloVe semantic-match lexicon files for each sensory vocabulary."""
    freq_file = "datasets/word_frequencies.txt"
    output_dir = "sense_hotel_lexicons/semantic_match_glove"
    sensory_files = [
        "sense_vocab/sight.txt",
        "sense_vocab/taste.txt",
        "sense_vocab/touch.txt",
        "sense_vocab/smell.txt",
        "sense_vocab/sound.txt",
    ]

    # Corpus frequencies first, then the pre-trained embeddings
    word_freq = load_word_frequencies(freq_file)
    word_vectors = load_word_embeddings()

    for sensory_file in sensory_files:
        # Report a missing vocabulary file and move on to the next one
        if not os.path.exists(sensory_file):
            print(f"File {sensory_file} not found!")
            continue
        sensory_word_counts, matches = process_sensory_file_semantic(sensory_file, word_freq, word_vectors)
        write_output(sensory_file, sensory_word_counts, matches, output_dir)


if __name__ == "__main__":
    main()

Loading glove-wiki-gigaword-300 word embeddings...
Processing 197 sensory words from sense_vocab/sight.txt


100%|██████████| 159131/159131 [03:08<00:00, 843.72it/s] 


Generated sense_hotel_lexicons/semantic_match_glove\sight_frequencies.txt and sense_hotel_lexicons/semantic_match_glove\sight_matches.txt
Processing 52 sensory words from sense_vocab/taste.txt


100%|██████████| 159131/159131 [00:26<00:00, 6044.16it/s] 


Generated sense_hotel_lexicons/semantic_match_glove\taste_frequencies.txt and sense_hotel_lexicons/semantic_match_glove\taste_matches.txt
Processing 122 sensory words from sense_vocab/touch.txt


100%|██████████| 159131/159131 [01:01<00:00, 2588.50it/s]


Generated sense_hotel_lexicons/semantic_match_glove\touch_frequencies.txt and sense_hotel_lexicons/semantic_match_glove\touch_matches.txt
Processing 35 sensory words from sense_vocab/smell.txt


100%|██████████| 159131/159131 [00:17<00:00, 8962.38it/s] 


Generated sense_hotel_lexicons/semantic_match_glove\smell_frequencies.txt and sense_hotel_lexicons/semantic_match_glove\smell_matches.txt
Processing 198 sensory words from sense_vocab/sound.txt


100%|██████████| 159131/159131 [01:40<00:00, 1587.19it/s]


Generated sense_hotel_lexicons/semantic_match_glove\sound_frequencies.txt and sense_hotel_lexicons/semantic_match_glove\sound_matches.txt


In [None]:
# Load the original sight vocabulary. Lower-case it so the comparison lines
# up with the lower-cased words stored in the frequency file.
with open('sense_vocab/sight.txt', 'r', encoding='utf-8') as f1:
    file1_words = set(line.strip().lower() for line in f1 if line.strip())

# Load the semantically matched words from the "word: count" file (counts ignored)
file2_words = set()
with open('sense_hotel_lexicons/semantic_match_glove/sight_frequencies.txt', 'r', encoding='utf-8') as f2:
    for line in f2:
        if ':' in line:
            word = line.split(':')[0].strip()
            if word:
                file2_words.add(word)

# Words found by semantic matching that are not in the original vocabulary
unique_words = file2_words - file1_words

# Save them one per line, alphabetically. The file name was previously
# misspelled "sigth_new_words.txt", unlike the other senses' outputs.
with open('sense_hotel_lexicons/semantic_match_glove/sight_new_words.txt', 'w', encoding='utf-8') as out:
    for word in sorted(unique_words):
        out.write(word + '\n')


In [4]:
# Load the original smell vocabulary. Lower-case it so the comparison lines
# up with the lower-cased words stored in the frequency file.
with open('sense_vocab/smell.txt', 'r', encoding='utf-8') as f1:
    file1_words = set(line.strip().lower() for line in f1 if line.strip())

# Load the semantically matched words from the "word: count" file (counts ignored)
file2_words = set()
with open('sense_hotel_lexicons/semantic_match_glove/smell_frequencies.txt', 'r', encoding='utf-8') as f2:
    for line in f2:
        if ':' in line:
            word = line.split(':')[0].strip()
            if word:
                file2_words.add(word)

# Words found by semantic matching that are not in the original vocabulary
unique_words = file2_words - file1_words

# Save them one per line, alphabetically
with open('sense_hotel_lexicons/semantic_match_glove/smell_new_words.txt', 'w', encoding='utf-8') as out:
    for word in sorted(unique_words):
        out.write(word + '\n')


In [5]:
# Load the original sound vocabulary. Lower-case it so the comparison lines
# up with the lower-cased words stored in the frequency file.
with open('sense_vocab/sound.txt', 'r', encoding='utf-8') as f1:
    file1_words = set(line.strip().lower() for line in f1 if line.strip())

# Load the semantically matched words from the "word: count" file (counts ignored)
file2_words = set()
with open('sense_hotel_lexicons/semantic_match_glove/sound_frequencies.txt', 'r', encoding='utf-8') as f2:
    for line in f2:
        if ':' in line:
            word = line.split(':')[0].strip()
            if word:
                file2_words.add(word)

# Words found by semantic matching that are not in the original vocabulary
unique_words = file2_words - file1_words

# Save them one per line, alphabetically
with open('sense_hotel_lexicons/semantic_match_glove/sound_new_words.txt', 'w', encoding='utf-8') as out:
    for word in sorted(unique_words):
        out.write(word + '\n')


In [6]:
# Load the original taste vocabulary. Lower-case it so the comparison lines
# up with the lower-cased words stored in the frequency file.
with open('sense_vocab/taste.txt', 'r', encoding='utf-8') as f1:
    file1_words = set(line.strip().lower() for line in f1 if line.strip())

# Load the semantically matched words from the "word: count" file (counts ignored)
file2_words = set()
with open('sense_hotel_lexicons/semantic_match_glove/taste_frequencies.txt', 'r', encoding='utf-8') as f2:
    for line in f2:
        if ':' in line:
            word = line.split(':')[0].strip()
            if word:
                file2_words.add(word)

# Words found by semantic matching that are not in the original vocabulary
unique_words = file2_words - file1_words

# Save them one per line, alphabetically
with open('sense_hotel_lexicons/semantic_match_glove/taste_new_words.txt', 'w', encoding='utf-8') as out:
    for word in sorted(unique_words):
        out.write(word + '\n')


In [7]:
# Load the original touch vocabulary. Lower-case it so the comparison lines
# up with the lower-cased words stored in the frequency file.
with open('sense_vocab/touch.txt', 'r', encoding='utf-8') as f1:
    file1_words = set(line.strip().lower() for line in f1 if line.strip())

# Load the semantically matched words from the "word: count" file (counts ignored)
file2_words = set()
with open('sense_hotel_lexicons/semantic_match_glove/touch_frequencies.txt', 'r', encoding='utf-8') as f2:
    for line in f2:
        if ':' in line:
            word = line.split(':')[0].strip()
            if word:
                file2_words.add(word)

# Words found by semantic matching that are not in the original vocabulary
unique_words = file2_words - file1_words

# Save them one per line, alphabetically
with open('sense_hotel_lexicons/semantic_match_glove/touch_new_words.txt', 'w', encoding='utf-8') as out:
    for word in sorted(unique_words):
        out.write(word + '\n')


### 3. Semantic Similarity with FastText

In [6]:
import os
import numpy as np
import gensim.downloader as api
from tqdm import tqdm  # For progress bars

def load_word_frequencies(freq_file):
    """Parse a ``"word: count"`` file and return a ``{word: int}`` mapping.

    Only lines that split into exactly two fields on ``': '`` are used; when
    a word is repeated, the later line overrides the earlier one.
    """
    frequencies = {}
    with open(freq_file, 'r', encoding='utf-8') as source:
        for raw_line in source:
            fields = raw_line.strip().split(': ')
            if len(fields) != 2:
                continue
            token, count = fields
            frequencies[token] = int(count)
    return frequencies

def load_word_embeddings(model_name="fasttext-wiki-news-subwords-300"):
    """Announce and load the named pre-trained FastText embeddings via ``api.load``."""
    print(f"Loading {model_name} word embeddings...")
    embeddings = api.load(model_name)
    return embeddings

def process_sensory_file_semantic(sensory_file, word_freq, word_vectors, similarity_threshold=0.75):
    """Find corpus words that are semantically close to a sensory vocabulary.

    Args:
        sensory_file: Path to a UTF-8 text file with one sensory word per line.
        word_freq: Mapping of corpus word -> frequency.
        word_vectors: Embedding object supporting ``word in word_vectors`` and
            ``word_vectors.similarity(a, b)`` (presumably the gensim vectors
            returned by ``load_word_embeddings``).
        similarity_threshold: Minimum similarity for a corpus word to count
            as a match for a sensory word.

    Returns:
        A ``(sensory_word_counts, matches)`` tuple where
        ``sensory_word_counts`` maps each matched corpus word to its
        frequency and ``matches`` maps it to the list of
        ``(sensory_word, similarity)`` pairs that met the threshold.
    """
    # Load sensory words. dict.fromkeys de-duplicates while keeping file
    # order, so a word listed twice (e.g. "Bright" and "bright") is compared
    # only once and cannot appear twice in a corpus word's match list.
    with open(sensory_file, 'r', encoding='utf-8') as f:
        candidates = dict.fromkeys(line.strip().lower() for line in f)

    sensory_words = []
    for word in candidates:
        if not word:
            continue  # blank line in the vocabulary file: nothing to warn about
        if word in word_vectors:  # Make sure the sensory word is in our embeddings
            sensory_words.append(word)
        else:
            print(f"Warning: Sensory word '{word}' not found in FastText vocabulary")

    print(f"Processing {len(sensory_words)} sensory words from {sensory_file}")

    sensory_word_counts = {}
    # Tracks which sensory words each corpus word matched, with similarities
    matches = {}

    # Check each word in the frequency dictionary
    for freq_word in tqdm(word_freq.keys()):
        if freq_word not in word_vectors:
            continue

        for sensory_word in sensory_words:
            try:
                similarity = word_vectors.similarity(freq_word, sensory_word)
            except KeyError:
                continue  # Skip if either word is not in the vocabulary

            # If similarity is at or above the threshold, record the match
            if similarity >= similarity_threshold:
                matches.setdefault(freq_word, []).append((sensory_word, similarity))

        # The stored count does not depend on which sensory word matched, so
        # a single assignment per matched corpus word is sufficient.
        if freq_word in matches:
            sensory_word_counts[freq_word] = word_freq[freq_word]

    return sensory_word_counts, matches

def write_output(sensory_file, sensory_word_counts, matches, output_dir):
    """Write the frequency and match-provenance files for one sensory file.

    Two files are created in ``output_dir`` (which is created if missing),
    named after the sensory file's stem:

    * ``<stem>_frequencies.txt`` - ``"word: freq"`` lines, highest frequency first.
    * ``<stem>_matches.txt`` - ``"word -> sensory (sim), ..."`` lines, sorted by word.
    """
    stem, _extension = os.path.splitext(os.path.basename(sensory_file))

    # Both outputs share the sensory file's stem
    output_file = os.path.join(output_dir, f"{stem}_frequencies.txt")
    matches_file = os.path.join(output_dir, f"{stem}_matches.txt")

    os.makedirs(output_dir, exist_ok=True)

    # Frequencies, most frequent first
    with open(output_file, 'w', encoding='utf-8') as freq_out:
        ranked = sorted(sensory_word_counts.items(), key=lambda pair: pair[1], reverse=True)
        freq_out.writelines(f"{word}: {freq}\n" for word, freq in ranked)

    # Which sensory words each corpus word matched, with similarity scores
    with open(matches_file, 'w', encoding='utf-8') as match_out:
        for freq_word, sensory_matches in sorted(matches.items()):
            rendered = [f"{s_word} ({s:.2f})" for s_word, s in sensory_matches]
            match_out.write(f"{freq_word} -> {', '.join(rendered)}\n")

    print(f"Generated {output_file} and {matches_file}")

def main():
    """Build the FastText semantic-match lexicon files for each sensory vocabulary."""
    freq_file = "datasets/word_frequencies.txt"
    output_dir = "sense_hotel_lexicons/semantic_match_fasttext"
    sensory_files = [
        "sense_vocab/sight.txt",
        "sense_vocab/taste.txt",
        "sense_vocab/touch.txt",
        "sense_vocab/smell.txt",
        "sense_vocab/sound.txt",
    ]

    # Corpus frequencies first, then the pre-trained FastText embeddings
    word_freq = load_word_frequencies(freq_file)
    word_vectors = load_word_embeddings()

    for sensory_file in sensory_files:
        # Report a missing vocabulary file and move on to the next one
        if not os.path.exists(sensory_file):
            print(f"File {sensory_file} not found!")
            continue
        sensory_word_counts, matches = process_sensory_file_semantic(sensory_file, word_freq, word_vectors)
        write_output(sensory_file, sensory_word_counts, matches, output_dir)


if __name__ == "__main__":
    main()

Loading fasttext-wiki-news-subwords-300 word embeddings...
Processing 197 sensory words from sense_vocab/sight.txt


100%|██████████| 159131/159131 [01:57<00:00, 1352.88it/s]


Generated sense_hotel_lexicons/semantic_match_fasttext\sight_frequencies.txt and sense_hotel_lexicons/semantic_match_fasttext\sight_matches.txt
Processing 53 sensory words from sense_vocab/taste.txt


100%|██████████| 159131/159131 [00:45<00:00, 3532.48it/s] 


Generated sense_hotel_lexicons/semantic_match_fasttext\taste_frequencies.txt and sense_hotel_lexicons/semantic_match_fasttext\taste_matches.txt
Processing 123 sensory words from sense_vocab/touch.txt


100%|██████████| 159131/159131 [01:02<00:00, 2563.34it/s]


Generated sense_hotel_lexicons/semantic_match_fasttext\touch_frequencies.txt and sense_hotel_lexicons/semantic_match_fasttext\touch_matches.txt
Processing 37 sensory words from sense_vocab/smell.txt


100%|██████████| 159131/159131 [00:18<00:00, 8661.07it/s] 


Generated sense_hotel_lexicons/semantic_match_fasttext\smell_frequencies.txt and sense_hotel_lexicons/semantic_match_fasttext\smell_matches.txt
Processing 200 sensory words from sense_vocab/sound.txt


100%|██████████| 159131/159131 [02:20<00:00, 1128.64it/s]


Generated sense_hotel_lexicons/semantic_match_fasttext\sound_frequencies.txt and sense_hotel_lexicons/semantic_match_fasttext\sound_matches.txt


In [8]:
# Load the original sight vocabulary. This cell previously read
# 'sense_vocab/touch.txt', so the sight matches were diffed against the wrong
# vocabulary. Lower-case the words so the comparison lines up with the
# lower-cased words stored in the frequency file.
with open('sense_vocab/sight.txt', 'r', encoding='utf-8') as f1:
    file1_words = set(line.strip().lower() for line in f1 if line.strip())

# Load the semantically matched words from the "word: count" file (counts ignored)
file2_words = set()
with open('sense_hotel_lexicons/semantic_match_fasttext/sight_frequencies.txt', 'r', encoding='utf-8') as f2:
    for line in f2:
        if ':' in line:
            word = line.split(':')[0].strip()
            if word:
                file2_words.add(word)

# Words found by semantic matching that are not in the original vocabulary
unique_words = file2_words - file1_words

# Save them one per line, alphabetically
with open('sense_hotel_lexicons/semantic_match_fasttext/sight_new_words.txt', 'w', encoding='utf-8') as out:
    for word in sorted(unique_words):
        out.write(word + '\n')


In [9]:
# Load the original smell vocabulary. Lower-case it so the comparison lines
# up with the lower-cased words stored in the frequency file.
with open('sense_vocab/smell.txt', 'r', encoding='utf-8') as f1:
    file1_words = set(line.strip().lower() for line in f1 if line.strip())

# Load the semantically matched words from the "word: count" file (counts ignored)
file2_words = set()
with open('sense_hotel_lexicons/semantic_match_fasttext/smell_frequencies.txt', 'r', encoding='utf-8') as f2:
    for line in f2:
        if ':' in line:
            word = line.split(':')[0].strip()
            if word:
                file2_words.add(word)

# Words found by semantic matching that are not in the original vocabulary
unique_words = file2_words - file1_words

# Save them one per line, alphabetically
with open('sense_hotel_lexicons/semantic_match_fasttext/smell_new_words.txt', 'w', encoding='utf-8') as out:
    for word in sorted(unique_words):
        out.write(word + '\n')


In [10]:
# Load the original sound vocabulary. Lower-case it so the comparison lines
# up with the lower-cased words stored in the frequency file.
with open('sense_vocab/sound.txt', 'r', encoding='utf-8') as f1:
    file1_words = set(line.strip().lower() for line in f1 if line.strip())

# Load the semantically matched words from the "word: count" file (counts ignored)
file2_words = set()
with open('sense_hotel_lexicons/semantic_match_fasttext/sound_frequencies.txt', 'r', encoding='utf-8') as f2:
    for line in f2:
        if ':' in line:
            word = line.split(':')[0].strip()
            if word:
                file2_words.add(word)

# Words found by semantic matching that are not in the original vocabulary
unique_words = file2_words - file1_words

# Save them one per line, alphabetically
with open('sense_hotel_lexicons/semantic_match_fasttext/sound_new_words.txt', 'w', encoding='utf-8') as out:
    for word in sorted(unique_words):
        out.write(word + '\n')


In [11]:
# Load the original taste vocabulary. Lower-case it so the comparison lines
# up with the lower-cased words stored in the frequency file.
with open('sense_vocab/taste.txt', 'r', encoding='utf-8') as f1:
    file1_words = set(line.strip().lower() for line in f1 if line.strip())

# Load the semantically matched words from the "word: count" file (counts ignored)
file2_words = set()
with open('sense_hotel_lexicons/semantic_match_fasttext/taste_frequencies.txt', 'r', encoding='utf-8') as f2:
    for line in f2:
        if ':' in line:
            word = line.split(':')[0].strip()
            if word:
                file2_words.add(word)

# Words found by semantic matching that are not in the original vocabulary
unique_words = file2_words - file1_words

# Save them one per line, alphabetically
with open('sense_hotel_lexicons/semantic_match_fasttext/taste_new_words.txt', 'w', encoding='utf-8') as out:
    for word in sorted(unique_words):
        out.write(word + '\n')


In [12]:
# Load the original touch vocabulary. Lower-case it so the comparison lines
# up with the lower-cased words stored in the frequency file.
with open('sense_vocab/touch.txt', 'r', encoding='utf-8') as f1:
    file1_words = set(line.strip().lower() for line in f1 if line.strip())

# Load the semantically matched words from the "word: count" file (counts ignored)
file2_words = set()
with open('sense_hotel_lexicons/semantic_match_fasttext/touch_frequencies.txt', 'r', encoding='utf-8') as f2:
    for line in f2:
        if ':' in line:
            word = line.split(':')[0].strip()
            if word:
                file2_words.add(word)

# Words found by semantic matching that are not in the original vocabulary
unique_words = file2_words - file1_words

# Save them one per line, alphabetically
with open('sense_hotel_lexicons/semantic_match_fasttext/touch_new_words.txt', 'w', encoding='utf-8') as out:
    for word in sorted(unique_words):
        out.write(word + '\n')
