Let's first load the data from the previously created file.

In [2]:
import ast
import pandas as pd
import json as js

# Load the captions table; the first CSV column is used as the row index.
df_captions = pd.read_csv('./preprocessing/data_captions.csv', index_col=0)


def string_to_list_of_lists(string):
    """Parse a serialized list of captions into a flat list of strings.

    Despite its name, the function returns a flat ``list[str]``.

    The cell is first parsed as a Python literal, so commas and quote
    characters inside a caption are preserved. If the text is not a valid
    literal (or is not a plain list of strings), the function falls back to
    the legacy behaviour: strip brackets and quotes, then split on commas.

    Parameters
    ----------
    string : str | list | tuple | object
        The raw cell value. Lists/tuples are accepted so that re-applying
        the function to an already-parsed column does not fail. Any other
        non-string value (e.g. a float NaN for a missing cell) yields ``[]``.

    Returns
    -------
    list[str]
        Whitespace-stripped captions, with empty entries removed.
    """
    if isinstance(string, (list, tuple)):
        # Already parsed, e.g. the notebook cell was executed twice.
        parsed, text = string, str(string)
    elif isinstance(string, str):
        text = string.strip()
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
    else:
        # Missing values have no captions.
        return []

    if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
        items = [item.strip() for item in parsed]
    else:
        # Legacy fallback for text that is not a clean list-of-strings literal.
        stripped = text.replace("[", "").replace("]", "").replace("'", "").replace('"', "")
        items = [item.strip() for item in stripped.split(",")]

    # Drop empty entries so that "[]" gives [] rather than [''].
    return [item for item in items if item]


# Replace each serialized caption cell with the list produced by
# string_to_list_of_lists (the column is overwritten in place).
df_captions['captions'] = df_captions['captions'].apply(string_to_list_of_lists)

# Preview the first rows of the frame.
df_captions.head()

Unnamed: 0_level_0,captions
id_annonce,Unnamed: 1_level_1
14965103,[this is a large kitchen with several counters...
35813513,[a living area with blue floors and a dining a...
35815605,[a floor plan showing the three different room...
36020531,[a field full of lots of grass on top of a blu...
35775577,"[a table and chairs are on a wood floor, a lar..."


In [3]:
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt


def parse_captions(df):
    """Count how often each whitespace-separated word occurs in the captions.

    Parameters
    ----------
    df : mapping or DataFrame
        Must expose a ``'captions'`` entry whose elements are iterables of
        caption strings.

    Returns
    -------
    dict
        Maps each word to its number of occurrences across all captions.
    """
    # Gather every caption of every row into one flat list
    all_captions = []
    for caption_list in df['captions']:
        all_captions.extend(caption_list)

    # Glue the captions together, then tokenize on whitespace
    tokens = " ".join(all_captions).split()

    # Tally the tokens and hand back a plain dict
    return dict(Counter(tokens))



word_freq = parse_captions(df_captions)

# Rank the (word, count) pairs from most to least frequent
sorted_word_freq = sorted(
    word_freq.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Tabulate the ranked pairs as a two-column dataframe
df_word_freq = pd.DataFrame(data=sorted_word_freq, columns=['word', 'freq'])


df_word_freq.head(10)



Unnamed: 0,word,freq
0,a,247022
1,with,95674
2,and,94158
3,of,87526
4,the,69565
5,room,66038
6,an,53575
7,in,43947
8,is,38281
9,blurry,31761


In [4]:
# Words filtered out of the frequency table.
# Note: "there" is listed twice; this has no effect on the isin() filtering.
stop_words = [
    "a", "an", "and", "are", "as",
    "at", "be", "by", "for", "from",
    "has", "he", "in", "is", "it",
    "its", "of", "on", "that", "the",
    "to", "was", "were", "will", "with",
    "there", "this", "there",
]

# Keep the stop-word rows in their own dataframe, then drop them from the
# main frequency table and renumber its rows from zero
df_stop_words = df_word_freq.loc[df_word_freq['word'].isin(stop_words)]
df_word_freq = (
    df_word_freq
    .loc[~df_word_freq['word'].isin(stop_words)]
    .reset_index(drop=True)
)

df_word_freq.head()



Unnamed: 0,word,freq
0,room,66038
1,blurry,31761
2,large,24959
3,living,24118
4,white,23825


In [5]:
# Hand-picked words whose presence in a listing's captions is turned into
# indicator columns below. Every entry must be followed by a comma: Python
# silently concatenates adjacent string literals, which previously merged
# "lawn"/"stairs", "modern"/"bright" and "nice"/"good" into single bogus words.
selected_words = [
    "elegant", "pool", "view", "big", "grass", "tub", "small", "hallway",
    "blurred", "patio", "lawn",
    "stairs", "tree", "fireplace", "artistic", "garden", "modern",
    "bright", "decorated", "fashioned", "panoramic", "high",
    "spacious", "beautiful", "sunny", "colors", "pretty",
    "chandelier", "marble", "messy", "unfinished", "shining",
    "organized", "huge", "antique", "renovated", "warehouse",
    "rooftop", "garage", "swimming", "decor", "loft", "porch", "umbrella",
    "bar", "wooden", "clean", "empty", "balcony", "tile", "bushes", "nice",
    "good", "plants", "backyard", "frontyard", "yard",
]


def is_in_caption(caption_list, word):
    """Tell whether ``word`` occurs in at least one caption of ``caption_list``.

    The check uses the ``in`` operator on each caption, so for string
    captions it is a substring match (e.g. "tub" matches "bathtub").

    Parameters
    ----------
    caption_list : iterable
        The captions to search.
    word : str
        The text to look for.

    Returns
    -------
    bool
        True if any caption contains ``word``; False otherwise, including
        for an empty ``caption_list`` (previously None was returned).
    """
    return any(word in caption for caption in caption_list)

for word in selected_words:
    # Add (or overwrite) a column named after the word: 1 when at least one
    # caption of the row contains it, 0 otherwise. The result is coerced to
    # an int so the column never holds None and sums reliably. Assigning to
    # a missing column appends it at the end, so no prior insert is needed.
    df_captions[word] = df_captions['captions'].apply(
        lambda caption_list, word=word: 1 if is_in_caption(caption_list, word) else 0
    )



# Bar chart (plotly) of how many rows contain each of the selected words
import plotly.graph_objects as go

# Sum each indicator column once and reuse the result for both axes
selected_word_counts = df_captions[selected_words].sum()

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=selected_word_counts.index,
        y=selected_word_counts.values,
    )
)
fig.update_layout(
    title_text='Number of rows with each word from the selected words',
    width=800,
    height=400,
)
fig.show()

# Save the figure to disk as a PNG file
fig.write_image("./analysis/images/selected_words.png")