# LLM for analyzing Description content and evaluate the ad's validity

## 1. Initialize notebook

In [1]:
#import libraries
import pandas as pd
from langchain_community.llms import Ollama
import time


In [2]:
#import clean dataset
df = pd.read_csv('data_clean4.csv')

In [3]:
#create model
#llama3 is the most advanced model to run locally
llm = Ollama(model="llama3", temperature=0.2)
#temperature 0.2 for increased accuracy and less creativity, according to the task at hand 

## 2. Prompt engineering

In [4]:
#template that will be used in all calls of the llm

PROMPT_TEMPLATE = """\
You are a good detective specialized in the housing market. You will get a text describing a house sale from a website which may be in Portuguese or Spanish.
The houses should be sold in good condition, or about to be ready to live in. However, some houses are ruins or only have the project of the house, but no building at all.
Your job is to get read the description and tell me if the house is not proper for an ad because of it not having a building or being a ruin or some condition that makes inhabitable.

Considering the following description:
----------------
{description}
----------------

Tell me if the house is not proper for an ad because of it not having a building or being a ruin or some condition that makes inhabitable, in which case respond only 'no' (not proper) or 'yes' (proper).
Do not say anything else.
"""

## 3. Evaluate content of Description and get LLM's evaluation

In [9]:
#list to store the answers, this list will turn into a new column for the df


# Record the start time
start_time = time.time()


# Loop through the first 100 rows of the DataFrame
for j in range (17): #to save data every 1000 iterations
    lista = []
    rows = []
    print(f'Working on row {row} and the next 999.')
    
    inner_start = time.time()
    for i in range(1000): # Process each row in the current chunk
        row = j*1000+i
        if row%100 == 0:
            print(f'Working on row {row}')
        if row >= len(df): # Exit if row exceeds DataFrame length
            break  
        # Check if the 'description' field is NaN
        if pd.isna(df.iloc[row]['description']):
            answer = 'empty'
        else:
            # Format the prompt with the description
            description = df.iloc[row]['description']
            prompt = PROMPT_TEMPLATE.format(description=description)
            # Invoke the LLM with the formatted prompt
            answer = llm.invoke(prompt)
        # Append the answer to the list
        lista.append(answer)
        rows.append(row)

    #create Series and save to disk
    series = pd.Series(data=lista, index=rows)
    series.to_csv(f'series_{row}.csv') #saves name with last row from that group
    print('Saved data.')

    #time spent
    inner_end = time.time()
    inner_duration = inner_end-inner_start
    print(f'This cycle took {inner_duration:.0f} seconds')



# Record the end time
end_time = time.time()
duration = end_time - start_time
print(f"Time taken for the loop: {duration:.0f} seconds")

Working on row 4437 and the next 999.
Working on row 0
Working on row 100
Working on row 200
Working on row 300
Working on row 400
Working on row 500
Working on row 600
Working on row 700
Working on row 800
Working on row 900
Saved data.
This cycle took 0 seconds
Working on row 999 and the next 999.
Working on row 1000
Working on row 1100
Working on row 1200
Working on row 1300
Working on row 1400
Working on row 1500
Working on row 1600
Working on row 1700
Working on row 1800
Working on row 1900
Saved data.
This cycle took 3 seconds
Working on row 1999 and the next 999.
Working on row 2000
Working on row 2100
Working on row 2200
Working on row 2300
Working on row 2400
Working on row 2500
Working on row 2600
Working on row 2700
Working on row 2800
Working on row 2900
Saved data.
This cycle took 0 seconds
Working on row 2999 and the next 999.
Working on row 3000
Working on row 3100
Working on row 3200
Working on row 3300
Working on row 3400
Working on row 3500
Working on row 3600
Working

In [50]:
#evaluate performance
yes=lista.count('Yes')
no=lista.count('No')
percent=(yes+no)/len(lista)
print(f'number of "yes": {yes}')
print(f'number of "no": {no}')
total=yes+no
print(f'Percent of entries with description is: {total/len(lista)*100:.1f}%')
print(f'"No" percentage is: {no/(total)*100:.1f}%')

number of "yes": 52
number of "no": 6
Percent of entries with description is: 58.0%
"No" percentage is: 10.3%


In [52]:
total_duration = duration/total*8500/60/60 #aprox duration of analyzing the whole dataset (in hours)
print(f'Analyzing the whole dataset would take {total_duration} hours.')

Analyzing the whole dataset would take 9.256667673473617 hours.


## Next Steps
- run model on the whole dataset
- create a new column in the df with the result from the LLM, and  eclude the 'no' entries
- re-train ML model on the new, cleaner dataset
- re-analyze data on the new, cleaner dataset