# LLM for analyzing Description content and evaluate the ad's validity

## 1. Initialize notebook

In [1]:
#import libraries
import pandas as pd
from langchain_community.llms import Ollama
import time


In [2]:
#import clean dataset
df = pd.read_csv('data_clean4.csv')

In [3]:
#create model
#llama3 is the most advanced model to run locally
llm = Ollama(model="llama3", temperature=0.2)
#temperature 0.2 for increased accuracy and less creativity, according to the task at hand 

## 2. Prompt engineering

In [4]:
#template that will be used in all calls of the llm

PROMPT_TEMPLATE = """\
You are a good detective specialized in the housing market. You will get a text describing a house sale from a website which may be in Portuguese or Spanish.
The houses should be sold in good condition, or about to be ready to live in. However, some houses are ruins or only have the project of the house, but no building at all.
Your job is to get read the description and tell me if the house is not proper for an ad because of it not having a building or being a ruin or some condition that makes inhabitable.

Considering the following description:
----------------
{description}
----------------

Tell me if the house is not proper for an ad because of it not having a building or being a ruin or some condition that makes inhabitable, in which case respond only 'no' (not proper) or 'yes' (proper).
Do not say anything else.
"""

## 3. Evaluate content of Description and get LLM's evaluation

In [6]:
list(range(17))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

In [45]:
#list to store the answers, this list will turn into a new column for the df


# Record the start time
start_time = time.time()


# Loop through the first 100 rows of the DataFrame
for j in range (17):
    lista = []
    rows = []
    for i in range(len(df)):
        row = j*100+i
        # Check if the 'description' field is NaN
        if pd.isna(df.iloc[row]['description']):
            answer = 'empty'
        else:
            # Format the prompt with the description
            description = df.iloc[row]['description']
            prompt = PROMPT_TEMPLATE.format(description=description)
            # Invoke the LLM with the formatted prompt
            answer = llm.invoke(prompt)
        # Append the answer to the list
        lista.append(answer)
        rows.append(row)

        #create Series and save to disk
        series = pd.Series(data=)
    
    # Record the end time
    end_time = time.time()
    
    # Calculate the duration
    duration = end_time - start_time
    
    # Print the time taken
    print(f"Time taken for the loop: {duration:.0f} seconds")
    
    
    # Print the list of answers
    print(lista)

Time taken for the loop: 227 seconds
['empty', 'empty', 'empty', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'empty', 'empty', 'Yes', 'empty', 'Yes', 'empty', 'Yes', 'Yes', 'Yes', 'Yes', 'empty', 'empty', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'empty', 'empty', 'empty', 'Yes', 'Yes', 'empty', 'Yes', 'Yes', 'Yes', 'empty', 'empty', 'No', 'empty', 'Yes', 'Yes', 'empty', 'empty', 'empty', 'Yes', 'Yes', 'empty', 'Yes', 'Yes', 'Yes', 'empty', 'Yes', 'Yes', 'Yes', 'empty', 'Yes', 'empty', 'Yes', 'Yes', 'empty', 'Yes', 'Yes', 'Yes', 'empty', 'Yes', 'Yes', 'empty', 'empty', 'Yes', 'empty', 'Yes', 'Yes', 'Yes', 'Yes', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'Yes', 'empty', 'empty', 'empty', 'empty', 'empty', 'Yes', 'Yes', 'Yes', 'empty', 'empty', 'Yes', 'Yes', 'Yes']


In [50]:
#evaluate performance
yes=lista.count('Yes')
no=lista.count('No')
percent=(yes+no)/len(lista)
print(f'number of "yes": {yes}')
print(f'number of "no": {no}')
total=yes+no
print(f'Percent of entries with description is: {total/len(lista)*100:.1f}%')
print(f'"No" percentage is: {no/(total)*100:.1f}%')

number of "yes": 52
number of "no": 6
Percent of entries with description is: 58.0%
"No" percentage is: 10.3%


In [52]:
total_duration = duration/total*8500/60/60 #aprox duration of analyzing the whole dataset (in hours)
print(f'Analyzing the whole dataset would take {total_duration} hours.')

Analyzing the whole dataset would take 9.256667673473617 hours.


## Next Steps
- run model on the whole dataset
- create a new column in the df with the result from the LLM, and  eclude the 'no' entries
- re-train ML model on the new, cleaner dataset
- re-analyze data on the new, cleaner dataset