In [1]:
import pandas as pd
import ast
import os
from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file (if any) into the process environment,
# then look up the Groq key; `api_key` is None when the variable is not set.
load_dotenv()
api_key = os.environ.get("GROQ_API_KEY")

In [2]:
# Raw Google reviews; the path is relative to the notebook's working directory.
df = pd.read_csv("reviews_google.csv")

In [3]:
# Lift pandas' display truncation limits (rows, columns, cell width) so long
# review texts are rendered in full.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [4]:
# Preview the first rows of the raw reviews table.
df.head()

Unnamed: 0,user_id,name,date,rating,text,gmap_id
0,115087327175786879005,James Drummond,2019-05-06 01:42:12+00:00,1,"The location is a franchise of sorts operated by Sodexo, the food service intermediary working on contract for the university. As a result, the service is not and will never be up to Starbucks's standards. This is less about the quality of the food & beverages, and more to do with the quality and friendliness of the servers. I personally have only had to deal with slow service, but that could happen at any time during a rush. The worst stories come from others; I have been around while the building was hosting an added-security event, which required the presence of LEOs; the baristas refused to provide him with a glass of water without charging him. At other foodservice locations in the same building and operated by the same company, not even students are charged for water glasses. That one event does a good job summarizing the apathy and disregard the employees have for their customers. Another critical issue is that this location doesn't accept Starbucks's own rewards program, which is extremely annoying after having signed up for the program just for the convenience of the location being in the same building that I work for. I've personally made a resolution to never purchase another item at this location again.",0x88891beed225fed1:0x3c63ad3e69972d22
1,103797448577708424762,Matthew Pearson,2019-04-19 12:10:35+00:00,1,"Go to the one in Sterne. This place is a mess. Wrong size coffee, stale croissant, long wait.",0x88891beed225fed1:0x3c63ad3e69972d22
2,105327613948110783717,Gargya Malla,2021-03-23 19:05:56+00:00,1,,0x88891beed225fed1:0x3c63ad3e69972d22
3,109991408114401717608,Tessa Moore,2019-10-25 11:47:08+00:00,1,,0x88891beed225fed1:0x3c63ad3e69972d22
4,101745792789153591390,Shayla Rose,2021-06-07 16:53:41+00:00,1,,0x88891beed225fed1:0x3c63ad3e69972d22


In [3]:
# Keep only the reviews that actually have text (rows with a missing `text` are dropped).
reviews = df["text"].dropna().tolist()

In [4]:
# Compare the size of the full table with the number of reviews that have text.
print(df.shape)
len(reviews)


(156729, 6)


75035

In [21]:
# Spot-check a single review text.
print(reviews[5])

Rude


In [5]:
from groq import Groq

# Shared Groq client, created lazily on the first request. Re-running this cell
# resets it, so a changed `api_key` is picked up.
_GROQ_CLIENT = None


def _get_groq_client():
    """Return the shared Groq client, creating it on first use."""
    global _GROQ_CLIENT
    if _GROQ_CLIENT is None:
        _GROQ_CLIENT = Groq(api_key=api_key)
    return _GROQ_CLIENT


def cluster_and_tag(review, client=None, model="llama3-70b-8192", temperature=1):
    """Ask a Groq-hosted LLM to classify one review and parse its answer.

    The prompt asks the model to assign the review to one of five clusters
    (Service, Place, Coffee, Food, Time) and to add a positive / neutral /
    negative tag, demonstrated with two few-shot examples.

    Args:
        review: Text of the review to classify.
        client: Optional Groq client to use. When omitted, a single shared
            client is reused across calls instead of building one per review.
        model: Groq model identifier (defaults to the original hard-coded model).
        temperature: Sampling temperature (defaults to the original value, 1).

    Returns:
        Whatever Python literal the model's answer evaluates to. Following the
        few-shot examples this is expected to be a dict with 'cluster' and
        'tag' keys, or a list of such dicts when several clusters apply, but
        the model's output is not validated here.

    Raises:
        ValueError, SyntaxError: If the answer is not a valid Python literal
            (raised by ``ast.literal_eval``).
        Any exception raised by the Groq client for the request itself.
    """
    if client is None:
        client = _get_groq_client()

    # NOTE(review): the prompt strings are kept byte-for-byte. They contain
    # typos, and the few-shot examples are inconsistent with each other
    # ('Service' vs 'place', and only the first example has a "Review: "
    # prefix). Consider cleaning them up in a deliberate, separately
    # evaluated prompt change.
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a business specialist. You have to clasisify Starbucks client reviews into one of five clusters: \n1) Service.\n2) Place.\n3) Coffee.\n4) Food.\n5) Time\n\nAditionally, add a positive, neutral or negative tag to the review.\n\n. Return your answer in the specify format, without any other messages."
            },
            {
                "role": "user",
                "content": "Review: The location is a franchise of sorts operated by Sodexo, the food service intermediary working on contract for the university. As a result, the service is not and will never be up to Starbucks's standards. This is less about the quality of the food & beverages, and more to do with the quality and friendliness of the servers. I personally have only had to deal with slow service, but that could happen at any time during a rush. The worst stories come from others; I have been around while the building was hosting an added-security event, which required the presence of LEOs; the baristas refused to provide him with a glass of water without charging him. At other foodservice locations in the same building and operated by the same company, not even students are charged for water glasses. That one event does a good job summarizing the apathy and disregard the employees have for their customers. Another critical issue is that this location doesn't accept Starbucks's own rewards program, which is extremely annoying after having signed up for the program just for the convenience of the location being in the same building that I work for. I've personally made a resolution to never purchase another item at this location again."
            },
            {
                "role": "assistant",
                "content": "{'cluster': 'Service', 'tag': 'negative'}"
            },
            {
                "role": "user",
                "content": "Go to the one in Sterne. This place is a mess. Wrong size coffee, stale croissant, long wait."
            },
            {
                "role": "assistant",
                "content": "[{'cluster': 'place', 'tag': 'negative'},{'cluster': 'food', 'tag': 'negative'}]"
            },
            {
                "role": "user",
                "content": f"Review: {review}"
            }
        ],
        temperature=temperature,
        max_tokens=1024,
        top_p=1,
        # The answer is only used once it is complete, so streaming the
        # chunks and re-joining them adds nothing; request it in one piece.
        stream=False,
        stop=None,
    )
    answer = (completion.choices[0].message.content or "").strip()

    # literal_eval only accepts Python literals, so model output is never executed.
    return ast.literal_eval(answer)
    


Abro el archivo con las reviews ya analizadas. Este trabajo se hace de a partes porque analizar 500 reviews lleva 40 minutos.

In [6]:
# Reviews labelled in previous runs; the path is relative to the working directory.
clustered = pd.read_csv("reviews_cluster.csv")
clustered.shape

(2144, 3)

Selecciono al azar 500 reviews que no hayan sido analizadas. Se hace al azar porque las reviews venían ordenadas por puntaje y queremos que todas las muestras sean representativas para tener un MVP en caso de no llegar a procesar todas las reviews. 

In [7]:
import random

# Build the lookup once. The original rebuilt `list(clustered.reviews)` for
# every candidate review, making the filter O(len(reviews) * len(clustered)).
already_clustered = set(clustered.reviews)
pending_reviews = [review for review in reviews if review not in already_clustered]

# random.sample raises ValueError when asked for more items than are available,
# so cap the batch size for the final, smaller batch.
r = random.sample(pending_reviews, min(500, len(pending_reviews)))

In [79]:
# NOTE(review): as far as this notebook shows, `reviews_cluster` is first
# assigned in a later cell, so on a fresh kernel run top-to-bottom this line
# would raise NameError. The output below comes from an out-of-order run.
reviews_cluster.shape

(500, 3)

Corremos el código que analiza cada review y devuelve un diccionario con la categoría que se está calificando y el tipo de calificación (Positivo-Neutro-Negativo). En caso de que la review califique más de una categoría, devuelve una lista de diccionarios.

In [None]:
# Classify every sampled review. `clusters[i]` / `tags[i]` hold the labels for
# `r[i]`: a string when the model returned one dict, a list of strings when it
# returned a list of dicts, and None when the answer was unusable.
clusters = []
tags = []
failed = 0
for review in r:
    # Work out both labels before appending anything. The original appended
    # the cluster first, so a missing 'tag' key made the except branch append
    # a second entry to `clusters` and left the two lists misaligned.
    cluster, tag = None, None
    try:
        analisis = cluster_and_tag(review)
        print(analisis)
        if isinstance(analisis, list):
            found_clusters = [element['cluster'] for element in analisis]
            found_tags = [element['tag'] for element in analisis]
            cluster, tag = found_clusters, found_tags
        elif isinstance(analisis, dict):
            cluster, tag = analisis['cluster'], analisis['tag']
    except Exception as e:
        # Best effort: keep the batch going, but report the failure instead of
        # swallowing it silently.
        failed += 1
        print(f"Could not classify review ({type(e).__name__}: {e}): {str(review)[:80]!r}")
    clusters.append(cluster)
    tags.append(tag)

if failed:
    print(f"{failed} of {len(r)} reviews could not be classified")

reviews_cluster = pd.DataFrame({'reviews': r, 'clusters': clusters, 'tags': tags})


In [35]:
def cap(el):
    """Capitalize a label or every string label in a list of labels.

    Args:
        el: A string, a list of labels, or any other value (e.g. None).

    Returns:
        ``el.capitalize()`` for a string. For a list, a new list with every
        string element capitalized and non-string elements (such as None)
        left untouched. Any other value is returned unchanged.
    """
    if isinstance(el, str):
        return el.capitalize()
    if isinstance(el, list):
        # Labels come from parsed model output, so a list may contain
        # non-strings (e.g. None); calling .capitalize() on them would raise.
        return [e.capitalize() if isinstance(e, str) else e for e in el]
    return el

In [81]:
# Normalise the casing of both label columns with `cap`, so variants such as
# 'service' and 'Service' end up as the same label.
for _label_column in ('clusters', 'tags'):
    reviews_cluster[_label_column] = reviews_cluster[_label_column].apply(cap)

In [82]:
# Turn list-valued labels into one row per (cluster, tag) pair; both columns are
# exploded together. NOTE(review): this reassigns `reviews_cluster`, so re-running
# the cell operates on the already-exploded frame.
reviews_cluster = reviews_cluster.explode(['clusters', 'tags'])

In [84]:
# Row count after exploding multi-label reviews.
reviews_cluster.shape

(723, 3)

In [83]:
# Count how often each tag occurs within each cluster for this batch.
reviews_cluster.groupby("clusters")["tags"].value_counts()

clusters  tags    
Coffee    Positive    133
          Negative     40
          Neutral      20
Food      Positive     28
          Negative     13
          Neutral       2
Neutral   Neutral       1
None      None          2
          Neutral       1
Place     Positive    115
          Neutral      26
          Negative     25
Service   Positive    204
          Negative     48
          Neutral      12
Time      Negative     25
          Positive     20
          Neutral       7
Name: count, dtype: int64

Abro el archivo `reviews_cluster.csv` (si existe) y le agrego las nuevas reviews etiquetadas.

In [85]:
import os

# Location of the CSV that accumulates the labelled reviews
file_path = 'reviews_cluster.csv'

if not os.path.exists(file_path):
    # First batch: there is nothing to append to, so the new frame is the result
    updated_df = reviews_cluster
else:
    # Load the earlier batches and put the freshly labelled rows after them
    existing_df = pd.read_csv(file_path)
    updated_df = pd.concat(objs=[existing_df, reviews_cluster], ignore_index=True)

# Write the combined table back to the same file, without the index column
updated_df.to_csv(file_path, index=False)