#  **Fynd-Ai-Intern-Assessment Date 5/12/2025**

#### **Importing Required Libraries**

In [104]:
# import libraries
import os
from dotenv import load_dotenv
import pandas as pd
from openai import OpenAI
import json

#### **Data Loading and Preparation**

In [105]:
# Load the Yelp reviews dataset from the local CSV file into a DataFrame.
data = pd.read_csv("yelp.csv")

In [106]:
# Preview the first five rows of the raw dataset.
data.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [107]:
# Check column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   business_id  10000 non-null  object
 1   date         10000 non-null  object
 2   review_id    10000 non-null  object
 3   stars        10000 non-null  int64 
 4   text         10000 non-null  object
 5   type         10000 non-null  object
 6   user_id      10000 non-null  object
 7   cool         10000 non-null  int64 
 8   useful       10000 non-null  int64 
 9   funny        10000 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 781.4+ KB


In [108]:
# Number of rows and columns in the raw dataset.
data.shape
 

(10000, 10)

In [109]:
# Keep only the review text and its star rating; every other column is dropped.
data = data.loc[:, ['text', 'stars']]

In [110]:
# Shape of the reduced DataFrame (only 'text' and 'stars' remain).
data.shape

(10000, 2)

In [111]:
# Preview the first five rows of the two retained columns.
data.head()

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,love the gyro plate. Rice is so good and I als...,4
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,General Manager Scott Petello is a good egg!!!...,5


In [112]:
# Inspect the last five rows as well.
data.tail()

Unnamed: 0,text,stars
9995,First visit...Had lunch here today - used my G...,3
9996,Should be called house of deliciousness!\n\nI ...,4
9997,I recently visited Olive and Ivy for business ...,4
9998,My nephew just moved to Scottsdale recently so...,2
9999,4-5 locations.. all 4.5 star average.. I think...,5


In [113]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [114]:
# Shape after dropping null rows (length filtering and sampling still follow below).
data.shape

(10000, 2)

In [115]:
# Remove very long reviews: keep only rows whose text has at most
# MAX_REVIEW_WORDS whitespace-separated words.
# .str.len() counts the split tokens in a vectorized way instead of a
# Python-level .apply(len).
MAX_REVIEW_WORDS = 500
data = data[data['text'].str.split().str.len() <= MAX_REVIEW_WORDS]

In [116]:
# Shape after removing reviews longer than 500 words.
data.shape

(9847, 2)

In [117]:
# Draw a reproducible random sample of 200 rows (fixed seed) for faster
# processing, then renumber the rows from 0.
data = (
    data
    .sample(n=200, random_state=42)
    .reset_index(drop=True)
)

In [118]:
# Clean the review text so it can be embedded inside a double-quoted prompt:
# newlines become spaces and double quotes become single quotes.
# regex=False forces literal replacement; the default of `regex` differs
# between pandas versions, so it is stated explicitly.
data['text'] = (
    data['text']
    .str.replace('\n', ' ', regex=False)
    .str.replace('"', "'", regex=False)
)


In [119]:
# Print the sampled, cleaned DataFrame (pandas abbreviates the middle rows in text output).
print(data)

                                                  text  stars
0    This place has a great selection of Korean dis...      4
1    This place is the best place I've found so far...      5
2    Okay, I will start out by saying that I just p...      5
3    My husband and I heard great reviews about thi...      1
4    Where I live in Tempe, I can walk to the end o...      3
..                                                 ...    ...
195  Great variety.  Great price!  Beats other buff...      5
196  I go here a lot. Well...used to. Service gets ...      3
197  Based off of these great reviews i decided to ...      4
198  Got a padi and the tech stopped for a while to...      1
199  I spent most of my first year being legal drin...      2

[200 rows x 2 columns]


In [120]:
# Renumber rows 0..n-1. The index was already reset right after sampling and no
# rows have been removed since, so this is a harmless no-op kept as a safeguard.
data = data.reset_index(drop=True)

In [121]:
# Rename the 'text' column to 'review' so its meaning is clearer.
data = data.rename({'text': 'review'}, axis='columns')

In [122]:
# Confirm the rename by printing the column names.
print(data.columns)

Index(['review', 'stars'], dtype='object')


In [123]:
# Take the first sampled review as the single test input that the prompts below interpolate.
review = data['review'].iloc[0] 

In [124]:
# Summarize the prepared dataset: its shape and its column names.
print("Final shape of the dataset:", data.shape)
print("The columns in the dataset are:", data.columns.tolist())

Final shape of the dataset: (200, 2)
The columns in the dataset are: ['review', 'stars']


#### **LLM Operations On Data**

In [125]:
# Load variables from a local .env file into the process environment so the
# API key can be read with os.getenv below.
load_dotenv()

True

In [126]:
# Read the API key from the "GAK" environment variable.
api = os.getenv("GAK")
if not api:
    # Fail fast with an actionable message: os.getenv returns None for a missing
    # variable, which would otherwise only surface later as a client/auth error.
    raise RuntimeError(
        "Environment variable 'GAK' is not set; define it in your .env file or shell."
    )

In [127]:
# Build an OpenAI SDK client that targets Google's generativelanguage endpoint,
# authenticating with the key read above.
client = OpenAI(base_url="https://generativelanguage.googleapis.com/v1beta", api_key=api)


In [128]:
# Prompt 1 (zero-shot): a system message that defines the star scale, followed by
# a user message carrying the review and the JSON shape to return.
prompt_1 = [
    dict(
        role="system",
        content="""
You classify Yelp reviews into 1-5 stars.
Your output must be valid JSON and must not contain extra text.
Star definitions:
1 = very negative
2 = negative
3 = neutral/mixed
4 = positive
5 = very positive
""",
    ),
    dict(
        role="user",
        content=f"""
Review: "{review}"

Return ONLY the JSON object:
{{
  "predicted_stars": <integer>,
  "explanation": "short reason"
}}
""",
    ),
]



In [129]:

# Zero-shot run: send prompt_1 to the gemini-2.0-flash model and print the raw reply text.
response = client.chat.completions.create(messages=prompt_1, model="gemini-2.0-flash")

print(response.choices[0].message.content)

```json
{
  "predicted_stars": 5,
  "explanation": "Mentions great selection, family favorite, friendly staff, and great experience indicating very positive sentiment."
}
```


In [130]:
# Prompt 2 (few-shot): two worked user/assistant examples demonstrate the exact
# JSON reply format before the real review is supplied as the final user turn.
# The star range in the system message is written with a plain hyphen
# (it was previously a mis-encoded dash).
prompt_2 = [
    {
        "role": "system",
        "content": """
You classify Yelp reviews into 1-5 stars.
Your output must be valid JSON and contain no additional text.
Follow the format shown in the examples.
"""
    },
    {
        "role": "user",
        "content": "Review: 'Terrible food, rude staff.'"
    },
    {
        "role": "assistant",
        "content": '{"predicted_stars": 1, "explanation": "very negative experience"}'
    },
    {
        "role": "user",
        "content": "Review: 'Great service and delicious food!'"
    },
    {
        "role": "assistant",
        "content": '{"predicted_stars": 5, "explanation": "very positive"}'
    },
    {
        "role": "user",
        "content": f'Review: "{review}"'
    }
]

In [131]:
# Few-shot run: send prompt_2 to the model and print the raw reply text.
# The variable is `prompt_2` (lowercase); Python names are case-sensitive, so
# `Prompt_2` raises NameError on a fresh kernel.
response = client.chat.completions.create(
    model="gemini-2.0-flash",
    messages=prompt_2,
)

print(response.choices[0].message.content)

{"predicted_stars": 5, "explanation": "Mentions a great selection of food, family favorite, friendly staff, and overall great experience."}


In [132]:
# Prompt 3 (hidden reasoning): the system message asks the model to reason
# step-by-step internally and reply with the JSON object only.
prompt_3 = [
    dict(
        role="system",
        content="""
Think step-by-step internally but DO NOT reveal your reasoning.
Only return the final JSON output.
Ensure the JSON is valid and contains no extra text.
""",
    ),
    dict(
        role="user",
        content=f"""
Review: "{review}"

Return JSON in the following format:
{{
  "predicted_stars": <integer>,
  "explanation": "brief reason"
}}
""",
    ),
]

In [133]:
# Hidden-reasoning run: send prompt_3 to the gemini-2.0-flash model and print the raw reply text.
response = client.chat.completions.create(messages=prompt_3, model="gemini-2.0-flash")

print(response.choices[0].message.content)

```json
{
  "predicted_stars": 5,
  "explanation": "Mentions great food selection, family favorite dishes, friendly staff, and a great overall experience, outweighing the slight slowness of service."
}
```
