<a href="https://colab.research.google.com/github/SaranshOp/BTP/blob/main/Analysis_Through_Gemini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Working Environment

In [None]:
# Mount Google Drive at /content/drive so files stored there are readable
# from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder on the mounted Drive the working directory, then
# list it to confirm the dataset file is present.
%cd /content/drive/MyDrive/Gemini_Hands_On/
!ls

/content/drive/MyDrive/Gemini_Hands_On
Reviews_handson.tsv


### Import Dataset

In [None]:
import pandas as pd

# The dataset is tab-separated; preview the first 10 rows to check the columns.
data = pd.read_csv('Reviews_handson.tsv', sep='\t')
data.head(10)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1
8,5,30-Jul-18,Heather Gray Fabric,looks great,1
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1


In [None]:
# Keep only the review text and the binary feedback flag, renamed to the
# shorter 'review' / 'label' used by the rest of the notebook.
mydata = data[['verified_reviews', 'feedback']].rename(
    columns={'verified_reviews': 'review', 'feedback': 'label'}
)

mydata.head()

Unnamed: 0,review,label
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [None]:
# Class distribution: number of rows per value of 'label'.
mydata.value_counts('label')

label
1    2893
0     257
Name: count, dtype: int64

In [None]:
# Balance the two classes by randomly under-sampling the larger one down to
# the size of the smaller one (assumes a binary 'label' column).
label_counts = mydata["label"].value_counts()

# Look the majority label up instead of hard-coding 1, so the cell still does
# the right thing if the class proportions are ever reversed.
majority_label = label_counts.idxmax()

# Number of surplus rows in the majority class.
rows_to_drop = label_counts.max() - label_counts.min()

if rows_to_drop > 0:
    data_majority = mydata[mydata["label"] == majority_label]
    # Fixed seed: Restart & Run All removes the same rows every time.
    surplus_index = data_majority.sample(rows_to_drop, random_state=42).index
    data_balanced = mydata.drop(surplus_index)
else:
    # Already balanced: work on a copy so later column additions never touch mydata.
    data_balanced = mydata.copy()

print(data_balanced["label"].value_counts())

label
1    257
0    257
Name: count, dtype: int64


## Data Preprocessing

In [None]:
import html
import re

def clean_text(text):
  """Normalize a raw review into lowercase, punctuation-free text.

  Steps, in order:
    1. Remove HTML tags. This must happen before punctuation stripping,
       otherwise the '<' and '>' delimiters are already gone and tags such
       as '<br/>' survive as stray words.
    2. Decode HTML entities (e.g. '&#34;' -> '"') so they do not leave
       numeric residue such as '34' in the cleaned text.
    3. Replace punctuation / special characters with spaces.
    4. Drop stand-alone single ASCII letters.
    5. Lowercase, collapse runs of whitespace, and trim.

  Args:
    text: The raw review. Non-string values (e.g. NaN for a missing review)
      are treated as empty.

  Returns:
    The cleaned string; '' when the input is not a string.
  """
  if not isinstance(text, str):
    return ""

  # Remove HTML tags while their angle brackets are still present
  text = re.sub(r"<[^>]*>", " ", text)

  # Decode entities after tag removal so a literal '&lt;' is never mistaken
  # for the start of a tag
  text = html.unescape(text)

  # Remove special characters and punctuation
  text = re.sub(r"[^\w\s]", " ", text)

  # Remove single characters
  text = re.sub(r"\b[a-zA-Z]\b", " ", text)

  # Lowercase the text
  text = text.lower()

  # Remove extra whitespace
  text = re.sub(r"\s+", " ", text)

  # Trim leading and trailing spaces
  return text.strip()

In [None]:
import pandas as pd

# Raw review texts as a plain Python list, in data_balanced's row order.
reviews = data_balanced['review'].tolist()

In [None]:
# Column dtypes and non-null counts; shows whether any review text is missing.
data_balanced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 514 entries, 0 to 3142
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  513 non-null    object
 1   label   514 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 12.0+ KB


In [None]:
# Number of reviews extracted into the list.
len(reviews)

514

In [None]:
# Clean every review. Missing values are not strings (pandas stores them as
# NaN), so they get a single-space placeholder; this keeps the list the same
# length as `reviews` and aligned with data_balanced's rows.
cleaned_reviews = [
    clean_text(review) if isinstance(review, str) else " "
    for review in reviews
]

# Sanity check: one cleaned entry per input review. (Previously this measured
# the character length of the last cleaned string by mistake.)
len(cleaned_reviews)

110

In [None]:
# Attach the cleaned texts as a new column. The list is assigned
# positionally, so it must hold exactly one entry per row of data_balanced.
data_balanced['clean_reviews'] = cleaned_reviews
data_balanced

Unnamed: 0,review,label,clean_reviews
0,Love my Echo!,1,love my echo
18,We love the size of the 2nd generation echo. S...,1,we love the size of the 2nd generation echo st...
24,"I got a second unit for the bedroom, I was exp...",1,got second unit for the bedroom was expecting ...
31,I like it,1,like it
36,Love my Echo. Still learning all the things it...,1,love my echo still learning all the things it ...
...,...,...,...
3096,The product sounded the same as the emoji spea...,0,the product sounded the same as the emoji spea...
3109,Easy to set up and connect with smart devices....,1,easy to set up and connect with smart devices ...
3115,It is just not as loud as I thought it was goi...,1,it is just not as loud as thought it was going...
3138,Este producto llegó y a la semana se quedó sin...,1,este producto llegó la semana se quedó sin olo...


## Data Split

In [None]:
import pandas as pd

# Hold out 95% of the balanced data as the evaluation set; the remaining 5%
# is kept as train_set.
total_rows = len(data_balanced)
test_size = int(total_rows * 0.95)

# Fixed seed so the same rows are held out on every Restart & Run All.
test_set = data_balanced.sample(test_size, random_state=42)

# Everything not sampled into the test set.
train_set = data_balanced.drop(test_set.index)

## Sentiment Analysis with an LLM

### Setting up Gemini API

In [None]:
# Quietly install (or upgrade to the latest release of) the Gemini client
# library. NOTE(review): the version is not pinned, so results may change as
# the package is updated.
!pip install -q -U google-generativeai

In [None]:
# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Wrap text in a Markdown blockquote for rich notebook display.

  Every '•' is rewritten as '  *', then every line of the text -- blank
  lines included -- is prefixed with '> '.

  Args:
    text: The string to render.

  Returns:
    An IPython Markdown display object holding the quoted text.
  """
  bulleted = text.replace('•', '  *')
  # textwrap.indent skips whitespace-only lines unless the predicate accepts
  # them; accept every line so the quote block stays contiguous.
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

# Used to securely store your API key
from google.colab import userdata

In [None]:
# Read the API key from Colab's userdata store rather than hard-coding it in
# the notebook.
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

# Register the key with the client library for all later calls.
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
# Print the name of every available model that supports generateContent.
content_model_names = (
    candidate.name
    for candidate in genai.list_models()
    if 'generateContent' in candidate.supported_generation_methods
)
for model_name in content_model_names:
  print(model_name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [None]:
# Client handle for the 'gemini-pro' model; used by every request below.
model = genai.GenerativeModel('gemini-pro')

In [None]:
%%time
# Smoke test: send one free-form question and render the reply as a Markdown
# blockquote. (%%time has to stay on the first line of the cell.)
response = model.generate_content("What is the meaning of life?")

to_markdown(response.text)

CPU times: user 128 ms, sys: 14 ms, total: 142 ms
Wall time: 10.5 s


> **Existential and Philosophical Perspectives:**
> 
> * **Absolutism:** Life has an inherent, objective meaning that exists independently of human interpretation.
> * **Existentialism:** Life has no inherent meaning, and individuals must create their own.
> * **Nihilism:** Life is meaningless and devoid of purpose.
> * **Hedonism:** The meaning of life lies in the pursuit of pleasure and the avoidance of pain.
> * **Utilitarianism:** The purpose of life is to maximize happiness and well-being.
> * **Stoicism:** Life is a series of challenges and obstacles that should be met with equanimity and acceptance.
> 
> **Religious and Spiritual Beliefs:**
> 
> * **Theism:** Life has a divinely ordained purpose, such as serving God or experiencing spiritual growth.
> * **Humanism:** The meaning of life lies in the cultivation of human values and the pursuit of a fulfilling life.
> * **Buddhism:** Life is characterized by suffering and impermanence, and the meaning lies in breaking the cycle of rebirth through enlightenment.
> * **Hinduism:** Life is part of a continuous cycle of birth, death, and rebirth, with the ultimate goal of attaining moksha (liberation).
> 
> **Psychological and Developmental Theories:**
> 
> * **Maslow's Hierarchy of Needs:** The meaning of life involves meeting basic needs (e.g., food, safety) and pursuing higher-order needs (e.g., self-actualization).
> * **Erikson's Stages of Life:** Life progresses through a series of developmental stages, each with its own unique purpose.
> * **Positive Psychology:** The meaning of life lies in experiencing positive emotions, having meaningful relationships, and engaging in purposeful activities.
> 
> **Personal Experiences and Interpretations:**
> 
> * **Purpose:** Finding a sense of purpose or direction in life can provide meaning.
> * **Relationships:** Building strong and fulfilling relationships with others can enrich life.
> * **Creativity and Expression:** Expressing oneself through creativity or other forms of self-expression can give life meaning.
> * **Service and Altruism:** Helping others and making a positive impact on the world can provide a sense of fulfillment.
> * **Legacy:** Leaving a lasting legacy through one's actions, ideas, or creations can give life purpose.
> 
> Ultimately, the meaning of life is a highly subjective and personal concept that varies greatly depending on individual beliefs, values, and experiences.

#### Single API Call

In [None]:
# Draw 20 test reviews for a single-request trial. The seed is fixed so the
# same rows (and therefore the same prompt) are produced on every re-run.
test_set_sample = test_set.sample(20, random_state=42)

# Empty placeholder the model is asked to fill in.
test_set_sample['pred_label'] = ''

test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
444,"Very puny sound, but works great.",1,very puny sound but works great,
1378,Fairly useless.,0,fairly useless,
424,Great product but returning for new Alexa Dot....,0,great product but returning for new alexa dot ...,
551,product stopped working after return time ran out,0,product stopped working after return time ran out,
871,Sad joke. Worthless.,0,sad joke worthless,
434,&#34;NEVER BUY CERTIFIED AND REFURBISHED ECHO ...,0,34 never buy certified and refurbished echo do...,
406,,0,,
1291,"item returned for repair ,receivded item back ...",0,item returned for repair receivded item back f...,
1764,,0,,
2558,"For the price, the product is nice quality and...",0,for the price the product is nice quality and ...,


In [None]:
# Serialize only the cleaned text and the (still empty) pred_label column as
# a JSON array with one object per review, ready to embed in the prompt.

json_data = test_set_sample[['clean_reviews','pred_label']].to_json(orient='records')

print(json_data)

[{"clean_reviews":"very puny sound but works great","pred_label":""},{"clean_reviews":"fairly useless","pred_label":""},{"clean_reviews":"great product but returning for new alexa dot refurbished is already giving me problems with connection","pred_label":""},{"clean_reviews":"product stopped working after return time ran out","pred_label":""},{"clean_reviews":"sad joke worthless","pred_label":""},{"clean_reviews":"34 never buy certified and refurbished echo dot 34 bought 34 certified and refurbished 34 echo dot this unit is very poor in receiving command and sometimes it does not even acknowledge the command never buy refurbished echo dot the unit received was probably never was certified and refurbished as indicated also bought new echo dot it works perfectly very disappointed","pred_label":""},{"clean_reviews":"","pred_label":""},{"clean_reviews":"item returned for repair receivded item back from repair 07 23 18 parts missing no power cord included please advise","pred_label":""},{"

In [None]:
# Classification prompt: states the task and label encoding, asks for the
# same JSON back with 'pred_label' filled in, and embeds the serialized
# reviews (json_data) between backtick fences.
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
{json_data}
```
"""

print(prompt)


You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
[{"clean_reviews":"very puny sound but works great","pred_label":""},{"clean_reviews":"fairly useless","pred_label":""},{"clean_reviews":"great product but returning for new alexa dot refurbished is already giving me problems with connection","pred_label":""},{"clean_reviews":"product stopped working after return time ran out","pred_label":""},{"clean_reviews":"sad joke worthless","pred_label":""},{"clean_reviews":"34 never buy certified and refurbished echo dot 34 bought 34 certified and refurbished 34 ech

In [None]:
# Send the whole 20-review prompt in one request and show the raw reply.
response = model.generate_content(prompt)

print(response.text)

```
[{"clean_reviews":"very puny sound but works great","pred_label":1},{"clean_reviews":"fairly useless","pred_label":0},{"clean_reviews":"great product but returning for new alexa dot refurbished is already giving me problems with connection","pred_label":0},{"clean_reviews":"product stopped working after return time ran out","pred_label":0},{"clean_reviews":"sad joke worthless","pred_label":0},{"clean_reviews":"34 never buy certified and refurbished echo dot 34 bought 34 certified and refurbished 34 echo dot this unit is very poor in receiving command and sometimes it does not even acknowledge the command never buy refurbished echo dot the unit received was probably never was certified and refurbished as indicated also bought new echo dot it works perfectly very disappointed","pred_label":0},{"clean_reviews":"","pred_label":0},{"clean_reviews":"item returned for repair receivded item back from repair 07 23 18 parts missing no power cord included please advise","pred_label":0},{"clea

In [None]:
import json
import re

# The reply is expected to be a JSON array wrapped in a Markdown fence. The
# fence may carry a language tag (```json), which a plain strip("`") would
# leave in front of the payload and make json.loads fail. Extract everything
# from the first '[' to the last ']' instead.
raw_reply = response.text
array_match = re.search(r"\[.*\]", raw_reply, flags=re.DOTALL)
if array_match is None:
    raise ValueError(f"No JSON array found in model reply: {raw_reply[:200]!r}")

# Use dedicated names so the raw DataFrame `data` and the request payload
# `json_data` from earlier cells are not overwritten.
sample_records = json.loads(array_match.group(0))
df_sample = pd.DataFrame(sample_records)

df_sample

Unnamed: 0,clean_reviews,pred_label
0,very puny sound but works great,1
1,fairly useless,0
2,great product but returning for new alexa dot ...,0
3,product stopped working after return time ran out,0
4,sad joke worthless,0
5,34 never buy certified and refurbished echo do...,0
6,,0
7,item returned for repair receivded item back f...,0
8,,0
9,for the price the product is nice quality and ...,0


In [None]:
# Copy the model's predictions from df_sample into test_set_sample.
# The assignment is positional (.values ignores the index), so make sure the
# model returned exactly one record per review before trusting the alignment.
assert len(df_sample) == len(test_set_sample), (
    f"Model returned {len(df_sample)} records for {len(test_set_sample)} reviews"
)

# Cast to int so labels the model returned as strings ("1"/"0") compare
# correctly with the integer ground truth; anything else fails loudly here.
test_set_sample['pred_label'] = df_sample['pred_label'].astype(int).values
test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
444,"Very puny sound, but works great.",1,very puny sound but works great,1
1378,Fairly useless.,0,fairly useless,0
424,Great product but returning for new Alexa Dot....,0,great product but returning for new alexa dot ...,0
551,product stopped working after return time ran out,0,product stopped working after return time ran out,0
871,Sad joke. Worthless.,0,sad joke worthless,0
434,&#34;NEVER BUY CERTIFIED AND REFURBISHED ECHO ...,0,34 never buy certified and refurbished echo do...,0
406,,0,,0
1291,"item returned for repair ,receivded item back ...",0,item returned for repair receivded item back f...,0
1764,,0,,0
2558,"For the price, the product is nice quality and...",0,for the price the product is nice quality and ...,0


In [None]:
# EVAL

from sklearn.metrics import confusion_matrix

y_true = test_set_sample["label"]
y_pred = test_set_sample["pred_label"]

# Pin the label order so the matrix is always 2x2 with rows/columns ordered
# [Negative(0), Positive(1)], even if a small sample happens to contain only
# one class.
confusion_matrix(y_true, y_pred, labels=[0, 1])

array([[13,  0],
       [ 0,  7]])

### Batching API Calls: Gemini API

In [None]:
# (rows, columns) of the held-out test set.
test_set.shape

(488, 3)

In [None]:
# Draw 100 test reviews for the batched run. The seed is fixed so the same
# rows are evaluated on every re-run.
test_set_total = test_set.sample(100, random_state=42)

# Empty placeholder the model is asked to fill in.
test_set_total['pred_label'] = ''

test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
1549,"Very intrusive, so I turn it off",0,very intrusive so turn it off,
1036,Alexa hardly came on..,0,alexa hardly came on,
2706,Great to play music throughout the house. Grea...,1,great to play music throughout the house great...,
1311,"LOVE love love this! I have an echo, and seve...",1,love love love this have an echo and several e...,
2390,I would not recommend this to anyone. It won't...,0,would not recommend this to anyone it won load...,
...,...,...,...,...
2621,Love it,1,love it,
618,Extremely low in volume,0,extremely low in volume,
1628,Echo Show is perfect for staging in the kitche...,1,echo show is perfect for staging in the kitche...,
1013,My wife & I enjoy the ease of playing our mus...,1,my wife enjoy the ease of playing our music th...,


In [None]:
batch_size = 25

# Split test_set_total into consecutive positional slices of at most
# batch_size rows each; the last slice may be shorter.
batches = [
    test_set_total.iloc[start : start + batch_size]
    for start in range(0, len(test_set_total), batch_size)
]

In [None]:
import time

def gemini_completion_function(batch,current_batch,total_batch,
                               delay_seconds=5,verbose=True,llm=None):
  """Classify one batch of reviews with a single generate_content request.

  Works in three steps:
    1. Serialize the batch's 'clean_reviews' and 'pred_label' columns to a
       JSON array of records.
    2. Embed that JSON in the classification prompt.
    3. Send the prompt, then pause before returning.

  Args:
    batch: DataFrame containing 'clean_reviews' and 'pred_label' columns.
    current_batch: Zero-based index of this batch; only used in the progress
      message (shown one-based).
    total_batch: Total number of batches; only used in the progress message.
    delay_seconds: Seconds to sleep after the request. Defaults to 5, the
      previously hard-coded value. NOTE(review): presumably there to stay
      under the API rate limit -- confirm against the current quota.
    verbose: When True (default, the previous behaviour) the full prompt is
      printed; pass False to keep the cell output short.
    llm: Object exposing generate_content(prompt). Defaults to the
      notebook-level `model`; pass a stub to exercise the function offline.

  Returns:
    Whatever generate_content() returns, unmodified.
  """

  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  {json_data}
  ```
  """

  if verbose:
    print(prompt)

  # Resolve the client lazily so the global `model` is only needed when no
  # explicit client was supplied.
  client = llm if llm is not None else model
  response = client.generate_content(prompt)
  time.sleep(delay_seconds)

  return response

In [None]:
batch_count = len(batches)
responses = []

# One request per batch, in order, so responses[k] belongs to batches[k].
# Appending inside the loop keeps earlier responses if a later call fails.
for batch_index, batch in enumerate(batches):
  responses.append(gemini_completion_function(batch, batch_index, batch_count))

Now processing batch#: 1 of 4
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  [{"clean_reviews":"very intrusive so turn it off","pred_label":""},{"clean_reviews":"alexa hardly came on","pred_label":""},{"clean_reviews":"great to play music throughout the house great and easily fits in my decor of my home","pred_label":""},{"clean_reviews":"love love love this have an echo and several echo dots but love t

In [None]:
import json
import re

# Parse each batch reply into its own DataFrame and concatenate once at the
# end; calling pd.concat inside the loop re-copies all earlier rows on every
# iteration.
batch_frames = []

for response in responses:
    # The reply is expected to be a JSON array inside a Markdown fence. The
    # fence may carry a language tag (```json) that strip("`") would leave in
    # place, so take everything from the first '[' to the last ']' instead.
    raw_reply = response.text
    array_match = re.search(r"\[.*\]", raw_reply, flags=re.DOTALL)
    if array_match is None:
        raise ValueError(f"No JSON array found in model reply: {raw_reply[:200]!r}")

    # Dedicated name: the raw DataFrame `data` from the loading cell is left alone.
    batch_records = json.loads(array_match.group(0))
    batch_frames.append(pd.DataFrame(batch_records))

# ignore_index gives a clean 0..n-1 index across batches; an empty frame is
# kept as the result when there were no responses at all.
df_total = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

print(df_total)


                                        clean_reviews  pred_label
0                       very intrusive so turn it off           0
1                                alexa hardly came on           0
2   great to play music throughout the house great...           1
3   love love love this have an echo and several e...           1
4   would not recommend this to anyone it won load...           0
..                                                ...         ...
95                                            love it           1
96                            extremely low in volume           0
97  echo show is perfect for staging in the kitche...           1
98  my wife enjoy the ease of playing our music th...           1
99  este producto llegó la semana se quedó sin olo...           0

[100 rows x 2 columns]


In [None]:
# Copy the model's predictions from df_total into test_set_total.
# The assignment is positional (.values ignores the index), so make sure the
# batches together returned exactly one record per review before trusting
# the alignment.
assert len(df_total) == len(test_set_total), (
    f"Model returned {len(df_total)} records for {len(test_set_total)} reviews"
)

# Cast to int so labels the model returned as strings ("1"/"0") compare
# correctly with the integer ground truth; anything else fails loudly here.
test_set_total['pred_label'] = df_total['pred_label'].astype(int).values
test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
1549,"Very intrusive, so I turn it off",0,very intrusive so turn it off,0
1036,Alexa hardly came on..,0,alexa hardly came on,0
2706,Great to play music throughout the house. Grea...,1,great to play music throughout the house great...,1
1311,"LOVE love love this! I have an echo, and seve...",1,love love love this have an echo and several e...,1
2390,I would not recommend this to anyone. It won't...,0,would not recommend this to anyone it won load...,0
...,...,...,...,...
2621,Love it,1,love it,1
618,Extremely low in volume,0,extremely low in volume,0
1628,Echo Show is perfect for staging in the kitche...,1,echo show is perfect for staging in the kitche...,1
1013,My wife & I enjoy the ease of playing our mus...,1,my wife enjoy the ease of playing our music th...,1


In [None]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

# Pin the label order so rows/columns are always [Negative(0), Positive(1)]
# and the matrix stays 2x2 regardless of which classes appear.
confusion_matrix(y_true, y_pred, labels=[0, 1])

array([[48,  1],
       [ 3, 48]])

In [None]:
# Fraction of predictions that match the true labels, using the y_true and
# y_pred assigned in the preceding evaluation cell.
accuracy_score(y_true, y_pred)

0.96