<a href="https://colab.research.google.com/github/ShirinTahmasebi/KTH-ID2223/blob/main/1_obtain_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%capture
! pip install transformers
! pip install openai

In [3]:
DATASET_PATH = '/content/drive/MyDrive/Embedding with GPT-3/input/Reviews.csv'
OPEN_AI_API_KEY = '/content/drive/MyDrive/Embedding with GPT-3/input/openai_api_key.txt'
OPEN_AI_REQUEST_LIMIT = 50

In [4]:
import pandas as pd

## 1. Load the dataset

df = pd.read_csv(DATASET_PATH, index_col=0)
df = df[['Time', 'ProductId', 'UserId', 'Score', 'Summary', 'Text']]
df = df.dropna()
df['combined'] = "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
df.head(2)

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [5]:
# Subsample to 1k most recent reviews and remove samples that are too long
df = df.sort_values('Time').tail(1_100)
df.drop('Time', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [6]:
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

In [7]:
# Remove reviews that are too long
df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))
df = df[df.n_tokens<2000].tail(1_000)
len(df)

1000

In [10]:
### 2. Get embeddings and save them for future reuse

#%%
import openai
from openai.embeddings_utils import get_embedding

openai.api_key = open(OPEN_AI_API_KEY).readlines()[0]

In [33]:
engines = \
[
  'text-similarity-davinci-001',
  # 'text-similarity-ada-001',
  'text-similarity-babbage-001',
  'text-similarity-curie-001',
  # 'code-search-ada-code-001',
  # 'code-search-babbage-code-001'
]

In [12]:
result_column_names = []

for item in engines:
  result_column_names.append('embedding_' + item)
  result_column_names.append('reduced_x_' + item)
  result_column_names.append('reduced_y_' + item)

In [13]:
def create_embedding_for_engine(engine:str, last_query_time=0):
  import time
  import math
  from datetime import datetime

  expected_number_of_iterations = math.ceil(len(df) / OPEN_AI_REQUEST_LIMIT)

  result_series = pd.Series([])

  for i in range(expected_number_of_iterations):
    while last_query_time and time.time() - last_query_time < 60:
      continue

    lowerbound_index = i * OPEN_AI_REQUEST_LIMIT
    upperbound_index = lowerbound_index + OPEN_AI_REQUEST_LIMIT

    print('Current time is: ', datetime.now().strftime('%H:%M:%S'), ' - From [{},{})'.format(lowerbound_index, upperbound_index))
    temp_result_series = df.iloc[lowerbound_index: upperbound_index].combined.apply(lambda x: get_embedding(x, engine=engine))
    result_series = result_series.append(temp_result_series)

    last_query_time = time.time()
  
  return result_series, last_query_time

In [29]:
df_result = pd.DataFrame(columns=result_column_names)

In [34]:
for item in engines:
  # df_result['embedding_' + item], last_query_time = create_embedding_for_engine(item, last_query_time)
  df_result['embedding_' + item] = df.combined.apply(lambda x: get_embedding(x, engine=item))
  # break

RetryError: ignored

In [35]:
df_result

Unnamed: 0,embedding_text-similarity-davinci-001,reduced_x_text-similarity-davinci-001,reduced_y_text-similarity-davinci-001,embedding_text-similarity-ada-001,reduced_x_text-similarity-ada-001,reduced_y_text-similarity-ada-001,embedding_text-similarity-babbage-001,reduced_x_text-similarity-babbage-001,reduced_y_text-similarity-babbage-001,embedding_text-similarity-curie-001,reduced_x_text-similarity-curie-001,reduced_y_text-similarity-curie-001,embedding_code-search-ada-code-001,reduced_x_code-search-ada-code-001,reduced_y_code-search-ada-code-001,embedding_code-search-babbage-code-001,reduced_x_code-search-babbage-code-001,reduced_y_code-search-babbage-code-001
284932,"[-0.005729738622903824, 0.002782381372526288, ...",,,"[0.01860060915350914, -0.015613649971783161, -...",,,,,,,,,,,,,,
220697,"[-0.015217021107673645, 0.007462559267878532, ...",,,"[0.018258072435855865, 0.03005059063434601, -0...",,,,,,,,,,,,,,
107908,"[-0.004584628622978926, 0.01346516516059637, -...",,,"[-0.0005673975683748722, -0.012302356772124767...",,,,,,,,,,,,,,
107800,"[0.0012591804843395948, 0.00389345595613122, -...",,,"[0.01373705267906189, 0.053661659359931946, -0...",,,,,,,,,,,,,,
205313,"[-0.0032210019417107105, 0.0040468997322022915...",,,"[0.021258626133203506, 0.007491868454962969, -...",,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7178,"[-0.0005286579835228622, 0.0024464321322739124...",,,"[-0.016586609184741974, 0.03342374041676521, -...",,,,,,,,,,,,,,
401972,"[-0.0002831067831721157, -0.003314807778224349...",,,"[-0.016825618222355843, -0.00798442680388689, ...",,,,,,,,,,,,,,
462088,"[-0.0036534518003463745, 0.0004380333120934665...",,,"[0.00622323714196682, 0.008891819044947624, -0...",,,,,,,,,,,,,,
267549,"[-0.005907019134610891, 0.0012909695506095886,...",,,"[0.012264199554920197, 0.022083846852183342, -...",,,,,,,,,,,,,,


In [36]:
df_result['label'] = df['Score']

In [37]:
import os

def check_dir(file_name):
    directory = os.path.dirname(file_name)
    if not os.path.exists(directory):
        os.makedirs(directory)

In [38]:
check_dir('output/result_Reviews.csv')
df_result.to_csv('output/result_Reviews.csv')

In [39]:
import os
os.system('cp -r "/content/output/" "/content/drive/MyDrive/Embedding with GPT-3/"')

0

In [40]:
df_result

Unnamed: 0,embedding_text-similarity-davinci-001,reduced_x_text-similarity-davinci-001,reduced_y_text-similarity-davinci-001,embedding_text-similarity-ada-001,reduced_x_text-similarity-ada-001,reduced_y_text-similarity-ada-001,embedding_text-similarity-babbage-001,reduced_x_text-similarity-babbage-001,reduced_y_text-similarity-babbage-001,embedding_text-similarity-curie-001,reduced_x_text-similarity-curie-001,reduced_y_text-similarity-curie-001,embedding_code-search-ada-code-001,reduced_x_code-search-ada-code-001,reduced_y_code-search-ada-code-001,embedding_code-search-babbage-code-001,reduced_x_code-search-babbage-code-001,reduced_y_code-search-babbage-code-001,label
284932,"[-0.005729738622903824, 0.002782381372526288, ...",,,"[0.01860060915350914, -0.015613649971783161, -...",,,,,,,,,,,,,,,5
220697,"[-0.015217021107673645, 0.007462559267878532, ...",,,"[0.018258072435855865, 0.03005059063434601, -0...",,,,,,,,,,,,,,,1
107908,"[-0.004584628622978926, 0.01346516516059637, -...",,,"[-0.0005673975683748722, -0.012302356772124767...",,,,,,,,,,,,,,,4
107800,"[0.0012591804843395948, 0.00389345595613122, -...",,,"[0.01373705267906189, 0.053661659359931946, -0...",,,,,,,,,,,,,,,3
205313,"[-0.0032210019417107105, 0.0040468997322022915...",,,"[0.021258626133203506, 0.007491868454962969, -...",,,,,,,,,,,,,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7178,"[-0.0005286579835228622, 0.0024464321322739124...",,,"[-0.016586609184741974, 0.03342374041676521, -...",,,,,,,,,,,,,,,5
401972,"[-0.0002831067831721157, -0.003314807778224349...",,,"[-0.016825618222355843, -0.00798442680388689, ...",,,,,,,,,,,,,,,5
462088,"[-0.0036534518003463745, 0.0004380333120934665...",,,"[0.00622323714196682, 0.008891819044947624, -0...",,,,,,,,,,,,,,,5
267549,"[-0.005907019134610891, 0.0012909695506095886,...",,,"[0.012264199554920197, 0.022083846852183342, -...",,,,,,,,,,,,,,,5
