In [1]:
import openai
import pandas as pd
import os
import re
from tenacity import retry, wait_random_exponential, stop_after_attempt
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

openai_api_key = "key"

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
df = pd.read_csv('/content/drive/MyDrive/NLP/Handin/Data/df_reviews_final_binary.csv')

In [3]:
df['sentiment'] = (df.review_rating == 'positive').astype('int')
df['reviews'] = df['review_comments']
df = df[['reviews', 'sentiment']]
df = pd.concat((df[df.sentiment == 1].sample(970), df[df.sentiment == 0].sample(30)))

In [22]:
DIR_PATH = "/content/drive/MyDrive/NLP/Handin/Data"
CSV_FILE = os.path.join(DIR_PATH, 'gpt_responses.csv')
def get_last_saved_index():
    if os.path.isfile(CSV_FILE):
        df = pd.read_csv(CSV_FILE)
        return df.shape[0]
    return 0
def process_with_retry():
    openai.api_key = openai_api_key
    last_index = get_last_saved_index()
    if last_index > 0:
        df_gpt_response = pd.read_csv(CSV_FILE)
    else:
        df_gpt_response = pd.DataFrame(columns=['review_comment', 'predicted_rating', 'actual_rating'])
    for i, (comment, actual_rating) in enumerate(tqdm(zip(df['reviews'][last_index:], df['sentiment'][last_index:]), desc="Processing reviews", unit="review")):
        response = openai.Completion.create(
            model="gpt-3.5-turbo-instruct",
            prompt=f"""Given the following Airbnb review, predict the reviewer's rating as 1 or 0,
            where 1 is positive and 0 is the negative. Provide your answer as only an integer.
            Here is the review: '{comment}' """,
            temperature=0,
            max_tokens=60,
            logprobs=10,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=0.0
        )
        try:
            predicted_rating = int(response.choices[0].text.strip())
            df_gpt_response = pd.concat([df_gpt_response, pd.DataFrame([{'review_comment': comment, 'predicted_rating': predicted_rating, 'actual_rating': actual_rating}])], ignore_index=True)
            df_gpt_response.to_csv(CSV_FILE, index=False)
        except ValueError:
            print(f"Error: Invalid response for comment - '{comment}'")
    return df_gpt_response
result_df = process_with_retry()
print(result_df)

Processing reviews: 366review [01:48,  2.22review/s]

Error: Invalid response for comment - 'great spot in '


Processing reviews: 451review [02:12,  2.24review/s]

Error: Invalid response for comment - 'top'


Processing reviews: 485review [02:22,  1.49review/s]

Error: Invalid response for comment - ' apartment'


Processing reviews: 549review [02:41,  2.11review/s]

Error: Invalid response for comment - 'war top'


Processing reviews: 574review [02:49,  1.81review/s]

Error: Invalid response for comment - 'cool'


Processing reviews: 636review [03:08,  2.00review/s]

Error: Invalid response for comment - 'great stay in '


Processing reviews: 1000review [04:57,  3.36review/s]

                                        review_comment predicted_rating  \
0    a beautiful apartment great shower gets hot qu...                0   
1    great apartment in wonderful location of  lots...                1   
2     is a welcoming and very obliging host this is...                1   
3       was fantastic to stay at an easy walk to ev...                1   
4     s home is absolutely beautiful the photos don...                1   
..                                                 ...              ...   
989  host was gracious to meet us and let us in at ...                0   
990     in le marais was good however it is a very ...                0   
991  do not stay here false advertising filthy apar...                0   
992  to avoid the host was not reliable he cancelle...                0   

    actual_rating  
0               1  
1               1  
2               1  
3               1  
4               1  
..            ...  
989             0  
990           




In [24]:
from sklearn.metrics import accuracy_score, classification_report
df_gpt_response = pd.read_csv(CSV_FILE)
predicted_ratings = df_gpt_response['predicted_rating']
actual_ratings = df_gpt_response['actual_rating']
accuracy = accuracy_score(actual_ratings, predicted_ratings)
print(f"Overall Accuracy: {accuracy}")
classification_metrics = classification_report(actual_ratings, predicted_ratings, digits = 5)
print(classification_metrics)

Overall Accuracy: 0.993963782696177
              precision    recall  f1-score   support

           0    0.85294   0.96667   0.90625        30
           1    0.99896   0.99481   0.99688       964

    accuracy                        0.99396       994
   macro avg    0.92595   0.98074   0.95157       994
weighted avg    0.99455   0.99396   0.99415       994



In [27]:
df_gpt_response.actual_rating.value_counts()

actual_rating
1    964
0     30
Name: count, dtype: int64

In [28]:
df_gpt_response.predicted_rating.value_counts()

predicted_rating
1    960
0     34
Name: count, dtype: int64

In [30]:
openai.api_key = openai_api_key
FP = openai.Completion.create(
  model="gpt-3.5-turbo-instruct",
  prompt="""Given the following Airbnb review, predict the reviewer's rating as 1 or 0,
            where 1 is positive and 0 is the negative. Provide your answer as only an integer.
            Here is the review: 'The flat was good, host was bad and location great' """,
  temperature=0,
  max_tokens=60,
  logprobs=10,
  top_p=1.0,
  frequency_penalty=0.0,
  presence_penalty=0.0
)
import math
def clean_number(x):
    x = re.sub("[^0-9]", "", x)
    return x
token_probas = FP["choices"][0]["logprobs"]["top_logprobs"]
probabilities = {str(k):0 for k in range(0,2)}
for token_prob in token_probas:
    for keys, probas in token_prob.items():
        num = clean_number(keys)
        if num in ["0","1"]:
            probabilities[num] += math.exp(probas)
total = sum(probabilities.values())
probabilities = {k: v / total for k, v in probabilities.items()}
print(probabilities)

{'0': 0.6034378420878418, '1': 0.3965621579121582}
