In [None]:
!pip install openai

Collecting openai
  Downloading openai-1.26.0-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 ht

In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
# Mount Google Drive into the Colab filesystem. The input and output CSV
# paths used later in this notebook live under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

In [None]:
# Fallback API key: only used when the OPENAI_API_KEY environment variable is
# not set. The value here is a blank placeholder; never commit a real key.
OPENAI_API_KEY = " "


In [None]:
import openai
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import os

# Prefer the key exported in the environment; otherwise fall back to the
# notebook-level OPENAI_API_KEY constant.
openai.api_key = (
    os.environ['OPENAI_API_KEY']
    if 'OPENAI_API_KEY' in os.environ
    else OPENAI_API_KEY
)

# Fetch the NLTK data packages, one download call per package.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to pass to lemmatize().

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased
    first letter of that tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun.
    return wordnet.NOUN

def preprocess_text(text):
    """Tokenize ``text`` and return its lemmatized tokens joined by spaces.

    Each token is lemmatized with the part of speech reported by
    ``get_wordnet_pos`` for that token.
    """
    wnl = WordNetLemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(text):
        lemmas.append(wnl.lemmatize(token, get_wordnet_pos(token)))
    return ' '.join(lemmas)

def remove_unwanted_words(text):
    """Strip boilerplate phrases from a generated summary.

    Removes every occurrence of "summary:", "the post" and "the author"
    (case-sensitive, so callers should lower-case first), collapses the
    runs of spaces left behind, and trims surrounding whitespace.

    Phrases are matched as whole words: "the post" is removed, but
    "the poster" or "the authors" are left intact instead of being cut
    down to "er" / "s".

    Args:
        text: The summary text to clean.

    Returns:
        The cleaned text.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    # Define unwanted words and phrases
    unwanted_phrases = ["summary:", "the post", "the author"]

    for phrase in unwanted_phrases:
        pattern = re.escape(phrase)
        # Only anchor on sides that are alphanumeric; "summary:" ends in
        # punctuation and must still match when a word follows directly.
        if phrase[0].isalnum():
            pattern = r"(?<!\w)" + pattern
        if phrase[-1].isalnum():
            pattern += r"(?!\w)"
        text = re.sub(pattern, "", text)

    # Removing a phrase from mid-sentence leaves two adjacent spaces.
    text = re.sub(r"[ \t]{2,}", " ", text)
    return text.strip()

# Locations of the preprocessed input posts and of the resumable output file.
input_file = '/content/drive/My Drive/Master_Thesis/translated_text_preprocessed.csv'
output_file = '/content/drive/My Drive/Master_Thesis/generated_summaries.csv'

# Abort with a real exception. `exit()` is meant for interactive shells; in a
# notebook it asks the kernel to shut down instead of cleanly stopping the
# cell at this line.
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Input CSV file not found: {input_file}")

# Resume from previously generated summaries when they exist; otherwise start
# from the input data with an empty summary column.
if os.path.exists(output_file):
    df = pd.read_csv(output_file)
    # An output file written by something else may lack the column the
    # generation loop reads and writes.
    if "generated_summary" not in df.columns:
        df["generated_summary"] = ""
else:
    df = pd.read_csv(input_file)
    df["generated_summary"] = ""  # Initialize column if not present

# Summaries generated since the last successful CSV save. Defined before the
# try block so the finally clause can always read it.
unsaved_count = 0

try:
    # Sampling settings passed to every chat-completion request.
    temperature = 0.4
    max_tokens = 350

    # Rows are processed, and the CSV checkpointed, in chunks of this size.
    batch_size = 100
    resume_index = 0

    # Pick up the positional offset left by a previous run, if any.
    if os.path.exists("resume_index.txt"):
        with open("resume_index.txt", "r") as f:
            try:
                resume_index = int(f.read())
            except ValueError:
                print("Error: Could not read resume index from 'resume_index.txt'")

    # A summary column read back from CSV with no text in it is inferred as
    # float64; cast it so string summaries can be stored in it.
    if df["generated_summary"].dtype != object:
        df["generated_summary"] = df["generated_summary"].astype(object)

    # Position (not label) of the earliest row whose request failed during
    # this run. The resume index never moves past it, so the row is retried.
    first_failed_position = None

    # Loop through data in batches
    for i in range(resume_index, len(df), batch_size):
        batch_df = df.iloc[i:i+batch_size]

        for offset, (index, row) in enumerate(batch_df.iterrows()):
            # Skip rows that already have a summary.
            existing_summary = row["generated_summary"]
            if pd.notnull(existing_summary) and existing_summary:
                continue

            # Rows with missing text (NaN) or blank text cannot be
            # summarised; skip them instead of crashing on .lower().
            post = row["translated_text"]
            if not isinstance(post, str) or not post.strip():
                print(f"Skipping index {index}: no text to summarize.")
                continue
            post = post.lower()  # Convert to lowercase

            print(f"Processing index: {index}")

            # Generate and clean the summary. Extraction happens inside the
            # try so that an empty or missing reply is handled like any other
            # per-row failure rather than aborting the whole run.
            try:
                response = openai.chat.completions.create(
                    model="gpt-3.5-turbo",
                    temperature=temperature,
                    max_tokens=max_tokens,
                    messages=[{
                        "role": "system",
                        "content": "You are an expert assistant with expertise in summarizing posts."
                    }, {
                        "role": "user",
                        "content": f"Please provide an abstractive summary of the following post and use 'I' as a subject for the summaries don't paraphrase use the same words exist in the post:\nTEXT: {post}"
                    }]
                )
                raw_summary = response.choices[0].message.content or ""
                generated_summary = remove_unwanted_words(raw_summary.lower())
                if not generated_summary:
                    raise ValueError("model returned an empty summary")
            except Exception as e:
                print(f"Error generating summary for index {index}: {e}")
                if first_failed_position is None:
                    first_failed_position = i + offset
                continue

            # Store the summary by index label.
            df.at[index, "generated_summary"] = generated_summary
            unsaved_count += 1

        # Save DataFrame to CSV after each batch
        saved = False
        try:
            df.to_csv(output_file, index=False)
            saved = True
            unsaved_count = 0
            print(f"Batch {i//batch_size + 1} done. Saved CSV file.")
        except Exception as e:
            print(f"Error saving CSV file for batch {i//batch_size + 1}: {e}")

        # Only advance the resume point once the batch is safely on disk, and
        # never beyond the first row that still needs a retry.
        if saved:
            if first_failed_position is not None:
                resume_index = first_failed_position
            else:
                resume_index = min(i + batch_size, len(df))
            with open("resume_index.txt", "w") as f:
                f.write(str(resume_index))

    print("All batches completed.")
    if first_failed_position is not None:
        print(f"Some rows failed; re-run to retry from position {first_failed_position}.")

except Exception as e:
    print("An error occurred:", e)
finally:
    # Persist summaries produced since the last checkpoint, e.g. when the
    # run is interrupted in the middle of a batch.
    if unsaved_count:
        try:
            df.to_csv(output_file, index=False)
            print(f"Saved {unsaved_count} summaries generated since the last checkpoint.")
        except Exception as e:
            print(f"Error saving CSV file: {e}")

# Print DataFrame information for debugging.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing index: 2806
Processing index: 2807
Processing index: 2808
Processing index: 2809
Processing index: 2810
Processing index: 2811
Processing index: 2812
Processing index: 2813
Processing index: 2814
Processing index: 2815
Processing index: 2816
Processing index: 2817
Processing index: 2818
Processing index: 2819
Processing index: 2820
Processing index: 2821
Processing index: 2822
Processing index: 2823
Processing index: 2824
Processing index: 2825
Processing index: 2826
Processing index: 2827
Processing index: 2828
Processing index: 2829
Processing index: 2830
Processing index: 2831
Processing index: 2832
Processing index: 2833
Processing index: 2834
Processing index: 2835
Processing index: 2836
Processing index: 2837
Processing index: 2838
Processing index: 2839
Processing index: 2840
Processing index: 2841
Processing index: 2842
Processing index: 2843
Processing index: 2844
Processing index: 2845
Processing inde