# Data Configuration
---------------------------------------------------

## Planning :

   1. First read out all available data/parquet files
   2. Using `Gemini` manipulate the data
      
      2.1  Select one of the lines from the human-generated data and rewrite it using `Gemini`; this is how we create a combination of LLM-generated and human-generated text.

      2.2 Using the defined function, we count the number of words generated by the `LLM (Gemini)`; the word count of the `Human` text is already available.

      2.3 A new column will be storing percentage of `LLM` generated text, using the formula:
          
          llm_generated_perc = word count for llm generated data / total word count in the text

      2.4 Make sure to repeat the same function for different numbers of lines.
      Example: first we regenerate one line of the human-generated text, then 2 lines, then 3 lines, and so on. This helps to enrich the data for the model.

   3. Finally summing up all functions to create the final dataset.

   





In [None]:
%%capture
pip install fastparquet

In [None]:

%%capture
! pip3 install --upgrade --user google-cloud-aiplatform

In [None]:
## Vertex AI environment setup: bind the SDK to a Google Cloud project and region.
# NOTE(review): the cells below authenticate through `google.generativeai` with an
# API key; nothing visible in this notebook uses the Vertex AI session -- confirm
# this initialisation is still required.
import vertexai

# NOTE(review): Google Cloud project IDs normally contain only lowercase letters,
# digits and hyphens; this value looks like a display name -- confirm the real ID.
PROJECT_ID = 'BTP_LLM_Generated_Text_Detection'
LOCATION = 'us-central1'

'''
Gemini Enviroment Setup
'''

# Initialise the SDK with the project and location defined above.
vertexai.init(project = PROJECT_ID,
              location = LOCATION)


In [None]:
## importing important libraries

# from vertexai.generative_models import GenerationConfig, GenerativeModel, Image, Part

import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')

In [None]:
## Read the Gemini API key from the user and finish configuring GenAI.
# getpass() is used instead of input() so the secret is not echoed into the
# cell output, where it would be saved (and shared) with the notebook.
from getpass import getpass

# strip() removes the stray whitespace/newline that often comes with a pasted key.
GOOGLE_API_KEY = getpass('GOOGLE_API_KEY').strip()
if not GOOGLE_API_KEY:
    # Fail here rather than on the first model call with an unusable key.
    raise ValueError('GOOGLE_API_KEY must not be empty')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
# genai


## Instantiate the Gemini text model; `rewrite_with_gemini` below reads it
## through the module-level name `model`.

model = genai.GenerativeModel('models/gemini-pro')
model  # bare expression so the notebook displays the model object

In [None]:
## creating function to show the data

def to_markdown(text):
  """Wrap ``text`` in a Markdown block quote for notebook display.

  Bullet characters ('•') are first turned into Markdown list markers
  ('  *'); every line, blank ones included, is then prefixed with '> '.
  Returns an ``IPython.display.Markdown`` object.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

## Generating New Text
-----------------------------------

----------------------------------------------------

**Text Parameters**

  1. `prompt` : Text input to generate model response. [Text]
  2. `temperature` : The temperature is used for sampling during response generation, which occurs when `topK` and `topP` are applied. Temperature controls the randomness of token selection.
      
      * Lower temperatures are good for less creative or less open-ended results.
      * Higher temperature can lead to more diverse or creative results.
      
      A `temperature` value of 0 means that the highest-probability token is always selected.
      
       **[0.0 - 1.0]** *Default : 0.0*

  3. `max_output_tokens` : Maximum number of tokens that can be generated in the response.
     
      * A token is smaller than a word. A token mostly contains about 4 characters; 100 tokens correspond to roughly 60-80 words. [1 - 1024] Default : 0
  
  4. `topK` : Top-K changes how the model selects tokens for output.

      * A topK value of 1 means the selected token is the most probable among all the tokens available in the model vocabulary. {greedy decoding}
      * A topK value of 3 means the next token is selected from among 3 most probable tokens. {using `temperature`}

      * For each token selection step, the top-K most probable tokens are sampled. These tokens are then further filtered using `topP`, and the final token is selected using temperature sampling.

       **Specify lower values of `topK` for less random response and higher value for random responses.** [1 - 40] Default : 40

  5. `topP` : Top-P changes how the model selects tokens for output.

     * Tokens are selected from the most probable to the least probable until the sum of their probabilities equals the top-P value.

     * For example, suppose three tokens *A*, *B* and *C* have probabilities 0.3, 0.2 and 0.1, and the predefined topP value is 0.5. The model will then select either *A* or *B* as the next token, using the temperature setting, and exclude *C*.

      **Specify a lower value for less random responses and higher value for more random responses.** [0.0 - 1.0] Default : 0.95

  



## Generating New Content
---------------------------------

In [None]:
## Parquet sources stored on Drive.

data_path = [
    "/content/drive/MyDrive/BTP 8th SEM/Data/Dataset/DATASET_FROM_JSONL.parquet",
    "/content/drive/MyDrive/BTP 8th SEM/Data/Dataset/DeepfakeTextDetect.parquet",
    "/content/drive/MyDrive/BTP 8th SEM/Data/Dataset/Hello_simpleAI.parquet",
]

# Only the first file is loaded; the remaining paths are listed but not read.
path = data_path[0]
dataset_from_jsonl = pd.read_parquet(path, engine='fastparquet')

In [None]:
## reading the data

dataset_from_jsonl.head(2)

In [None]:
## Keep only the rows whose `label` column equals 'human'.
# The selection keeps the index labels of `dataset_from_jsonl` (no reset_index).

is_human_row = dataset_from_jsonl['label'] == 'human'
human_generated_jsonl_df = dataset_from_jsonl[is_human_row]

In [None]:
human_generated_jsonl_df.head(2)

In [None]:
## Average number of sentences per human-written text.
# NOTE: despite the printed label, the value is a sentence *count* per text
# (as split by nltk.sent_tokenize), not a length in words or characters.

human_texts = human_generated_jsonl_df['text']
sentence_total = sum(len(nltk.sent_tokenize(text)) for text in human_texts)
# Guard the division: an empty selection would otherwise raise ZeroDivisionError.
avg_text_len = sentence_total / len(human_texts) if len(human_texts) else 0.0

print(f"Average Sentence length : {avg_text_len}")

In [None]:
## writing a function

## defining a function to write content using gemini




In [None]:
import random

# Function to rewrite a line from Human Generated data using Gemini
def rewrite_with_gemini(text, target_length):
    """Paraphrase ``text`` with Gemini and keep at most ``target_length`` sentences.

    The text is sent to the module-level ``model`` together with an explicit
    rewriting instruction; without one the model answers or continues the
    sentence instead of paraphrasing it.

    Args:
        text: Human-written sentence (or short passage) to rewrite.
        target_length: Maximum number of sentences to keep from the reply.

    Returns:
        The rewritten sentences joined by single spaces, without padding or
        trailing whitespace. An empty string when ``text`` is blank or
        ``target_length`` is not positive.

    Raises:
        ValueError: Propagated from ``response.text`` when the reply carries
            no usable text. NOTE(review): presumably the blocked/empty
            candidate case -- confirm against the SDK documentation.
    """
    # Nothing to rewrite: avoid sending an empty prompt to the API.
    if not text or not text.strip():
        return ''

    prompt = (
        'Rewrite the following sentence in your own words, keeping its meaning. '
        'Reply with the rewritten sentence only.\n\n'
        f'{text.strip()}'
    )

    # NOTE(review): max_output_tokens=20 can cut longer rewrites mid-sentence;
    # the sampling values are kept as originally tuned.
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            candidate_count=1,
            stop_sequences=['.'],
            max_output_tokens=20,
            top_p=0.7,
            top_k=4,
            temperature=0.7,
        ),
    )
    rewritten_text = response.text.strip()

    # The '.' stop sequence is not included in the reply, so restore the
    # terminal period; otherwise the sentence fuses with the next one when
    # the caller joins sentences back into a text.
    if rewritten_text and rewritten_text[-1] not in '.!?':
        rewritten_text += '.'

    rewritten_sentences = nltk.sent_tokenize(rewritten_text)

    # Only trim. Padding with empty strings cannot add sentences; it merely
    # appended trailing spaces to the joined result.
    return ' '.join(rewritten_sentences[:max(target_length, 0)])

# Function to perform all tasks
def generate_final_dataset(dataframe):
    """Build mixed human/LLM texts from every human-labelled row of ``dataframe``.

    For a text with ``n`` sentences, ``n`` variants are produced: the k-th
    variant has ``k`` randomly chosen sentences replaced by their Gemini
    rewrite while all other sentences stay untouched. Sentences keep their
    original position, so each variant remains a coherent text.

    Args:
        dataframe: DataFrame with a ``label`` column (rows equal to
            ``'human'`` are used) and a ``text`` column.

    Returns:
        A list of dicts with the keys ``'original_text'`` and
        ``'rewritten_text'``, one per generated variant.
    """
    human_texts = dataframe.loc[dataframe['label'] == 'human', 'text']

    final_dataset = []

    for original_text in human_texts:
        # Missing values (e.g. NaN) cannot be sentence-tokenized; skip them.
        if not isinstance(original_text, str):
            continue

        sentences = nltk.sent_tokenize(original_text)

        # Random order in which sentences get replaced. Only the *choice* of
        # sentence is random; the sentence order of the text is preserved.
        rewrite_order = random.sample(range(len(sentences)), len(sentences))

        mixed_sentences = list(sentences)
        for position in rewrite_order:
            # Each sentence is rewritten exactly once and reused by all later
            # variants: n API calls per text instead of n * (n + 1) / 2.
            # A single sentence is requested back, hence target length 1.
            mixed_sentences[position] = rewrite_with_gemini(sentences[position], 1)

            final_dataset.append({
                'original_text': original_text,
                # Empty rewrites are dropped so no double spaces appear.
                'rewritten_text': ' '.join(s for s in mixed_sentences if s),
            })

    return final_dataset



In [None]:
## applying the function to create final data set

final_dataset = generate_final_dataset(dataset_from_jsonl)

In [None]:
## Preview of the first 10 rows without the `id` column.
# drop() returns a new frame here instead of mutating a slice of
# `dataset_from_jsonl` in place, which triggers pandas' SettingWithCopyWarning.
data = dataset_from_jsonl[:10].drop(columns='id')
data

## Practice Gemini
-------------------

In [None]:
## Vertex AI environment setup (same statements as the setup cell near the top
## of the notebook): bind the SDK to a Google Cloud project and region.
import vertexai

# NOTE(review): Google Cloud project IDs normally contain only lowercase letters,
# digits and hyphens; this value looks like a display name -- confirm the real ID.
PROJECT_ID = 'BTP_LLM_Generated_Text_Detection'
LOCATION = 'us-central1'

'''
Gemini Enviroment Setup
'''

# Initialise the SDK with the project and location defined above.
vertexai.init(project = PROJECT_ID,
              location = LOCATION)

In [None]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Return ``text`` as an ``IPython.display.Markdown`` block quote.

  '•' bullets become Markdown list markers ('  *') and every line,
  including blank ones, receives a leading '> '.
  """
  with_list_markers = text.replace('•', '  *')
  # splitlines(True) keeps the line endings, so joining restores the text.
  quoted_lines = ['> ' + line for line in with_list_markers.splitlines(True)]
  return Markdown(''.join(quoted_lines))

In [None]:
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata

# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
## Print the name of every available model that supports `generateContent`.

for model_info in genai.list_models():
    if 'generateContent' not in model_info.supported_generation_methods:
        continue
    print(model_info.name)

In [None]:
model = genai.GenerativeModel('models/gemini-1.0-pro')
model

In [None]:
response = model.generate_content("Please provide a list of the most influential people in the world.")

print(response.text)

In [None]:
response.candidates

In [None]:
from IPython.display import Markdown

response = model.generate_content("Build a simple Python web application.")

Markdown(response.text)

### Streaming

In [None]:
from IPython.display import Markdown

# Rebinds the module-level `model`, so later cells use this model instance.
model = genai.GenerativeModel("models/gemini-pro")
response = model.generate_content(
    "How can I make authentic Indian Biriyani",
    stream=True,
)

# With stream=True the response is iterated chunk by chunk; each chunk is
# rendered as soon as it arrives, followed by an 80-underscore separator.
chunk_separator = "_" * 80
for chunk in response:
  display(Markdown(chunk.text))
  display(Markdown(chunk_separator))

## Fine Tuning the responses
-------------------

In [None]:
# One request with explicit sampling settings: a single candidate, moderate
# randomness, stop at the first period, at most 100 output tokens.
demo_config = genai.types.GenerationConfig(
    candidate_count=1,
    temperature=0.7,
    stop_sequences=["."],
    max_output_tokens=100,
)
response = model.generate_content(
    "I'm reaching out due to your industry expertise and connection with Urban Company.",
    generation_config=demo_config,
)

# Render the reply as Markdown; the bare expression displays it in the notebook.
result = Markdown(response.text)
result

In [None]:
def generate_content(sentence):
    """Send ``sentence`` to the module-level Gemini ``model`` and return its reply text.

    The request uses temperature 0.7, stops at the first '.', and allows at
    most 100 output tokens.

    Args:
        sentence: Prompt text passed to the model unchanged.

    Returns:
        The ``text`` attribute of the model response.
    """
    sampling = genai.types.GenerationConfig(
        temperature=0.7,
        stop_sequences=["."],
        max_output_tokens=100,
    )
    reply = model.generate_content(sentence, generation_config=sampling)
    return reply.text

In [None]:
generate_content("I am a good boy")

In [None]:
human_generated_jsonl_df.columns
# human_generated_jsonl_df.label.value_counts()

In [None]:
## Trial run: send each sentence of the first human-written text through Gemini.

# head(1) selects the first row by position. Indexing ['text'][0] instead looks
# up the index *label* 0, which raises KeyError when the filtered frame does not
# contain that label. head(1) also makes the "first text only" intent explicit.
for text in human_generated_jsonl_df['text'].head(1):
  for sentence in text.split("."):
    # split(".") leaves blank pieces (e.g. after the final period); skip them
    # rather than sending an empty prompt to the model.
    if not sentence.strip():
      continue
    print(sentence)
    processed_sentence = generate_content(sentence)
    print(processed_sentence)

In [None]:
human_generated_jsonl_df.loc[0, 'text']

In [None]:
# Iterate over the texts themselves: ['text'][row] is a label lookup and raises
# KeyError on a filtered frame whose index is no longer 0..n-1.
# Each pass overwrites `sentences`, so only the split of the last text remains.
for text in human_generated_jsonl_df['text']:
  sentences = text.split('. ')


In [None]:
human_generated_jsonl_df['text'][0]

In [None]:
def split_sentences_to_dataframe(human_generated_jsonl_df):
    """Split every text on '. ' and spread the pieces over ``split_<i>`` columns.

    Args:
        human_generated_jsonl_df: DataFrame with a ``text`` column of strings.
            Its index may be arbitrary (e.g. left over from filtering); rows
            are processed by position.

    Returns:
        A DataFrame with one row per input row (index 0..n-1) and columns
        ``split_0`` .. ``split_<k-1>``, where ``k`` is the largest number of
        pieces found in any text. Rows with fewer pieces are padded with ``""``.
        An empty input yields an empty DataFrame.
    """
    # Iterate over the values rather than indexing ['text'][row]: that is a
    # label lookup and raises KeyError when the index is not 0..n-1.
    split_rows = [text.split('. ') for text in human_generated_jsonl_df['text']]

    width = max((len(parts) for parts in split_rows), default=0)
    columns = [f'split_{i}' for i in range(width)]

    # Pad every row to the same width with "" so missing pieces are filled
    # consistently, then build the frame in a single pass.
    padded_rows = [parts + [''] * (width - len(parts)) for parts in split_rows]
    return pd.DataFrame(padded_rows, columns=columns)

In [None]:
# human_generated_jsonl_df = pd.read_json('your_jsonl_file.jsonl', lines=True)
split_sentences_df = split_sentences_to_dataframe(human_generated_jsonl_df)