# DATASET PREPARATION
----------------------------------

### 1. Read the datasets from `parquet` files into pandas DataFrames
--------

In [None]:
%%capture
# Install fastparquet, the engine passed to the pandas parquet calls in this notebook.
# The %%capture cell magic above suppresses pip's console output.
pip install fastparquet

In [None]:
## importing necessary libraries

import pandas as pd
import numpy as np

In [None]:
## reading the data

# Folder that holds the source parquet file read below.
datafolder_path = "/content/drive/MyDrive/BTP 8th SEM/Data/Dataset"

# Build the file path from datafolder_path instead of repeating the folder literal.
file_1 = pd.read_parquet(f"{datafolder_path}/DATASET_FROM_JSONL.parquet", engine = 'fastparquet')

# A notebook cell only displays its last expression, so the label counts
# are printed explicitly; the column index is shown as the cell result.
print(file_1.label.value_counts())
file_1.columns

In [None]:
# Load DeepfakeTextDetect.parquet into a DataFrame.
file_2 = pd.read_parquet(
    "Data/Dataset/DeepfakeTextDetect.parquet",
    engine="fastparquet",
)

# Count the rows for every (label, src) combination.
file_2.value_counts(subset=['label', 'src'])

In [None]:
# Load Hello_simpleAI.parquet into a DataFrame and preview its first five rows.
file_3 = pd.read_parquet(
    "Data/Dataset/Hello_simpleAI.parquet",
    engine="fastparquet",
)
file_3.head(n=5)

In [None]:
# (number of rows, number of columns) of file_3.
file_3.shape

In [None]:
# Number of rows for each distinct value of the `source` column.
file_3['source'].value_counts()

### 2. Merging all datasets into a single dataset
--------------------

In [None]:
# Plan:
#   The final dataset has two columns: `Text` and `Label` ('Human' or 'LLMs').
#   1. DATASET_FROM_JSONL (file_1): rows whose `label` is 'human' are added with
#      Label 'Human'; rows whose `label` is 'machine' are added with Label 'LLMs'.
#   2. DeepfakeTextDetect (file_2): only `text` and `label` are used (`src` is
#      ignored). Label 1 means human-written, label 0 means machine-generated.
#   3. Hello_simpleAI (file_3): only `human_answer` (-> 'Human') and
#      `chatgpt_answer` (-> 'LLMs') are used.
#
# DataFrame.append was removed in pandas 2.0, so the labelled pieces are
# collected and joined with a single pd.concat call. The original row indices
# are kept, exactly as the chained append calls used to do.


def _labelled(texts, label):
    """Return a frame with columns Text/Label pairing every entry of *texts* with *label*."""
    return pd.DataFrame({'Text': texts, 'Label': label})


## Extracting Human and Machine data from DATASET_FROM_JSONL
human_data = file_1[file_1['label'] == 'human']['text']
machine_data = file_1[file_1['label'] == 'machine']['text']

## Extracting Human and Machine generated data from DEEPFAKETEXT_DETECT
human_fake_data = file_2[file_2['label'] == 1]
llms_fake_data = file_2[file_2['label'] == 0]

## Extracting Human and ChatGPT responses from HelloSimpleAI
human_response = file_3['human_answer']
chatgpt_response = file_3['chatgpt_answer']

## Build the final dataset; the order of the pieces matches the original appends
detectllms_data = pd.concat(
    [
        _labelled(human_data, 'Human'),
        _labelled(machine_data, 'LLMs'),
        _labelled(human_fake_data['text'], 'Human'),
        _labelled(llms_fake_data['text'], 'LLMs'),
        _labelled(human_response, 'Human'),
        _labelled(chatgpt_response, 'LLMs'),
    ]
)

In [None]:
## Sanity check: number of rows per label in the merged dataset
detectllms_data['Label'].value_counts()

In [None]:
## Drop the existing index and renumber the rows from 0, then display the frame
detectllms_data = detectllms_data.reset_index(drop=True)
detectllms_data


In [None]:
# Cast the 'Text' column to Python strings (no bytes/UTF-8 encoding happens here).
# NOTE(review): astype('str') also turns missing values into literal strings
# such as 'nan' / 'None' — confirm the column has no missing entries.
detectllms_data['Text'] = detectllms_data['Text'].astype('str')

### 3. Uploading and Saving the file for Future Use
-----------------

In [None]:
# Write the merged dataset to a parquet file, leaving the index out.
detectllms_data.to_parquet(
    "Data/Dataset/DetectLLM.parquet",
    engine="fastparquet",
    index=False,
)