In [2]:
# Load the required libraries
from openai import OpenAI
import random
import subprocess
import pathlib
from dotenv import load_dotenv
import os
import xmlrpc.client
import time
import re
import requests
import pymupdf
import base64

In [3]:
# Pull the variables defined in the local .env file into the environment,
# then build the OpenAI client from the API_KEY entry.
load_dotenv()
api_key = os.environ.get("API_KEY")
client = OpenAI(api_key=api_key)

## Generate Initial Texts

In [4]:
# System prompt sent with every generation request; it fixes the length,
# structure and output format (raw markdown) of each paper.
sys_instruction="All prompts should be answered with an in depth paper with an introduction, middle and end structured into chapters that is about 4 pages, written in markdown and include sources. Dont answer anything with less than 4 pages. Dont write anything other than the paper. Dont write ```markdown etc."
topics = ["technology", "science", "history", "art", "literature", "politics", "economics", "philosophy"]

plagiarism_score = 0
version = 1
file_path = ""
topic = ""

# Make sure the output directory exists before the first paper is written.
os.makedirs("outputs", exist_ok=True)

# Generate 30 papers, each on a randomly chosen topic, and render each to PDF.
for paper_number in range(1, 31):
    topic = random.choice(topics)
    completion = client.chat.completions.create(
        model="gpt-4o",
        store=True,
        messages=[
            {"role": "developer", "content": sys_instruction},
            {"role": "user", "content": f"Write a paper about {topic}. The specific topic is up to you."},
        ],
    )

    # Path stem shared by the markdown source and the rendered PDF.
    file_path = f"outputs/{paper_number}_{topic}_version{version}"

    try:
        # "w" rather than "a": re-running the cell must not append a second
        # paper to an existing file. UTF-8 is what pandoc expects as input.
        with open(f"{file_path}.md", "w", encoding="utf-8") as f:
            f.write(completion.choices[0].message.content)
        print('Written to file')
    except Exception as e:
        print(f"Error writing to file: {e}")
        # Without a markdown file there is nothing to convert for this paper.
        continue

    # Convert the markdown file to a PDF. Passing an argument list (no shell)
    # avoids quoting problems; the original command string had an unbalanced
    # double quote around the output path.
    command = [
        "pandoc",
        f"{file_path}.md",
        "--pdf-engine=xelatex",
        "-o",
        f"{file_path}.pdf",
    ]
    subprocess.run(command, check=True)

Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file


## Upload initial files to iThenticate

In [5]:
# Log in to the iThenticate XML-RPC API with the credentials stored in .env
# and keep the returned session id for the calls made in later cells.
load_dotenv()

url = "https://api.ithenticate.com/rpc"
server = xmlrpc.client.ServerProxy(url)

username = os.environ.get("ITHENTICATE_USERNAME")
password = os.environ.get("ITHENTICATE_PASSWORD")
credentials = dict(username=username, password=password)

response = server.login(credentials)
sid = response['sid']
# Ready-made argument struct for calls that only need the session id
sid_dict = {'sid': sid}
print(response['api_status'])

200


In [6]:
# Look up the id of the 'Trym Master Thesis' folder among the folders
# returned for this session; folder_id stays None when no folder has that name.
response = server.folder.list(sid_dict)

folder_id = next(
    (
        entry.get('id')
        for entry in response.get('folders', [])
        if entry.get('name') == 'Trym Master Thesis'
    ),
    None,
)

folder = {'folder': folder_id}
print(f"Folder ID for 'Trym Master Thesis': {folder}")

Folder ID for 'Trym Master Thesis': {'folder': 4378261}


In [7]:
# Directory that holds the generated PDFs
folder_path = 'outputs'

# One upload struct per PDF found in the directory
documents = []
for filename in os.listdir(folder_path):
    if not filename.endswith('.pdf'):
        continue

    # The document title is the file name without its extension
    title = os.path.splitext(filename)[0]

    # Every upload is attributed to the same placeholder author
    author_first = 'OpenAI'
    author_last = 'ChatGPT'

    # Wrap the raw PDF bytes in an XML-RPC binary value
    with open(os.path.join(folder_path, filename), 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()
    encoded_pdf = xmlrpc.client.Binary(pdf_bytes)

    document_data = dict(
        title=title,
        author_first=author_first,
        author_last=author_last,
        filename=filename,
        upload=encoded_pdf,
    )
    documents.append(document_data)

# Send every collected PDF to the target folder in a single request
arguments = {'sid': sid, 'folder': folder_id, 'submit_to': 1, 'uploads': documents}
response = server.document.add(arguments)
print(response['api_status'])

200


## Upload initial files to Winston AI

In [8]:
# Read the Winston AI API token (WINSTON_API) from the .env file
load_dotenv()
winston_api = os.environ.get("WINSTON_API")

In [13]:
url = "https://api.gowinston.ai/v2/plagiarism"
# Define the folder containing the PDFs
folder_path = 'outputs'

# One {"name", "score"} dict per successfully checked PDF
scores = []

# Initialize the array to hold document data (not used in this cell; kept so
# the cell's effect on the notebook namespace is unchanged)
documents = []

# The headers are identical for every request, so build them once.
headers = {
    "Authorization": f"Bearer {winston_api}",
    "Content-Type": "application/json"
}

# Sorted so the processing order does not depend on the file system.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pdf'):
        continue

    # Extract the plain text of every page; the context manager closes the
    # PDF again instead of leaving one open handle per file.
    with pymupdf.open(os.path.join(folder_path, filename)) as doc:
        text = "".join(page.get_text() for page in doc)
    title = os.path.splitext(filename)[0]

    payload = {
        "text": text,
        "language": "en",
    }

    try:
        # requests has no default timeout; without one a stalled connection
        # blocks the cell until it is interrupted by hand.
        response = requests.post(url, json=payload, headers=headers, timeout=120)
    except requests.RequestException as e:
        print(f"Request failed for {title}: {e}")
        continue

    print(response.status_code)
    # Only parse the body of successful requests; an error response is not
    # guaranteed to contain JSON.
    if response.status_code != 200:
        continue

    response_data = response.json()
    score_data = {
        "name": title,
        "score": response_data["result"]["score"]
    }
    scores.append(score_data)
    print(score_data)


200
{'name': '10_history_version1', 'score': 0}
200
{'name': '11_literature_version1', 'score': 0}
200
{'name': '12_literature_version1', 'score': 0}
200
{'name': '13_art_version1', 'score': 0}
200
{'name': '14_technology_version1', 'score': 0}
200
{'name': '15_history_version1', 'score': 0}
200
{'name': '15_history_version2', 'score': 0}
200
{'name': '16_history_version1', 'score': 0}
200
{'name': '17_economics_version1', 'score': 4}
200
{'name': '17_economics_version2', 'score': 0}
200
{'name': '18_literature_version1', 'score': 0}
200
{'name': '19_technology_version1', 'score': 0}
200
{'name': '19_technology_version2', 'score': 0}
200
{'name': '19_technology_version3', 'score': 0}
200
{'name': '1_literature_version1', 'score': 0}
200
{'name': '20_history_version1', 'score': 0}
200
{'name': '21_economics_version1', 'score': 0}
200
{'name': '22_economics_version1', 'score': 0}
200
{'name': '23_literature_version1', 'score': 0}
200
{'name': '24_science_version1', 'score': 0}
200
{'name

KeyboardInterrupt: 

## Rerun initial files if plagiarism score is >= 10 in iThenticate

In [10]:
# Fetch the folder contents and collect the ids of the documents whose
# author_last field is "ChatGPT".
arguments = {'sid': sid, 'id': folder_id}
response = server.folder.get(arguments)

document_ids = []
for entry in response['documents']:
    if entry.get('author_last') == "ChatGPT":
        document_ids.append(entry['id'])

print(document_ids)

[115565636, 115565649, 115565641, 115565640, 115565646, 115565637, 115565639, 115565651, 115565643, 115565627, 115565650, 115565647, 115565642, 115565633, 115565645, 115565648, 115565628, 115565629, 115565635, 115565638, 115565644, 115565626, 115565632, 115565624, 115565631]


In [11]:
# Rewrite every paper whose iThenticate score is at or above the threshold.
# Each rewrite is rendered to PDF, uploaded, and re-scored; the loop stops as
# soon as the score drops below the threshold, an error occurs, or the
# attempt limit is reached.

# Upper bound on rewrite attempts per paper (the original loop ran 9 times)
MAX_REWRITES = 9
# Papers scoring at or above this percentage are rewritten
PLAGIARISM_THRESHOLD = 10
# Seconds to wait after an upload before requesting the score
REPORT_WAIT_SECONDS = 20

# Titles follow <paper number>_<topic>_version<n>
title_pattern = re.compile(r'^(\d+)_(.+?)_version(\d+)$')

# System prompt used for all rewrite requests
sys_instruction="All prompts should be answered with an in depth paper with an introduction, middle and end structured into chapters that is about 4 pages, written in markdown and include sources. Dont answer anything with less than 4 pages. Dont write anything other than the paper. The paper should be about the same topic as the previous one. With as few changes as possible."

for document_id in document_ids:
    arguments = dict(sid=sid, id=document_id)
    response = server.document.get(arguments)
    plagiarism_score = response['documents'][0]['parts'][0]['score']
    document_title = response['documents'][0]['title']

    # Extract paper number, topic and current version from the title. The
    # version is parsed in full so that version 10 and above work, and it is
    # the starting point for numbering the rewrites.
    match = title_pattern.match(document_title)
    if match is None:
        print(f"Skipping {document_title}: title does not match <number>_<topic>_version<n>")
        continue
    paper_number = match.group(1)
    topic = match.group(2)
    paper_version = int(match.group(3))

    for attempt in range(MAX_REWRITES):
        # NOTE(review): presumably the score can be missing while the report
        # is still being generated - confirm against the API documentation.
        if plagiarism_score is None:
            print(f"{document_title} has no plagiarism score yet")
            break
        if plagiarism_score < PLAGIARISM_THRESHOLD:
            break

        print(f"{document_title} has plagiarism score: {plagiarism_score}%")

        # Read the current PDF and embed it in the request as base64
        doc_path = f"outputs/{document_title}.pdf"
        with open(doc_path, 'rb') as doc:
            data = doc.read()
        base64_string = base64.b64encode(data).decode("utf-8")

        # Generate a new paper based on the previous one
        prompt = f"You got got cought plagiarizing. The plagirism score was {plagiarism_score}%. Write the paper again to fix the plagiarism."

        completion = client.chat.completions.create(
            model="gpt-4o",
            store=True,
            messages=[
                {"role": "developer", "content": sys_instruction},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "file",
                            "file": {
                                "filename": f"{document_title}.pdf",
                                "file_data": f"data:application/pdf;base64,{base64_string}",
                            },
                        },
                        {
                            "type": "text",
                            "text": prompt,
                        },
                    ],
                },
            ],
        )

        next_version = paper_version + 1
        title = f"{paper_number}_{topic}_version{next_version}"
        file_path = f"outputs/{title}"

        try:
            # "w" so a re-run replaces the file instead of appending to it
            with open(f"{file_path}.md", "w", encoding="utf-8") as f:
                f.write(completion.choices[0].message.content)

            command = [
                "pandoc",
                f"{file_path}.md",
                "--pdf-engine=xelatex",
                "-o",
                f"{file_path}.pdf",
            ]
            subprocess.run(command, check=True)
            print(f"written to {file_path} to file")

            # Wrap the new PDF's bytes in an XML-RPC binary value
            with open(f"{file_path}.pdf", 'rb') as pdf_file:
                encoded_pdf = xmlrpc.client.Binary(pdf_file.read())

            # The upload names the file that was just written; the original
            # reused a `filename` left over from an earlier cell.
            documents = [{
                'title': title,
                'author_first': 'OpenAI',
                'author_last': 'ChatGPT',
                'filename': f"{title}.pdf",
                'upload': encoded_pdf
            }]
            arguments = dict(sid=sid, folder=folder_id, submit_to=1, uploads=documents)

            # Submit the document to iThenticate
            response = server.document.add(arguments)
            document_id = response['uploaded'][0]['id']
            print('Document uploaded to iThenticate')
            time.sleep(REPORT_WAIT_SECONDS)

            # Fetch the score of the new version
            arguments = dict(sid=sid, id=document_id)
            response = server.document.get(arguments)
            plagiarism_score = response['documents'][0]['parts'][0]['score']
            document_title = response['documents'][0]['title']
        except Exception as e:
            # Stop working on this paper: retrying would issue another paid
            # generation request against the same unchanged document.
            print(f"Error while regenerating {document_title}: {e}")
            break

        paper_version = next_version

7_technology_version1 has plagiarism score: 20%
written to outputs/7_technology_version2 to file
Document uploaded to iThenticate
7_technology_version2 has plagiarism score: 11%
written to outputs/7_technology_version3 to file
Document uploaded to iThenticate
4_science_version1 has plagiarism score: 15%
written to outputs/4_science_version2 to file
Document uploaded to iThenticate
26_science_version1 has plagiarism score: 15%
written to outputs/26_science_version2 to file
Document uploaded to iThenticate
26_science_version2 has plagiarism score: 14%
written to outputs/26_science_version3 to file
Document uploaded to iThenticate
26_science_version3 has plagiarism score: 15%
written to outputs/26_science_version4 to file
Document uploaded to iThenticate
26_science_version4 has plagiarism score: 16%
written to outputs/26_science_version5 to file
Document uploaded to iThenticate
26_science_version5 has plagiarism score: 15%
written to outputs/26_science_version6 to file
Document uploaded t

In [None]:
# Rewrite every paper whose Winston AI score is at or above the threshold.
# Each rewrite is rendered to PDF and its extracted text is re-scored; the loop
# stops as soon as the score drops below the threshold, an error occurs, or
# the attempt limit is reached.
folder_path = 'outputs'

# Defined here so the cell does not depend on a `url` set by another cell
winston_url = "https://api.gowinston.ai/v2/plagiarism"

# Upper bound on rewrite attempts per paper (the original loop ran 9 times)
MAX_REWRITES = 9
# Papers scoring at or above this value are rewritten
PLAGIARISM_THRESHOLD = 10

# Matches <paper number>_<topic>_version<n>; because re.match only anchors at
# the start, names that carry a trailing "_winston..." suffix match as well.
title_pattern = re.compile(r"(\d+)_([a-zA-Z]+)_version(\d+)")

headers = {
    "Authorization": f"Bearer {winston_api}",
    "Content-Type": "application/json"
}

# System prompt used for all rewrite requests
sys_instruction="All prompts should be answered with an in depth paper with an introduction, middle and end structured into chapters that is about 4 pages, written in markdown and include sources. Dont answer anything with less than 4 pages. Dont write anything other than the paper. The paper should be about the same topic as the previous one. With as few changes as possible."

# Iterate through the documents
for element in scores:
    # Extract the plagiarism score and document title
    plagiarism_score = element['score']
    document_title = element['name']

    match = title_pattern.match(document_title)
    if match is None:
        # Skip instead of continuing with values left over from the
        # previous document.
        print("Error in matching the pattern for input_string_1")
        continue
    paper_number = match.group(1)
    topic = match.group(2)
    paper_version = int(match.group(3))

    for attempt in range(MAX_REWRITES):
        if plagiarism_score < PLAGIARISM_THRESHOLD:
            break

        print(f"{document_title} has plagiarism score: {plagiarism_score}%")

        # Read the current PDF and embed it in the request as base64
        doc_path = f"outputs/{document_title}.pdf"
        with open(doc_path, 'rb') as doc:
            data = doc.read()
        base64_string = base64.b64encode(data).decode("utf-8")

        # Generate a new paper based on the previous one
        prompt = f"You got got cought plagiarizing. The plagirism score was {plagiarism_score}%. Write the paper again to fix the plagiarism."

        completion = client.chat.completions.create(
            model="gpt-4o",
            store=True,
            messages=[
                {"role": "developer", "content": sys_instruction},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "file",
                            "file": {
                                "filename": f"{document_title}.pdf",
                                "file_data": f"data:application/pdf;base64,{base64_string}",
                            },
                        },
                        {
                            "type": "text",
                            "text": prompt,
                        },
                    ],
                },
            ],
        )

        next_version = paper_version + 1
        title = f"{paper_number}_{topic}_version{next_version}"
        # The file name carries the score that triggered this rewrite
        file_path = f"outputs/{title}_winston{plagiarism_score}"

        try:
            # Write the generated paper (the original wrote the text of the
            # previous HTTP response here); "w" so re-runs do not append.
            with open(f"{file_path}.md", "w", encoding="utf-8") as f:
                f.write(completion.choices[0].message.content)

            command = [
                "pandoc",
                f"{file_path}.md",
                "--pdf-engine=xelatex",
                "-o",
                f"{file_path}.pdf",
            ]
            subprocess.run(command, check=True)
            print(f"written {file_path} to file")

            # Score the PDF that was just produced, not a file name left
            # over from an earlier cell.
            with pymupdf.open(f"{file_path}.pdf") as pdf:
                text = "".join(page.get_text() for page in pdf)

            payload = {
                "text": text,
                "language": "en",
            }
            response = requests.post(winston_url, json=payload, headers=headers, timeout=120)
            # Fail before touching the body if the request was not successful
            response.raise_for_status()
            response_data = response.json()
            new_score = response_data["result"]["score"]
        except Exception as e:
            # Stop working on this paper: retrying would issue another paid
            # generation request against the same unchanged document.
            print(f"Error while regenerating {document_title}: {e}")
            break

        # The next iteration reads the PDF via document_title, so it must be
        # the name the file was actually saved under (old score in the name).
        document_title = os.path.basename(file_path)
        plagiarism_score = new_score
        paper_version = next_version
        print(f"New Score: {title}_winston{plagiarism_score}")

paper_number: 10, topic: history, paper_version: 1
paper_number: 11, topic: literature, paper_version: 1
paper_number: 12, topic: literature, paper_version: 1
paper_number: 13, topic: art, paper_version: 1
paper_number: 14, topic: technology, paper_version: 1
paper_number: 15, topic: history, paper_version: 1
paper_number: 16, topic: history, paper_version: 1
paper_number: 17, topic: economics, paper_version: 1
paper_number: 18, topic: literature, paper_version: 1
paper_number: 19, topic: technology, paper_version: 1
paper_number: 1, topic: literature, paper_version: 1
paper_number: 20, topic: history, paper_version: 1
paper_number: 21, topic: economics, paper_version: 1
paper_number: 22, topic: economics, paper_version: 1
paper_number: 23, topic: literature, paper_version: 1
paper_number: 24, topic: science, paper_version: 1
paper_number: 25, topic: economics, paper_version: 1
paper_number: 26, topic: science, paper_version: 1
