In [14]:
# Load the required libraries
from openai import OpenAI
import random
import subprocess
import pathlib
from dotenv import load_dotenv
import os
import xmlrpc.client
import time
import re
import base64
import pymupdf

In [2]:
# Pull the variables defined in the local .env file into the process environment
load_dotenv()

# API key for the chat-completions client, looked up under the name API_KEY
api_key = os.environ.get("API_KEY")

# OpenAI client pointed at the DeepSeek base URL
client = OpenAI(
    base_url="https://api.deepseek.com",
    api_key=api_key,
)

In [4]:
# System prompt sent with every generation request; it fixes the format of the paper.
sys_instruction="All prompts should be answered with an in depth paper with an introduction, middle and end structured into chapters that is about 4 pages, written in markdown and include sources. Dont answer anything with less than 4 pages. Dont write anything other than the paper. Dont write ```markdown etc."
# Broad subject areas; one is drawn at random for each paper.
topics = ["technology", "science", "history", "art", "literature", "politics", "economics", "philosophy"]

plagiarism_score = 0
version = 1
file_path = ""
topic = ""

# Make sure the output directory exists before the first file is written.
os.makedirs("outputs", exist_ok=True)

# range(1, 30) yields paper numbers 1..29, i.e. 29 papers per run.
for paper_number in range(1, 30):
    topic = random.choice(topics)
    completion = client.chat.completions.create(
        model="deepseek-chat",
        store=True,
        messages=[
            {"role": "system", "content": sys_instruction},
            {"role": "user", "content": f"Write a paper about {topic}. The specific topic is up to you."},
        ],
    )

    # Common stem of the markdown file and the PDF produced from it.
    file_path = f"outputs/{paper_number}_{topic}_version{version}"

    try:
        # "w" so that a re-run replaces the file instead of appending a second
        # paper to it; UTF-8 because that is the encoding pandoc reads.
        with open(f"{file_path}.md", "w", encoding="utf-8") as f:
            f.write(completion.choices[0].message.content)
        print('Written to file')

    except Exception as e:
        print(f"Error writing to file: {e}")
        # Without a markdown file there is nothing to convert for this paper.
        continue

    # Convert the markdown file to a PDF. The arguments are passed as a list
    # (no shell), so the paths need no quoting.
    command = [
        "pandoc",
        f"{file_path}.md",
        "--pdf-engine=xelatex",
        "-o",
        f"{file_path}.pdf",
    ]
    # check=True: a failing pandoc run raises CalledProcessError and stops the loop.
    subprocess.run(command, check=True)

Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file
Written to file


In [5]:
# Log in to iThenticate over XML-RPC and keep the session id for later calls
load_dotenv()

# Account credentials come from the environment (.env)
username = os.environ.get("ITHENTICATE_USERNAME")
password = os.environ.get("ITHENTICATE_PASSWORD")

# XML-RPC proxy for the iThenticate endpoint
url = "https://api.ithenticate.com/rpc"
server = xmlrpc.client.ServerProxy(url)

credentials = dict(username=username, password=password)

response = server.login(credentials)

# Session id, both as a bare value and wrapped in a dict for calls that take only the sid
sid = response['sid']
sid_dict = {'sid': sid}

print(response['api_status'])

200


In [6]:
# Look up the id of the folder named 'Trym Master Thesis'
response = server.folder.list(sid_dict)

# First folder whose name matches; None when no folder matches
folder_id = next(
    (
        entry.get('id')
        for entry in response.get('folders', [])
        if entry.get('name') == 'Trym Master Thesis'
    ),
    None,
)

folder = {'folder': folder_id}
print(f"Folder ID for 'Trym Master Thesis': {folder}")

Folder ID for 'Trym Master Thesis': {'folder': 4378261}


In [7]:
# Directory that holds the generated PDFs
folder_path = 'outputs'

# Upload payloads, one dict per PDF found in the directory
documents = []

for filename in os.listdir(folder_path):
    # Only PDFs are submitted; every other file in the directory is skipped
    if not filename.endswith('.pdf'):
        continue

    # The file name without its extension is used as the document title
    title = os.path.splitext(filename)[0]

    author_first = 'Hangzhou'
    author_last = 'DeepSeek'

    # Wrap the raw PDF bytes in an XML-RPC Binary value
    encoded_pdf = xmlrpc.client.Binary(
        pathlib.Path(folder_path, filename).read_bytes()
    )

    document_data = dict(
        title=title,
        author_first=author_first,
        author_last=author_last,
        filename=filename,
        upload=encoded_pdf,
    )
    documents.append(document_data)

# One request carrying every collected document
arguments = {
    'sid': sid,
    'folder': folder_id,
    'submit_to': 1,
    'uploads': documents,
}

# Send the batch to iThenticate and show the returned status code
response = server.document.add(arguments)
print(response['api_status'])

200


In [9]:
# Fetch the folder contents and keep the ids of the documents authored by "DeepSeek"
arguments = {'sid': sid, 'id': folder_id}

response = server.folder.get(arguments)

document_ids = []
for entry in response['documents']:
    # Documents with any other author_last value are ignored
    if entry.get('author_last') == "DeepSeek":
        document_ids.append(entry['id'])

print(document_ids)

[115488120, 115488130, 115488112, 115488124, 115488114, 115488118, 115488122, 115488106, 115488127, 115488132, 115488131, 115488110, 115488128, 115488119, 115488125, 115488126, 115488121, 115488123, 115488116, 115488105, 115488111, 115488115, 115488107, 115488117, 115488101]


In [None]:
# For every uploaded document: while its plagiarism score is 10% or higher, ask
# the model for a rewrite, convert it to PDF, upload it to iThenticate and read
# back the score of the new version.

# System prompt for rewrite requests; assigned once instead of on every pass.
sys_instruction="All prompts should be answered with an in depth paper with an introduction, middle and end structured into chapters that is about 4 pages, written in markdown and include sources. Dont answer anything with less than 4 pages. Dont write anything other than the paper. The paper should be about the same topic as the previous one. With as few changes as possible."

# Upper bound on rewrite attempts per document (same count as range(1, 10)).
max_rewrites = 9

# Iterate through the document IDs
for document_id in document_ids:
    arguments = dict(sid=sid, id=document_id)
    response = server.document.get(arguments)
    plagiarism_score = response['documents'][0]['parts'][0]['score']
    document_title = response['documents'][0]['title']

    # Titles are expected to look like "<number>_<topic>_version<k>".
    # Extract the paper number and topic from the document title
    match = re.match(r'^\D*(\d{1,2})', document_title)
    paper_number = match.group(1) if match else ''

    match = re.search(r'_(.*?)_', document_title)
    topic = match.group(1) if match else ''

    # Version of the document being checked; handles multi-digit versions and
    # falls back to 1 when the title carries no version suffix.
    match = re.search(r'version(\d+)$', document_title)
    first_version = int(match.group(1)) if match else 1

    # Start counting at the document's own version so that each rewrite is
    # written to version+1 and never on top of an existing earlier version.
    for paper_version in range(first_version, first_version + max_rewrites):
        # NOTE(review): a missing score presumably means the similarity report
        # is not ready yet - confirm against the iThenticate API.
        if plagiarism_score is None:
            print(f"{document_title} has no plagiarism score yet")
            break

        # Stop as soon as the current version is below the 10% threshold
        if plagiarism_score < 10:
            break

        print(f"{document_title} has plagiarism score: {plagiarism_score}%")

        # Path to the PDF file
        doc_path = f"outputs/{document_title}.pdf"

        # Convert the PDF to text using PyMuPDF since DeepSeek does not support file upload.
        # All pages are joined (not only the last one) and the file is closed afterwards.
        with pymupdf.open(doc_path) as doc:
            text = "".join(page.get_text() for page in doc)

        # Generate a new paper based on the previous one
        prompt = f"You got got cought plagiarizing. The plagirism score was {plagiarism_score}%. Write the paper again to fix the plagiarism."

        completion = client.chat.completions.create(
            model="deepseek-chat",
            store=True,
            messages=[
                {"role": "system", "content": sys_instruction},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"{prompt} Here is the previous paper: {text}",
                        },
                    ],
                },
            ],
        )

        # Title and path stem of the version about to be written
        title = f"{paper_number}_{topic}_version{paper_version+1}"
        file_path = f"outputs/{title}"

        try:
            # Write the response to a file. "w" replaces any earlier attempt
            # instead of appending to it; UTF-8 is the encoding pandoc reads.
            with open(f"{file_path}.md", "w", encoding="utf-8") as f:
                f.write(completion.choices[0].message.content)

            command = [
                "pandoc",
                f"{file_path}.md",
                "--pdf-engine=xelatex",
                "-o",
                f"{file_path}.pdf",
            ]

            subprocess.run(command, check=True)
            print(f"written to {file_path} to file")

            author_first = 'Hangzhou'
            author_last = 'DeepSeek'

            # Wrap the raw PDF bytes in an XML-RPC Binary value
            with open(f"{file_path}.pdf", 'rb') as pdf_file:
                encoded_pdf = xmlrpc.client.Binary(pdf_file.read())

            # Create the document data dictionary; the file name is the new
            # PDF's own name, not a value left over from an earlier cell.
            document_data = {
                'title': title,
                'author_first': author_first,
                'author_last': author_last,
                'filename': f"{title}.pdf",
                'upload': encoded_pdf
            }

            # Single-document upload list
            documents = [document_data]

            arguments = dict(sid=sid, folder=folder_id, submit_to=1, uploads=documents)

            # Submit the document to iThenticate
            response = server.document.add(arguments)
            document_id = response['uploaded'][0]['id']
            print('Document uploaded to iThenticate')

            # NOTE(review): fixed 20 s wait before reading the score; this
            # assumes the report is ready by then - confirm.
            time.sleep(20)

            arguments = dict(sid=sid, id=document_id)
            response = server.document.get(arguments)

            # The new version becomes the document checked on the next pass
            plagiarism_score = response['documents'][0]['parts'][0]['score']
            document_title = response['documents'][0]['title']
        except Exception as e:
            # Any step above may have failed (write, pandoc, upload, score
            # lookup). The score and title are left unchanged, so the next
            # pass retries from the same document.
            print(f"Error while rewriting {document_title}: {e}")

26_history_version1 has plagiarism score: 17%
written to outputs/26_history_version2 to file
Document uploaded to iThenticate
7_science_version1 has plagiarism score: 28%
written to outputs/7_science_version2 to file
Document uploaded to iThenticate
19_technology_version1 has plagiarism score: 19%
written to outputs/19_technology_version2 to file
Document uploaded to iThenticate
2_economics_version1 has plagiarism score: 28%
written to outputs/2_economics_version2 to file
Document uploaded to iThenticate
2_economics_version2 has plagiarism score: 11%
written to outputs/2_economics_version3 to file
Document uploaded to iThenticate
20_history_version1 has plagiarism score: 16%
written to outputs/20_history_version2 to file
Document uploaded to iThenticate
24_science_version1 has plagiarism score: 22%
written to outputs/24_science_version2 to file
Document uploaded to iThenticate
24_science_version2 has plagiarism score: 26%
written to outputs/24_science_version3 to file
Document uploaded