diff --git a/opendevin/llm/readers.py b/opendevin/llm/readers.py
new file mode 100644
index 00000000000..bc3e13a9f90
--- /dev/null
+++ b/opendevin/llm/readers.py
@@ -0,0 +1,516 @@
+import base64
+import json
+import subprocess
+import zipfile
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, Union
+
+import charset_normalizer
+import cv2
+import docx
+
+# import markdown
+import openpyxl
+import pandas as pd
+import PyPDF2
+import requests
+import yaml
+from bs4 import BeautifulSoup
+from openai import OpenAI
+from pptx import Presentation
+from pylatexenc.latex2text import LatexNodes2Text
+
+from opendevin.core.config import config
+from opendevin.core.logger import opendevin_logger as logger
+
+# TODO: Find a way to get the API key directly from ConfigType.LLM_API_KEY, or change this with litellm.
+OPENAI_API_KEY = config.llm.api_key
+
+
+class Reader(ABC):
+    """
+    @Desc: Implementation to support reading multimodal files efficiently.
+    @Ref: https://github.com/metauto-ai/GPTSwarm/blob/main/swarm/environment/tools/reader/readers.py
+    """
+
+    @abstractmethod
+    def parse(self, file_path: Path) -> str:
+        """To be overridden by the descendant class."""
+        pass
+
+
+class TXTReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        content = charset_normalizer.from_path(file_path).best()
+        logger.info(
+            f"Reading TXT file from {file_path} using encoding '{content.encoding}'."
+        )
+        return str(content)
+
+
+class PDFReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Reading PDF file from {file_path}.')
+        content = PyPDF2.PdfReader(file_path)
+        text = ''
+        for page_idx in range(len(content.pages)):
+            text += f'Page {page_idx + 1}\n' + content.pages[page_idx].extract_text()
+        return text
+
+
+class DOCXReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Reading DOCX file from {file_path}.')
+        content = docx.Document(str(file_path))
+        text = ''
+        for i, para in enumerate(content.paragraphs):
+            text += f'Paragraph {i + 1}:\n' + para.text
+        return text
+
+
+class JSONReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Reading JSON file from {file_path}.')
+
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+        text = str(data)
+        return text
+
+
+class JSONLReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Reading JSON Lines file from {file_path}.')
+        with open(file_path, 'r') as f:
+            lines = [json.loads(line) for line in f]
+        text = '\n'.join([str(line) for line in lines])
+        return text
+
+
+class XMLReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Reading XML file from {file_path}.')
+        with open(file_path, 'r') as f:
+            data = BeautifulSoup(f, 'xml')
+        text = data.get_text()
+        return text
+
+
+class YAMLReader(Reader):
+    def parse(self, file_path: Path, return_str=True) -> Union[str, Any]:
+        logger.info(f'Reading YAML file from {file_path}.')
+        with open(file_path, 'r') as f:
+            data = yaml.load(f, Loader=yaml.FullLoader)
+        text = str(data)
+        if return_str:
+            return text
+        else:
+            return data
+
+
+class HTMLReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Reading HTML file from {file_path}.')
+        with open(file_path, 'r') as f:
+            data = BeautifulSoup(f, 'html.parser')
+        text = data.get_text()
+        return text
+
+
+# class MarkdownReader(Reader):
+#     def parse(self, file_path: Path) -> str:
+#         logger.info(f'Reading Markdown file from {file_path}.')
+#         with open(file_path, 'r') as f:
+#             data = markdown.markdown(f.read())
+#         text = ''.join(BeautifulSoup(data, 'html.parser').findAll(string=True))
+#         return text
+
+
+class LaTexReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Reading LaTeX file from {file_path}.')
+        with open(file_path, 'r') as f:
+            data = f.read()
+        text = LatexNodes2Text().latex_to_text(data)
+        return text
+
+
+class AudioReader(Reader):
+    @staticmethod
+    def parse(file_path: Path) -> str:
+        api_key = OPENAI_API_KEY
+        logger.info(f'Transcribing audio file from {file_path}.')
+        client = OpenAI(api_key=api_key)
+        try:
+            # TODO: record the COST of the API call
+            with open(file_path, 'rb') as audio_file:
+                transcript = client.audio.translations.create(
+                    model='whisper-1', file=audio_file
+                )
+            return transcript.text
+
+        except Exception as e:
+            logger.info(f'Error transcribing audio file: {e}')
+            return 'Error transcribing audio file.'
+
+
+class PPTXReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Reading PowerPoint file from {file_path}.')
+        try:
+            pres = Presentation(str(file_path))
+            text = []
+            for slide_idx, slide in enumerate(pres.slides):
+                text.append(f'Slide {slide_idx + 1}:\n')
+                for shape in slide.shapes:
+                    if hasattr(shape, 'text'):
+                        text.append(shape.text)
+            return '\n'.join(text)
+
+        except Exception as e:
+            logger.info(f'Error reading PowerPoint file: {e}')
+            return 'Error reading PowerPoint file.'
+
+
+class ExcelReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Reading Excel file from {file_path}.')
+        try:
+            excel_data = pd.read_excel(file_path, sheet_name=None)
+
+            all_sheets_text = []
+            for sheet_name, data in excel_data.items():
+                all_sheets_text.append(
+                    f'Sheet Name: {sheet_name}\n{data.to_string()}\n'
+                )
+            return '\n'.join(all_sheets_text)
+
+        except Exception as e:
+            logger.info(f'Error reading Excel file: {e}')
+            return 'Error reading Excel file.'
+
+
+class XLSXReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Reading XLSX file from {file_path}.')
+        workbook = openpyxl.load_workbook(file_path, data_only=True)
+        text = ''
+
+        for sheet in workbook:
+            text += f'\nSheet: {sheet.title}\n'
+            for row in sheet.iter_rows(values_only=True):
+                row_data = [str(cell) if cell is not None else '' for cell in row]
+                text += '\t'.join(row_data) + '\n'
+        return text
+
+
+class ZipReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Reading ZIP file from {file_path}.')
+        file_content = ''
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            extract_dir = str(file_path)[:-4] + '/'
+            zip_ref.extractall(Path(extract_dir))
+            reader = FileReader()
+            for file_name in zip_ref.namelist():
+                file_content += f'File {file_name}:\n"{reader.read_file(Path(extract_dir + file_name))}"\n'
+        return file_content
+
+
+class PythonReader(Reader):
+    def parse(self, file_path: Path) -> str:
+        logger.info(f'Executing and reading Python file from {file_path}.')
+        file_content = ''
+        execution_result = ''
+        try:
+            completed_process = subprocess.run(
+                ['python', file_path], capture_output=True, text=True, check=True
+            )
+            execution_result = 'Execution information:\n' + completed_process.stdout
+        except subprocess.CalledProcessError as e:
+            execution_result = 'Error:\n' + e.stderr
+            return execution_result
+        except Exception as e:
+            logger.info(f'Error executing Python file: {e}')
+
+        try:
+            with open(file_path, 'r') as file:
+                file_content = '\nFile Content:\n' + file.read()
+        except Exception as e:
+            logger.info(f'Error reading Python file: {e}')
+        return file_content + '\n' + execution_result
+
+
+class IMGReader(Reader):
+    def base64_img(self, file_path: Path) -> str:
+        with open(file_path, 'rb') as image_file:
+            encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
+        return encoded_image
+
+    def prepare_api_call(
+        self, task: str, base64_frame: str, model='gpt-4o-2024-05-13', max_token=500
+    ) -> dict:
+        return {
+            'model': model,
+            'messages': [
+                {
+                    'role': 'user',
+                    'content': [
+                        {'type': 'text', 'text': task},
+                        {
+                            'type': 'image_url',
+                            'image_url': {
+                                'url': f'data:image/jpeg;base64,{base64_frame}'
+                            },
+                        },
+                    ],
+                }
+            ],
+            'max_tokens': max_token,
+        }
+
+    def get_headers(self) -> dict:
+        return {
+            'Content-Type': 'application/json',
+            'Authorization': f'Bearer {OPENAI_API_KEY}',
+        }
+
+    def parse(
+        self, file_path: Path, task: str = 'Describe this image in as much detail as possible.'
+    ) -> str:
+        logger.info(f'Reading image file from {file_path}.')
+        # TODO: record the COST of the API call
+        try:
+            openai_proxy: str = 'https://api.openai.com/v1/chat/completions'
+            base64_image = self.base64_img(Path(file_path))
+            api_call = self.prepare_api_call(task, base64_image)
+            response = requests.post(
+                openai_proxy, headers=self.get_headers(), json=api_call
+            )
+            out = response.json()
+            content = out['choices'][0]['message']['content']
+            return content
+
+        except Exception as error:
+            logger.error(f'Error with the request: {error}')
+            raise
+
+
+class VideoReader(Reader):
+    def base64_video(self, file_path: Path, frame_interval: int = 10) -> list:
+        video = cv2.VideoCapture(str(file_path))
+        base64_frames = []
+        frame_count = 0
+        while video.isOpened():
+            success, frame = video.read()
+            if not success:
+                break
+            if frame_count % frame_interval == 0:
+                _, buffer = cv2.imencode('.jpg', frame)
+                base64_frames.append(base64.b64encode(buffer).decode('utf-8'))
+            frame_count += 1
+        video.release()
+        return base64_frames
+
+    def prepare_api_call(
+        self, task: str, base64_frame: str, model='gpt-4o-2024-05-13', max_token=500
+    ) -> dict:
+        return {
+            'model': model,
+            'messages': [
+                {
+                    'role': 'user',
+                    'content': [
+                        {'type': 'text', 'text': task},
+                        {
+                            'type': 'image_url',
+                            'image_url': {
+                                'url': f'data:image/jpeg;base64,{base64_frame}'
+                            },
+                        },
+                    ],
+                }
+            ],
+            'max_tokens': max_token,
+        }
+
+    def get_headers(self) -> dict:
+        return {
+            'Content-Type': 'application/json',
+            'Authorization': f'Bearer {OPENAI_API_KEY}',
+        }
+
+    def parse(
+        self,
+        file_path: Path,
+        task: str = 'Describe this image in as much detail as possible.',
+        frame_interval: int = 30,
+        used_audio: bool = True,
+    ) -> str:
+        logger.info(
+            f'Processing video file from {file_path} with frame interval {frame_interval}.'
+        )
+
+        video_summary = ''
+        idx = 0
+        task = task or 'This is one frame from a video, please summarize this frame.'
+        base64_frames = self.base64_video(Path(file_path))
+        selected_frames = base64_frames[::frame_interval]
+
+        if len(selected_frames) > 30:
+            new_interval = len(base64_frames) // 30
+            selected_frames = base64_frames[::new_interval]
+
+        logger.info(f'A total of {len(selected_frames)} frames will be analyzed...')
+
+        for base64_frame in selected_frames:
+            idx += 1
+            logger.info(
+                f'Processing {file_path}, currently at frame No. {idx * frame_interval}...'
+            )
+            # TODO: record the COST of the API call
+            api_call = self.prepare_api_call(task, base64_frame)
+            try:
+                openai_proxy: str = 'https://api.openai.com/v1/chat/completions'
+                response = requests.post(
+                    openai_proxy, headers=self.get_headers(), json=api_call
+                )
+                content = response.json()['choices'][0]['message']['content']
+                current_frame_content = f"Frame {idx}'s content: {content}\n"
+                video_summary += current_frame_content
+                logger.info(current_frame_content)
+
+            except Exception as error:
+                logger.error(f'Error with the request: {error}')
+                raise
+
+        logger.info(f'Video summary: {video_summary}')
+        return video_summary
+
+
+# Supports 40 file extensions (Markdown support is currently commented out).
+READER_MAP = {
+    '.png': IMGReader(),
+    '.jpg': IMGReader(),
+    '.jpeg': IMGReader(),
+    '.gif': IMGReader(),
+    '.bmp': IMGReader(),
+    '.tiff': IMGReader(),
+    '.tif': IMGReader(),
+    '.webp': IMGReader(),
+    '.mp3': AudioReader(),
+    '.m4a': AudioReader(),
+    '.wav': AudioReader(),
+    '.MOV': VideoReader(),
+    '.mp4': VideoReader(),
+    '.mov': VideoReader(),
+    '.avi': VideoReader(),
+    '.mpg': VideoReader(),
+    '.mpeg': VideoReader(),
+    '.wmv': VideoReader(),
+    '.flv': VideoReader(),
+    '.webm': VideoReader(),
+    '.zip': ZipReader(),
+    '.pptx': PPTXReader(),
+    '.xlsx': ExcelReader(),
+    '.xls': ExcelReader(),
+    '.txt': TXTReader(),
+    '.csv': TXTReader(),
+    '.pdf': PDFReader(),
+    '.docx': DOCXReader(),
+    '.json': JSONReader(),
+    '.jsonld': JSONReader(),
+    '.jsonl': JSONLReader(),
+    '.xml': XMLReader(),
+    '.yaml': YAMLReader(),
+    '.yml': YAMLReader(),
+    '.html': HTMLReader(),
+    '.htm': HTMLReader(),
+    '.xhtml': HTMLReader(),
+    # '.md': MarkdownReader(),
+    # '.markdown': MarkdownReader(),
+    '.tex': LaTexReader(),
+    '.py': PythonReader(),
+    '.pdb': TXTReader(),
+}
+
+
+class FileReader:
+    def set_reader(self, suffix) -> None:
+        self.reader = READER_MAP[suffix]
+        logger.info(f'Setting Reader to {type(self.reader).__name__}')
+
+    def read_file(self, file_path: Path, task='describe the file') -> str:
+        suffix = file_path.suffix
+        self.set_reader(suffix)
+        if isinstance(self.reader, IMGReader) or isinstance(self.reader, VideoReader):
+            file_content = self.reader.parse(file_path, task)
+        else:
+            file_content = self.reader.parse(file_path)
+        logger.info(f'Reading file {file_path} using {type(self.reader).__name__}')
+        return file_content
+
+
+class GeneralReader:
+    def __init__(self):
+        self.file_reader = FileReader()
+        self.name = 'General File Reader'
+        self.description = """A general file reader supporting multimodal files."""
+
+    def read(self, task, file):
+        files_content = ''
+        file_content = self.file_reader.read_file(file, task)
+        suffix = file.split('.')[-1]
+
+        if suffix in ['py', 'java', 'cpp', 'c', 'js', 'css', 'html', 'htm', 'xml']:
+            # read_file returns a single string; for .py files it already
+            # includes the execution output produced by PythonReader.
+            files_content += f'\nThe {suffix} file contains:\n---\n{file_content}\n---'
+
+        elif suffix in [
+            'txt',
+            'jsonl',
+            'csv',
+            'json',
+            'jsonld',
+            'yaml',
+            'yml',
+            'xlsx',
+            'xls',
+            'jpg',
+            'png',
+            'jpeg',
+            'gif',
+            'bmp',
+            'mp3',
+            'wav',
+            'ogg',
+            'mp4',
+            'avi',
+            'mkv',
+            'mov',
+            'pdf',
+            'doc',
+            'docx',
+            'ppt',
+            'pptx',
+            'md',
+            'markdown',
+            'tex',
+            'zip',
+            'tar',
+            'gz',
+            '7z',
+            'rar',
+        ]:
+            files_content += f'\nThe {suffix} file contains:\n---\n{file_content}\n---'
+
+        return files_content
diff --git a/poetry.lock b/poetry.lock
index e090ec25fdc..9ac7f1a1d4c 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1311,6 +1311,17 @@ files = [
     {file = "english-words-2.0.1.tar.gz", hash = "sha256:a4105c57493bb757a3d8973fcf8e1dc05e7ca09c836dff467c3fb445f84bc43d"},
 ]
 
+[[package]]
+name = "et-xmlfile"
+version = "1.1.0"
+description = "An implementation of lxml.xmlfile for the standard library"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
+    {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
+]
+
 [[package]]
 name = "evaluate"
 version = 
"0.4.2" @@ -2364,13 +2375,13 @@ files = [ [[package]] name = "json-repair" -version = "0.19.2" +version = "0.19.1" description = "A package to repair broken json strings" optional = false python-versions = ">=3.7" files = [ - {file = "json_repair-0.19.2-py3-none-any.whl", hash = "sha256:eeacf422c620d98499c6a7d6da78dc52857bd419f2276157d44ef2441eccca2e"}, - {file = "json_repair-0.19.2.tar.gz", hash = "sha256:0bb1963a2a0958b18f403a4cc937fdb580f63ba7b86b9779c5a9be6d9bdc9e9d"}, + {file = "json_repair-0.19.1-py3-none-any.whl", hash = "sha256:ec9bf426481352390771f7814823e68975cbedcf8e09bfefdc8478e648692215"}, + {file = "json_repair-0.19.1.tar.gz", hash = "sha256:b1de048af915044e5324c1d7a48cc2e083d9f3929ac453dc50972a0d47011c37"}, ] [[package]] @@ -3142,6 +3153,21 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=3.0.10)"] +[[package]] +name = "markdown" +version = "3.6" +description = "Python implementation of John Gruber's Markdown." +optional = false +python-versions = ">=3.8" +files = [ + {file = "Markdown-3.6-py3-none-any.whl", hash = "sha256:48f276f4d8cfb8ce6527c8f79e2ee29708508bf4d40aa410fbc3b4ee832c850f"}, + {file = "Markdown-3.6.tar.gz", hash = "sha256:ed4f41f6daecbeeb96e576ce414c41d2d876daa9a16cb35fa8ed8c2ddfad0224"}, +] + +[package.extras] +docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"] +testing = ["coverage", "pyyaml"] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -4025,6 +4051,42 @@ typing-extensions = ">=4.7,<5" [package.extras] datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +[[package]] +name = "opencv-python" +version = "4.9.0.80" +description = "Wrapper package for OpenCV python bindings." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "opencv-python-4.9.0.80.tar.gz", hash = "sha256:1a9f0e6267de3a1a1db0c54213d022c7c8b5b9ca4b580e80bdc58516c922c9e1"}, + {file = "opencv_python-4.9.0.80-cp37-abi3-macosx_10_16_x86_64.whl", hash = "sha256:7e5f7aa4486651a6ebfa8ed4b594b65bd2d2f41beeb4241a3e4b1b85acbbbadb"}, + {file = "opencv_python-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:71dfb9555ccccdd77305fc3dcca5897fbf0cf28b297c51ee55e079c065d812a3"}, + {file = "opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b34a52e9da36dda8c151c6394aed602e4b17fa041df0b9f5b93ae10b0fcca2a"}, + {file = "opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4088cab82b66a3b37ffc452976b14a3c599269c247895ae9ceb4066d8188a57"}, + {file = "opencv_python-4.9.0.80-cp37-abi3-win32.whl", hash = "sha256:dcf000c36dd1651118a2462257e3a9e76db789a78432e1f303c7bac54f63ef6c"}, + {file = "opencv_python-4.9.0.80-cp37-abi3-win_amd64.whl", hash = "sha256:3f16f08e02b2a2da44259c7cc712e779eff1dd8b55fdb0323e8cab09548086c0"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] + +[[package]] +name = "openpyxl" +version = "3.1.2" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = false +python-versions = ">=3.6" +files = [ + {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"}, + {file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "opentelemetry-api" version = "1.24.0" @@ -4272,7 +4334,6 @@ optional = false python-versions = ">=3.9" files = [ {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, - {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, @@ -4293,7 +4354,6 @@ files = [ {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, - {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, @@ -4911,6 +4971,16 @@ dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pyte docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] +[[package]] +name = "pylatexenc" +version = "2.10" +description = "Simple LaTeX parser providing latex-to-unicode and unicode-to-latex conversion" +optional = false +python-versions = "*" +files = [ + {file = "pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3"}, +] + [[package]] name = "pyparsing" version = "3.1.2" @@ -4943,6 +5013,24 @@ docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] full = ["Pillow (>=8.0.0)", "PyCryptodome", "cryptography"] image = ["Pillow (>=8.0.0)"] +[[package]] +name = "pypdf2" +version = "3.0.1" +description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +optional = false +python-versions = ">=3.6" +files = [ + {file = "PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440"}, + {file = "pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928"}, +] + +[package.extras] +crypto = ["PyCryptodome"] +dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "wheel"] +docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] +full = ["Pillow", "PyCryptodome"] +image = ["Pillow"] + [[package]] name = "pypika" version = "0.48.9" @@ -5045,6 +5133,21 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-docx" +version = "1.1.2" +description = "Create, read, and update Microsoft Word .docx files." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "python_docx-1.1.2-py3-none-any.whl", hash = "sha256:08c20d6058916fb19853fcf080f7f42b6270d89eac9fa5f8c15f691c0017fabe"}, + {file = "python_docx-1.1.2.tar.gz", hash = "sha256:0cf1f22e95b9002addca7948e16f2cd7acdfd498047f1941ca5d293db7762efd"}, +] + +[package.dependencies] +lxml = ">=3.1.0" +typing-extensions = ">=4.9.0" + [[package]] name = "python-dotenv" version = "1.0.1" @@ -5073,6 +5176,22 @@ files = [ [package.extras] dev = ["atomicwrites (==1.4.1)", "attrs (==23.2.0)", "coverage (==7.4.1)", "hatch", "invoke (==2.2.0)", "more-itertools (==10.2.0)", "pbr (==6.0.0)", "pluggy (==1.4.0)", "py (==1.11.0)", "pytest (==8.0.0)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.2.0)", "pyyaml (==6.0.1)", "ruff (==0.2.1)"] +[[package]] +name = "python-pptx" +version = "0.6.23" +description = "Generate and manipulate Open XML PowerPoint (.pptx) files" +optional = false +python-versions = "*" +files = [ + {file = "python-pptx-0.6.23.tar.gz", hash = "sha256:587497ff28e779ab18dbb074f6d4052893c85dedc95ed75df319364f331fedee"}, + {file = "python_pptx-0.6.23-py3-none-any.whl", hash = "sha256:dd0527194627a2b7cc05f3ba23ecaa2d9a0d5ac9b6193a28ed1b7a716f4217d4"}, +] + +[package.dependencies] +lxml = ">=3.1.0" +Pillow = ">=3.3.2" +XlsxWriter = ">=0.5.7" + [[package]] name = "pytz" version = "2024.1" @@ -7026,6 +7145,33 @@ files = [ {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, ] +[[package]] +name = "xlrd" +version = "2.0.1" +description = "Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +files = [ + {file = "xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd"}, + {file = "xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88"}, +] + +[package.extras] +build = ["twine", "wheel"] +docs = ["sphinx"] +test = ["pytest", "pytest-cov"] + +[[package]] +name = "xlsxwriter" +version = "3.2.0" +description = "A Python module for creating Excel XLSX files." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"}, + {file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"}, +] + [[package]] name = "xxhash" version = "3.4.1" @@ -7327,4 +7473,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "56eda3b68982ff0d01c77ef11e591fb12c4d0b786357638ca65d5008f9b6d65a" +content-hash = "16ad3e107750c4b31eb1b181883aeb5e5efb59e96665d6ae2cca5df6043aae78" diff --git a/pyproject.toml b/pyproject.toml index dedde6eb197..c8b83833535 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,16 @@ boto3 = "*" minio = "^7.2.7" gevent = "^24.2.1" pyarrow = "16.1.0" # transitive dependency, pinned here to avoid conflicts +openai = "*" +python-docx = "*" +markdown = "*" +PyPDF2 = "*" +openpyxl = "*" +beautifulsoup4 = "*" +pylatexenc = "*" +python-pptx = "*" +xlrd = "*" +opencv-python = "*" [tool.poetry.group.llama-index.dependencies] llama-index = "*" diff --git a/tests/unit/multi_modal_files/utf16.txt b/tests/unit/multi_modal_files/utf16.txt new file mode 100644 index 00000000000..73f5158e38f Binary files /dev/null and b/tests/unit/multi_modal_files/utf16.txt differ diff --git a/tests/unit/multi_modal_files/utf8.txt b/tests/unit/multi_modal_files/utf8.txt new file mode 100644 index 00000000000..9fcd9bfc0c8 --- /dev/null +++ b/tests/unit/multi_modal_files/utf8.txt @@ -0,0 +1,20 @@ +English: Hello, World! +Japanese: こんにちは、世界! +Russian: Привет, мир! +Chinese: 你好,世界! +Korean: 안녕하세요, 세계! +Spanish: ¡Hola, mundo! +French: Bonjour, le monde! +German: Hallo, Welt! +Italian: Ciao, mondo! +Portuguese: Olá, mundo! +Arabic: مرحبا بالعالم! +Hebrew: שלום, עולם! +Hindi: नमस्ते, दुनिया! +Greek: Γειά σου, κόσμε! +Turkish: Merhaba, dünya! +Dutch: Hallo, wereld! +Swedish: Hej, världen! +Finnish: Hei, maailma! +Polish: Witaj, świecie! +Hungarian: Helló, világ! diff --git a/tests/unit/test_readers.py b/tests/unit/test_readers.py new file mode 100644 index 00000000000..67c1b68d137 --- /dev/null +++ b/tests/unit/test_readers.py @@ -0,0 +1,19 @@ +import os + +from opendevin.llm import readers + +files_dir = os.path.join( + os.path.dirname(os.path.abspath(__file__)), 'multi_modal_files' +) + + +def test_txt_reader(): + reader = readers.TXTReader() + + utf8_file_path = os.path.join(files_dir, 'utf8.txt') + with open(utf8_file_path, 'r', encoding='utf-8') as file: + assert reader.parse(utf8_file_path) == file.read() + + utf16_file_path = os.path.join(files_dir, 'utf16.txt') + with open(utf16_file_path, 'r', encoding='utf-16-be') as file: + assert reader.parse(utf16_file_path) == file.read()
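
A minimal usage sketch of the new readers module (illustrative only, not part of the diff; the temporary directory and the sample.json fixture are assumptions made for demonstration):

import tempfile
from pathlib import Path

from opendevin.llm.readers import FileReader, JSONReader

# Write a small JSON fixture to a temporary directory, then parse it with the
# dedicated reader and via FileReader, which dispatches on the file suffix
# through READER_MAP.
with tempfile.TemporaryDirectory() as tmp_dir:
    json_path = Path(tmp_dir) / 'sample.json'
    json_path.write_text('{"greeting": "Hello, World!"}', encoding='utf-8')

    print(JSONReader().parse(json_path))      # {'greeting': 'Hello, World!'}
    print(FileReader().read_file(json_path))  # same output, reader chosen by suffix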