<a href="https://colab.research.google.com/github/Walter-Haydock/OpenAI/blob/main/Google_Drive_ChatGPT_Query_from_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This script borrows heavily from this article: https://vasos-koupparis.com/build-a-chatbot-based-on-your-own-documents-with-chatgpt-step-by-step-guide/

In [1]:
# If you are in Google Colab, this will check to see if you have the right libraries installed. If not, they will be installed for you.
try:
  from llama_index import GPTSimpleVectorIndex, Document, SimpleDirectoryReader
except:
  !pip install llama-index
  from llama_index import GPTSimpleVectorIndex, Document, SimpleDirectoryReader
  
try:
  import openai
except:
  !pip install openai
  import openai

try:
  import docx
except:
  !pip install python-docx
  import docx  

In [2]:
import os
from getpass import getpass

# Recommend using GPT-4 API Key, if available.
# The key is read from the environment when already set; otherwise it is
# prompted for without echo, so the secret is never typed into (and saved
# with) the notebook source.
if not os.environ.get('OPENAI_API_KEY'):
  os.environ['OPENAI_API_KEY'] = getpass('Enter your OpenAI API key: ')

In [3]:
# Clearing out any files that exist and have the same names, so the upload
# and copy steps below always work with fresh credential files.
for stale_name in ('credentials.json', 'client_secrets.json'):
  try:
    os.remove(stale_name)
  except FileNotFoundError:
    pass  # Nothing to clean up for this name.
  except OSError as error:
    # Still best-effort, but make the failure visible instead of hiding it.
    print(f'Could not remove {stale_name}: {error}')

In [18]:
# Import Google Cloud credentials.
# Read this document for how to create a "Desktop App" OAuth Client ID: https://developers.google.com/workspace/guides/create-credentials#desktop-app
# When the "Choose Files" button appears, upload credentials.json from your desktop.
from google.colab import files

# Bind the result to a name: as a bare last expression the returned mapping
# (file name -> file bytes) would be echoed into the cell output, which would
# store the OAuth client secret in the saved notebook.
uploaded_files = files.upload()

# The copy step that follows expects exactly this file name.
if 'credentials.json' not in uploaded_files:
  print(f"Expected an upload named 'credentials.json' but got: {sorted(uploaded_files)}")

In [5]:
# Create a copy of the uploaded credentials.json under the name client_secrets.json,
# so the same OAuth client file is available under both file names.
# NOTE(review): presumably one of the Google client libraries looks for
# client_secrets.json -- TODO confirm which consumer needs it.
!cp credentials.json client_secrets.json

In [6]:
# Read these docs for how to get your folder id: https://llamahub.ai/l/google_drive
# Replace the placeholder below with your folder id as a quoted string.
# Left unchanged, the bare placeholder name raises NameError when this cell runs.
# The value is later passed to load_data(folder_id=...).
google_drive_folder_id = REPLACE_WITH_GOOGLE_DRIVE_FOLDER_ID_IN_SINGLE_QUOTATION_MARKS

In [7]:
# Fetch the "GoogleDriveReader" loader class through llama_index's download_loader
# and create an instance of it.
# NOTE(review): both names bound here are rebound later in this notebook -- the
# `class GoogleDriveReader` definition replaces this class, and a later
# `loader = GoogleDriveReader()` replaces this instance -- so this downloaded
# loader is never the one used to load data in the cells shown.
from llama_index import download_loader
GoogleDriveReader = download_loader("GoogleDriveReader")
loader = GoogleDriveReader()

In [8]:
# This cell defines a custom GoogleDriveReader that extracts text only from Google Docs and Microsoft Word (.docx) files

from google.colab import auth
from oauth2client.client import GoogleCredentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import io
from googleapiclient.http import MediaIoBaseDownload

class GoogleDriveReader:
    """Read the text of Google Docs and Microsoft .docx files from Google Drive.

    Authentication uses the interactive Colab flow (``auth.authenticate_user``),
    so instances can only be created inside Google Colab.
    """

    # MIME types this reader knows how to turn into text.
    _GOOGLE_DOC_MIME = 'application/vnd.google-apps.document'
    _DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

    def __init__(self):
        """Authenticate the Colab user and build a Drive v3 API client."""
        auth.authenticate_user()
        self.creds = GoogleCredentials.get_application_default()
        self.service = build('drive', 'v3', credentials=self.creds)

    def load_data(self, folder_id=None, file_ids=None):
        """Load documents from a Drive folder or from explicit file ids.

        Args:
            folder_id: Id of a Drive folder. Every Google Doc and .docx file
                directly inside it is loaded. Takes precedence over
                ``file_ids`` when both are given.
            file_ids: Iterable of Drive file ids to load individually.

        Returns:
            A list of ``{"name": <file name>, "content": <text>}`` dicts
            (empty when neither argument is given), or ``None`` if the Drive
            API raised an ``HttpError`` (the error is printed).
        """
        documents = []

        try:
            if folder_id:
                query = (
                    f"'{folder_id}' in parents and "
                    f"(mimeType='{self._GOOGLE_DOC_MIME}' or mimeType='{self._DOCX_MIME}')"
                )
                page_token = None
                # Drive returns listings in pages; keep requesting until no
                # nextPageToken comes back so large folders are fully loaded.
                while True:
                    results = self.service.files().list(
                        q=query,
                        fields="nextPageToken, files(id, name, mimeType)",
                        pageToken=page_token,
                    ).execute()
                    for item in results.get('files', []):
                        documents.append(
                            self._read_file(item['id'], item['name'], item['mimeType'])
                        )
                    page_token = results.get('nextPageToken')
                    if not page_token:
                        break

            elif file_ids:
                for file_id in file_ids:
                    file = self.service.files().get(fileId=file_id).execute()
                    documents.append(
                        self._read_file(file_id, file['name'], file['mimeType'])
                    )

        except HttpError as error:
            print(f'An error occurred: {error}')
            return None

        return documents

    def _read_file(self, file_id, file_name, mime_type):
        """Fetch one Drive file and return its ``{"name", "content"}`` record."""
        if mime_type == self._GOOGLE_DOC_MIME:
            # Native Google Docs are fetched through export(), here as plain text.
            exported = self.service.files().export(fileId=file_id, mimeType='text/plain').execute()
            content = self._decode_text(exported)
        else:
            buffer = io.BytesIO()
            request = self.service.files().get_media(fileId=file_id)
            downloader = MediaIoBaseDownload(buffer, request)
            done = False
            while not done:
                _, done = downloader.next_chunk()
            buffer.seek(0)

            # Detect Word files by MIME type as well as by extension, so a
            # .docx stored without the suffix is not decoded as UTF-8 text.
            if mime_type == self._DOCX_MIME or file_name.endswith('.docx'):
                word_doc = docx.Document(buffer)
                content = "\n".join(paragraph.text for paragraph in word_doc.paragraphs)
            else:
                content = self._decode_text(buffer.read())

        return {"name": file_name, "content": content}

    @staticmethod
    def _decode_text(raw):
        """Decode downloaded bytes as UTF-8, dropping a leading BOM if present."""
        if isinstance(raw, str):
            return raw
        # 'utf-8-sig' behaves like 'utf-8' but strips a byte-order mark when
        # one is present at the start of the data.
        return raw.decode('utf-8-sig')

In [9]:
# Creating the reader runs the Colab authentication flow (see its __init__).
loader = GoogleDriveReader()

# Load the folder by id.
# load_data returns a list of {"name", "content"} dicts, or None when the
# Drive API raised an HttpError.
loaded_documents = loader.load_data(folder_id=google_drive_folder_id)

In [10]:
# load_data() returns None when the Drive API raised an HttpError; fail with a
# clear message here instead of a TypeError from iterating over None.
if loaded_documents is None:
  raise RuntimeError("Loading from Google Drive failed; see the error printed by load_data() and re-run that cell.")

# Wrap each file's text in a llama_index Document (the file name is not carried over).
documents = [Document(doc["content"]) for doc in loaded_documents]

In [11]:
# Build the vector index from the Document objects created from the Drive files.
index = GPTSimpleVectorIndex.from_documents(documents)

In [12]:
# Round-trip the index through a JSON file: write it out, then rebind `index`
# to the copy read back from that same file.
INDEX_FILE = 'index.json'

index.save_to_disk(INDEX_FILE)
index = GPTSimpleVectorIndex.load_from_disk(INDEX_FILE)

In [17]:
# Querying the index.
# The question text is sent to the model as-is, so the spelling matters
# ("peice" corrected to "piece").
question = "What is the most interesting piece of information in this folder?"
response = index.query(question)
print(response)