In [1]:
from tools.imports import *

In [11]:
import os
import json
import pickle
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from dotenv import load_dotenv

# Load variables from a local .env file into the environment; CREDENTIALS_PATH
# is read from the environment when the OAuth flow runs.
load_dotenv()

# OAuth scope requested during authorization: read-only Drive file metadata.
SCOPES = [
    'https://www.googleapis.com/auth/drive.metadata.readonly',
]

# Function to authenticate and get the service
def authenticate_with_oauth():
    """Build an authenticated Google Drive v3 service client.

    Cached credentials are loaded from ``token.pickle`` in the current working
    directory when that file exists. Expired credentials that carry a refresh
    token are refreshed; otherwise the interactive installed-app OAuth flow is
    run with the client-secrets file named by the ``CREDENTIALS_PATH``
    environment variable. Refreshed or newly obtained credentials are written
    back to ``token.pickle``.

    Returns:
        The object returned by ``build('drive', 'v3', credentials=creds)``.

    Raises:
        RuntimeError: If the OAuth flow has to run but ``CREDENTIALS_PATH`` is
            unset or empty.
    """
    token_path = 'token.pickle'
    creds = None
    if os.path.exists(token_path):
        # SECURITY: unpickling can execute arbitrary code. Only load a token
        # file that this notebook wrote itself; never one from another source.
        with open(token_path, 'rb') as token:
            creds = pickle.load(token)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            secrets_path = os.getenv('CREDENTIALS_PATH')
            if not secrets_path:
                # Fail with an actionable message instead of passing None on
                # to the OAuth library.
                raise RuntimeError(
                    "CREDENTIALS_PATH is not set; point it at the OAuth "
                    "client-secrets JSON file (for example in your .env file)."
                )
            flow = InstalledAppFlow.from_client_secrets_file(secrets_path, SCOPES)
            # port=0 lets the OS pick a free port for the local redirect server.
            creds = flow.run_local_server(port=0)
        # Cache the credentials so later runs can skip the browser round trip.
        with open(token_path, 'wb') as token:
            pickle.dump(creds, token)

    return build('drive', 'v3', credentials=creds)

# Function to list files and write to intermediate files
def list_files_and_write(service, output_dir, batch_size=100):
    """List Drive files and dump their metadata to numbered JSON batch files.

    Pages through ``service.files().list(...)`` with the query
    ``'me' in owners`` and writes the returned records to
    ``<output_dir>/batch_<n>.json``, ``batch_size`` records per file (the last
    file may hold fewer).

    Batch files left over from an earlier run are overwritten when the same
    batch number is produced again, but are not removed otherwise.

    Args:
        service: Object exposing ``files().list(**kwargs).execute()`` that
            returns a dict with ``files`` and, optionally, ``nextPageToken``.
        output_dir: Directory for the batch files; created if missing.
        batch_size: Number of records per batch file.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)

    page_token = None
    batch_num = 0
    batch_data = []

    while True:
        response = service.files().list(
            q="'me' in owners",
            spaces='drive',
            fields='nextPageToken, files(id, name, mimeType, createdTime, modifiedTime)',
            pageToken=page_token
        ).execute()

        # An empty page is not treated as the end of the listing: only a
        # missing nextPageToken (checked below) stops the loop.
        for item in response.get('files', []):
            batch_data.append(item)
            if len(batch_data) >= batch_size:
                _write_batch(output_dir, batch_num, batch_data)
                batch_num += 1
                batch_data = []

        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break

    # Flush the final, partially filled batch.
    if batch_data:
        _write_batch(output_dir, batch_num, batch_data)


def _write_batch(output_dir, batch_num, batch_data):
    """Write one batch of records to ``batch_<batch_num>.json`` in ``output_dir``."""
    batch_file = os.path.join(output_dir, f'batch_{batch_num}.json')
    with open(batch_file, 'w') as f:
        json.dump(batch_data, f, indent=4)

# Directory that receives the raw batch_<n>.json listing files
output_dir = 'drive_batches'

# Obtain an authorized Drive client, then dump the file listing in batches
service = authenticate_with_oauth()
list_files_and_write(service, output_dir)


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=291175256673-gr5p5vf3pi2h0m46h5qnd3ila4iitfqs.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A59880%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.metadata.readonly&state=rOPvALsdlbiKk0qE7k7JsngsKeoRdX&access_type=offline


In [12]:
from collections import defaultdict
import os
import json

def map_function(batch_file, output_dir):
    """Map step: group one batch file's records by creation date.

    Reads the JSON list stored in ``batch_file``, buckets each record by the
    first 10 characters of its ``createdTime`` value, and appends every bucket
    as a single JSON line to ``<output_dir>/<date>.json``.

    The per-date files are opened in append mode so that several batch files
    can contribute to the same date. As a consequence, mapping the same batch
    file twice writes its records twice.

    Args:
        batch_file: Path to a JSON file holding a list of dicts with the keys
            ``id``, ``name``, ``mimeType``, ``createdTime`` and
            ``modifiedTime``.
        output_dir: Directory for the per-date files; created if missing.

    Raises:
        KeyError: If a record lacks one of the keys listed above.
    """
    with open(batch_file, 'r') as f:
        items = json.load(f)

    # Create the target directory so the function also works on a fresh
    # output location.
    os.makedirs(output_dir, exist_ok=True)

    mapped_data = defaultdict(list)
    for item in items:
        created_time = item['createdTime']
        file_info = {
            'id': item['id'],
            'name': item['name'],
            'type': item['mimeType'],
            'created_time': created_time,
            'modified_time': item['modifiedTime']
        }
        # Key on the leading 10 characters of the timestamp (its date part).
        mapped_data[created_time[:10]].append(file_info)

    for date, files in mapped_data.items():
        output_file = os.path.join(output_dir, f'{date}.json')
        with open(output_file, 'a') as f:
            json.dump(files, f)
            f.write("\n")  # One JSON array per line, one line per mapped batch

map_output_dir = 'mapped_batches'
os.makedirs(map_output_dir, exist_ok=True)

# map_function appends to the per-date files, so remove the output of any
# previous run first; otherwise re-running this cell duplicates every record.
for stale_file in os.listdir(map_output_dir):
    stale_path = os.path.join(map_output_dir, stale_file)
    if stale_file.endswith('.json') and os.path.isfile(stale_path):
        os.remove(stale_path)

# Map only the batch JSON files; anything else in the directory (for example
# editor checkpoint folders) is ignored. sorted() fixes the processing order.
for batch_file in sorted(os.listdir(output_dir)):
    if batch_file.endswith('.json'):
        map_function(os.path.join(output_dir, batch_file), map_output_dir)


In [13]:
def reduce_function(input_dir, output_dir):
    """Reduce step: merge mapped JSON-lines files into one JSON array per date.

    Every ``*.json`` file in ``input_dir`` is read line by line; each
    non-blank line must be a JSON list of records carrying ``id`` and
    ``created_time``. Records are grouped by the first 10 characters of
    ``created_time`` and written to ``<output_dir>/final_<date>.json``.

    Records sharing an ``id`` within one date are collapsed into one entry:
    the entry keeps the position of the first occurrence and the content of
    the last one. This removes duplicates produced when the same batch was
    mapped more than once.

    Args:
        input_dir: Directory holding the mapped per-date files.
        output_dir: Directory for the final files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    for map_file in os.listdir(input_dir):
        map_path = os.path.join(input_dir, map_file)
        # Ignore sub-directories and stray non-JSON files instead of crashing.
        if not (map_file.endswith('.json') and os.path.isfile(map_path)):
            continue

        # date -> {record id -> record}
        aggregated_data = defaultdict(dict)
        with open(map_path, 'r') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines between batches
                for item in json.loads(line):
                    aggregated_data[item['created_time'][:10]][item['id']] = item

        # Write aggregated data to final file
        for date, files_by_id in aggregated_data.items():
            output_file = os.path.join(output_dir, f'final_{date}.json')
            with open(output_file, 'w') as f:
                json.dump(list(files_by_id.values()), f, indent=4)

# Collapse the per-date map output into one final_<date>.json file per date
reduce_output_dir = 'final_aggregated'
reduce_function(input_dir=map_output_dir, output_dir=reduce_output_dir)

In [3]:
import os
import json

def search_item_in_reduced_files(reduce_output_dir, item_name):
    """Find records whose name equals ``item_name``, ignoring case.

    Scans every ``*.json`` file in ``reduce_output_dir``; each file must hold
    a JSON list of dicts with ``id`` and ``name`` keys. Only the first record
    seen for a given ``id`` is kept. A summary of the matches is printed.

    Args:
        reduce_output_dir: Directory containing the reduced JSON files.
        item_name: Name to look for (compared case-insensitively).

    Returns:
        List of the matching records, one per distinct ``id``.
    """
    first_match_by_id = {}

    json_names = [name for name in os.listdir(reduce_output_dir) if name.endswith('.json')]
    for json_name in json_names:
        with open(os.path.join(reduce_output_dir, json_name), 'r') as handle:
            records = json.load(handle)
        for record in records:
            if record['name'].lower() != item_name.lower():
                continue
            # setdefault keeps the earliest record for an id and skips repeats.
            first_match_by_id.setdefault(record['id'], record)

    folder_items = list(first_match_by_id.values())

    if folder_items:
        print(f"Found folder(s) named '{item_name}':")
        for record in folder_items:
            print(f"Name: {record['name']}, ID: {record['id']}, Created Time: {record['created_time']}, Modified Time: {record['modified_time']}")
    else:
        print(f"No folder named '{item_name}' found.")
    return folder_items

# Look up every record named 'a' in the reduced output files; the returned
# list is this cell's displayed value.
reduce_output_dir = 'final_aggregated'
item_name = 'a'
search_item_in_reduced_files(reduce_output_dir=reduce_output_dir, item_name=item_name)


Found folder(s) named 'a':
Name: a, ID: 1kIVSZss1hp9kHhQJmmCozYYTayLH6CIa, Created Time: 2023-12-14T23:27:40.417Z, Modified Time: 2023-12-14T23:27:40.417Z


[{'id': '1kIVSZss1hp9kHhQJmmCozYYTayLH6CIa',
  'name': 'a',
  'type': 'application/vnd.google-apps.folder',
  'created_time': '2023-12-14T23:27:40.417Z',
  'modified_time': '2023-12-14T23:27:40.417Z'}]

In [1]:
import os
import json
from collections import Counter

def count_drive_items(batch_dir):
    """Count the records in a directory of batch JSON files, grouped by MIME type.

    Every ``*.json`` file in ``batch_dir`` must hold a JSON list of dicts with
    a ``mimeType`` key. The per-type totals are printed and returned.

    Args:
        batch_dir: Directory containing the batch files.

    Returns:
        collections.Counter mapping each ``mimeType`` value to its count.
    """
    item_counts = Counter()

    for batch_file in os.listdir(batch_dir):
        if not batch_file.endswith('.json'):
            continue
        with open(os.path.join(batch_dir, batch_file), 'r') as f:
            items = json.load(f)
        item_counts.update(item['mimeType'] for item in items)

    # Name the directory that was actually scanned rather than a fixed path.
    print(f"Total counts of items in '{batch_dir}':")
    for item_type, count in item_counts.items():
        print(f"{item_type}: {count}")

    return item_counts

# Tally MIME types across the raw listing batches; the returned Counter is
# this cell's displayed value.
batch_dir = 'drive_batches'
count_drive_items(batch_dir=batch_dir)


Total counts of items in 'drive_batches':
application/vnd.google-apps.spreadsheet: 4
application/vnd.google-apps.document: 57
application/vnd.google-apps.folder: 95
application/pdf: 30
audio/mpeg: 1003
application/json: 6
application/vnd.google.colaboratory: 10
image/jpeg: 6
application/vnd.openxmlformats-officedocument.wordprocessingml.document: 2
video/mp4: 1
application/octet-stream: 11263
application/x-ipynb+json: 12
text/plain: 22
application/xml: 1
text/xml: 1
application/vnd.palm: 5
application/x-dosexec: 4
image/heif: 18
application/x-zip-compressed: 10


Counter({'application/octet-stream': 11263,
         'audio/mpeg': 1003,
         'application/vnd.google-apps.folder': 95,
         'application/vnd.google-apps.document': 57,
         'application/pdf': 30,
         'text/plain': 22,
         'image/heif': 18,
         'application/x-ipynb+json': 12,
         'application/vnd.google.colaboratory': 10,
         'application/x-zip-compressed': 10,
         'application/json': 6,
         'image/jpeg': 6,
         'application/vnd.palm': 5,
         'application/vnd.google-apps.spreadsheet': 4,
         'application/x-dosexec': 4,
         'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 2,
         'video/mp4': 1,
         'application/xml': 1,
         'text/xml': 1})