This repository has been archived by the owner on Apr 22, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 12
/
gdocs_streams.py
124 lines (98 loc) · 4.45 KB
/
gdocs_streams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import base
import brewery.metadata as metadata
try:
    import gdata.spreadsheet.text_db
except Exception:
    # gdata is an optional dependency. Any failure while importing it is
    # tolerated (not only ImportError, so a broken install does not prevent this
    # module from loading), but KeyboardInterrupt/SystemExit are no longer
    # swallowed as they were with a bare ``except:``.
    # The name ``gdata`` is bound to a MissingPackage placeholder instead --
    # presumably it reports the missing package on first use (see
    # brewery.utils).
    from brewery.utils import MissingPackage
    gdata = MissingPackage("gdata", "Google data (spreadsheet) source/target")
# Documentation:
# http://gdata-python-client.googlecode.com/svn/trunk/pydocs/
class GoogleSpreadsheetDataSource(base.DataSource):
    """Reading data from a google spreadsheet.

    Some code taken from OKFN Swiss library.
    """

    def __init__(self, spreadsheet_key=None, spreadsheet_name=None,
                 worksheet_id=None, worksheet_name=None,
                 query_string="",
                 username=None, password=None):
        """Creates a Google Spreadsheet data source stream.

        :Attributes:
            * spreadsheet_key: The unique key for the spreadsheet, usually
              in the form 'pk23...We' or 'o23...423.12,,,3'.
            * spreadsheet_name: The title of the spreadsheets.
            * worksheet_id: ID of a worksheet
            * worksheet_name: name of a worksheet
            * query_string: optional query string for row selection
            * username: Google account user name
            * password: Google account password

        You should provide either spreadsheet_key or spreadsheet_name. If
        more than one spreadsheet with the given name is found, then the first
        in the list returned by Google is used.

        For worksheet selection you should provide either worksheet_id or
        worksheet_name. If more than one worksheet with the given name is
        found, then the first in the list returned by Google is used. If
        neither worksheet_id nor worksheet_name is provided, then the first
        worksheet in the workbook is used.

        For details on query string syntax see the section on sq under
        http://code.google.com/apis/spreadsheets/reference.html#list_Parameters

        No connection is made here; call `initialize()` before reading.
        """
        self.spreadsheet_key = spreadsheet_key
        self.spreadsheet_name = spreadsheet_name
        self.worksheet_id = worksheet_id
        self.worksheet_name = worksheet_name
        self.query_string = query_string
        self.username = username
        self.password = password

        self.client = None
        # Set up front so rows()/records() can detect an uninitialized stream
        # instead of failing with AttributeError.
        self.worksheet = None
        self._fields = None

    def initialize(self):
        """Connect to the Google documents, authenticate.

        Creates the text_db client with the stored credentials, selects the
        first matching spreadsheet and worksheet, and builds the field list
        from the worksheet fields.

        Raises `Exception` when no spreadsheet matches and `IndexError` when
        the spreadsheet has no matching worksheet.
        """
        self.client = gdata.spreadsheet.text_db.DatabaseClient(
            username=self.username, password=self.password)

        dbs = self.client.GetDatabases(spreadsheet_key=self.spreadsheet_key,
                                       name=self.spreadsheet_name)
        if not dbs:
            raise Exception("No spreadsheets with key '%s' or name '%s'" %
                            (self.spreadsheet_key, self.spreadsheet_name))

        # Several spreadsheets may match; the first one returned is used.
        db = dbs[0]
        worksheets = db.GetTables(worksheet_id=self.worksheet_id,
                                  name=self.worksheet_name)
        if not worksheets:
            # IndexError is what indexing the empty list used to raise; the
            # type is kept for existing handlers, now with a useful message.
            raise IndexError("No worksheets with id '%s' or name '%s'" %
                             (self.worksheet_id, self.worksheet_name))

        self.worksheet = worksheets[0]
        self.worksheet.LookupFields()

        # FIXME: try to determine field types from next row
        self._fields = metadata.FieldList(self.worksheet.fields)

    def _record_iterator(self):
        """Return an iterator over worksheet records matching query_string.

        Raises `RuntimeError` if `initialize()` has not been called yet.
        """
        if self.worksheet is None:
            raise RuntimeError("Stream is not initialized (no worksheet)")
        return iter(self.worksheet.FindRecords(self.query_string))

    def rows(self):
        """Return an iterator of rows, each a list of values in field order."""
        iterator = self._record_iterator()
        return GDocRowIterator(self.fields.names(), iterator)

    def records(self):
        """Return an iterator of records, each the content of one record."""
        iterator = self._record_iterator()
        return GDocRecordIterator(self.fields.names(), iterator)
class GDocRowIterator(object):
    """
    Iterator that returns each record as a list of values, ordered by the
    given field names.
    """

    def __init__(self, field_names, iterator):
        """Create the row iterator.

        :Attributes:
            * field_names: names of the fields to pick from each record, in
              output order
            * iterator: iterator over record objects; each record must have a
              `content` attribute that can be indexed by field name
        """
        self.iterator = iterator
        self.field_names = field_names

    def __iter__(self):
        return self

    def next(self):
        """Return the next row as a list of values.

        `StopIteration` from the wrapped iterator is propagated when the
        records are exhausted. A field name missing from a record's content
        raises whatever the content's item lookup raises (`KeyError` for a
        dict).
        """
        # The builtin next() drives both Python 2 (.next) and Python 3
        # (.__next__) iterators.
        record = next(self.iterator)
        content = record.content
        return [content[field] for field in self.field_names]

    # Python 3 name of the iterator protocol method.
    __next__ = next
class GDocRecordIterator(object):
    """
    Iterator that returns records as dict objects
    """

    def __init__(self, field_names, iterator):
        """Create the record iterator.

        :Attributes:
            * field_names: names of the fields; stored on the instance but not
              used to filter the returned content
            * iterator: iterator over record objects that have a `content`
              attribute
        """
        self.iterator = iterator
        self.field_names = field_names

    def __iter__(self):
        return self

    def next(self):
        """Return the `content` of the next record.

        `StopIteration` from the wrapped iterator is propagated when the
        records are exhausted.
        """
        # The builtin next() drives both Python 2 (.next) and Python 3
        # (.__next__) iterators.
        record = next(self.iterator)
        return record.content

    # Python 3 name of the iterator protocol method.
    __next__ = next