In [2]:
dir()

['In',
 'Out',
 'SQLContext',
 'SparkConf',
 'SparkContext',
 'SparkSession',
 '_',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_i',
 '_i1',
 '_i2',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 '_pythonstartup',
 'atexit',
 'conf',
 'exit',
 'get_ipython',
 'os',
 'platform',
 'py4j',
 'pyspark',
 'quit',
 'sc',
 'spark',
 'sql',
 'sqlContext',
 'sqlCtx',

In [205]:
import time                                                
import re
import numpy as np

In [65]:
def timeit(method):

    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()

        print('%r (%r, %r) %2.2f sec' % \
              (method.__name__, args, kw, te-ts))
        return result

    return timed

In [234]:
train_rdd = sc.textFile('gs://lbanor/pyspark/train_query*.gz')

In [235]:
train_rdd.take(3)

['fv,sku,score',
 '3383270414872112082,MO578SHF77RTI,0.5',
 '7143168022217708588,DA923SHF54UJP,0.5']

In [60]:
valid_rdd = sc.textFile('gs://lbanor/pyspark/validation_query*.gz')

In [139]:
test_rdd = sc.textFile('gs://lbanor/pyspark/test_query*.gz')

In [14]:
train_df = spark.read.csv('gs://lbanor/pyspark/train_query*.gz', header=True)

In [150]:
valid_df = spark.read.csv('gs://lbanor/pyspark/validation_query*.gz', header=True)

In [152]:
valid_df.createGlobalTempView('valid_data')

In [151]:
test_df = spark.read.csv('gs://lbanor/pyspark/test_query*.gz', header=True)

In [154]:
test_df.createGlobalTempView('test_data')

AnalysisException: "Temporary table 'test_data' already exists;"

In [25]:
@timeit
def create_global(df, name):
    df.createGlobalTempView(name)

In [24]:
create_global(train_df, 'test4')

'create_global' ((DataFrame[fv: string, sku: string, score: string], 'test4'), {}) 2.01 sec


In [77]:
d2 = train_rdd.map(lambda x: x.split(',')).map(lambda x: re.sub('-\d+$', '', x[1])).distinct()

In [78]:
d2 = d2.zipWithIndex()

[('FI911SHF30RVT', 0), ('BO185SHF82JEJ', 1), ('IS227SHF92EKT', 2)]

In [92]:
final_train_rdd = train_rdd.join(d2)

In [93]:
final_train_rdd.take(3)

[]

In [87]:
#d = spark.sql("SELECT regexp_replace(sku, '-\d+$', ''), row_number() over(ORDER BY 1) FROM global_temp.test4 group by sku")
d = spark.sql("SELECT regexp_replace(sku, '-\d+$', ''), sum(1) over(ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM global_temp.test4 group by sku")

In [112]:
query = """
SELECT
a.fv fv,
b.idx idx,
a.score score
FROM(
SELECT
 *
FROM  global_temp.test4
) a
JOIN(
SELECT
  regexp_replace(sku, '-\d+$', '') sku,
  sum(1) over(ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) idx
FROM global_temp.test4
GROUP BY sku
) b
ON a.sku = b.sku
"""

In [136]:
query = """
SELECT
b.idx fv,
c.idx sku,
a.score score
FROM(
SELECT
 *
FROM  global_temp.%s
) a
JOIN(
SELECT
  fv,
  idx
FROM global_temp.users_idx
) b
ON a.fv = b.fv
JOIN(
SELECT
sku,
idx
FROM global_temp.sku_idx
) c
ON c.sku = a.sku
"""

In [120]:
users_query = \
"""
SELECT
fv,
sum(1) over(ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) idx
FROM global_temp.test4 group by fv
"""

In [122]:
users_df = spark.sql(users_query)
users_df.createGlobalTempView('users_idx')

In [146]:
users_df.toPandas?

In [125]:
sku_query = \
"""
SELECT
sku,
sum(1) over(ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) idx
FROM global_temp.test4 group by sku
"""

In [126]:
sku_df = spark.sql(sku_query)
sku_df.createGlobalTempView('sku_idx')

In [134]:
train_data = spark.sql(query)

In [149]:
train_data.select('fv', 'sku').take(3)

[Row(fv=105748, sku=1511), Row(fv=222145, sku=1662), Row(fv=264737, sku=1662)]

In [156]:
valid_data = spark.sql(query %('valid_data'))

In [157]:
valid_data.take(3)

[Row(fv=646783, sku=86158, score='1'),
 Row(fv=943352, sku=86309, score='1'),
 Row(fv=679390, sku=86056, score='0.5')]

In [158]:
test_data = spark.sql(query %('test_data'))

In [164]:
test_data.rdd

TypeError: 'RDD' object is not callable

In [79]:
d2.take(3)

[('FI911SHF30RVT', 0), ('BO185SHF82JEJ', 1), ('IS227SHF92EKT', 2)]

In [89]:
%timeit d.take(3)

8.21 s ± 350 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [52]:
%timeit d2.take(3)

68.6 ms ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
train_df.take(3)

[Row(_c0='fv', _c1='sku', _c2='score'),
 Row(_c0='3383270414872112082', _c1='MO578SHF77RTI', _c2='0.5'),
 Row(_c0='7143168022217708588', _c1='DA923SHF54UJP', _c2='0.5')]

In [59]:
train_rdd.take(5)

['fv,sku,score',
 '3383270414872112082,MO578SHF77RTI,0.5',
 '7143168022217708588,DA923SHF54UJP,0.5',
 '8844960186636261737,LU621ACM67NYU,0.5',
 '4982876416707808713,NE184SCM38ZOR,1']

In [61]:
valid_rdd.take(4)

['fv,sku,score',
 '8904296658784430607,BE139SHF93GPE,0.5',
 '8387443859206657573,FA258SHF63HTC,3',
 '3603778167605721116,HD124SRM40WZB,4']

In [202]:
cocu = sc.textFile('gs://lbanor/1_0_00000000000561840423.gz')

In [203]:
cocu.count()

139201

In [236]:
cocu.take(10)

['{"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"7645041da17c1b1ed4e86a20714e9abf","type":"bid"},"djUCID":{"value":"d4e22d11a6905a8c","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 4.4.2; LG-D855 Build/KVT49L) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36","os":"Linux armv7l","origin":"web"},"source":{"tracker":"fish","url":"m.dafiti.com.br/?placeholder\\u0026gclid=CMuSis3z0tUCFRMIkQodRP8GfA","url_referrer":"www.google.com.br/"},"created_at":1502582400031,"local_timestamp":1502582399182,"type":"homepageview","details":{}},"created_at":1502582400031}',
 '{"event":{"schema_version":1,"user":{"location":{}},"identifiers":{"bid":{"value":"dcb7b9b540188da2ef245e15785d2ecb","type":"bid"},"djUCID":{"value":"25e35a54c8cace51","type":"djUCID"}},"device":{"client":"Mozilla/5.0 (Linux; Android 4.4.4; SM-G530BT Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36","os

In [69]:
re.sub?

In [87]:
train_data = train_rdd.filter(lambda x: x != 'fv,sku,score').map(lambda row: row.split(',')).map(lambda e: (e[0], e[1], float(e[2]))).cache()

In [70]:
valid_data = valid_rdd.filter(lambda x: x != 'fv,sku,score').map(lambda row: row.split(',')).map(lambda e: (e[0], e[1], e[2])).cache()

In [72]:
test_data = test_rdd.filter(lambda x: x != 'fv,sku,score').map(lambda row: row.split(',')).map(lambda e: (e[0], e[1], e[2])).cache()

In [155]:
train_data.take(3)

[Row(fv=813424, sku=1511, score='1'),
 Row(fv=1017617, sku=1662, score='0.5'),
 Row(fv=392066, sku=1662, score='1')]

In [71]:
valid_data.take(3)

[('8904296658784430607', 'BE139SHF93GPE', '0.5'),
 ('8387443859206657573', 'FA258SHF63HTC', '3'),
 ('3603778167605721116', 'HD124SRM40WZB', '4')]

In [73]:
test_data.take(3)

[('3980302175032894809', 'CI369APM10NSB', '0.5'),
 ('2538539940766825016', 'VI618SHF96VTV', '0.5'),
 ('5159285433814597554', 'DA923SHF01ZNY', '0.5')]

In [76]:
valid_to_predict = valid_data.map(lambda x: (x[0], x[1]))

In [77]:
valid_to_predict.take(3)

[('8904296658784430607', 'BE139SHF93GPE'),
 ('8387443859206657573', 'FA258SHF63HTC'),
 ('3603778167605721116', 'HD124SRM40WZB')]

In [78]:
test_to_predict = test_data.map(lambda x: (x[0], x[1]))

In [79]:
test_to_predict.take(3)

[('3980302175032894809', 'CI369APM10NSB'),
 ('2538539940766825016', 'VI618SHF96VTV'),
 ('5159285433814597554', 'DA923SHF01ZNY')]

In [132]:
from pyspark.mllib.recommendation import ALS
import math

In [162]:
model = ALS.train?

In [None]:
model = ALS.train

In [167]:
best_rank = -1
best_iteration = -1

for rank in [4, 6, 10]:
    model = ALS.train(train_data, rank, seed=5, iterations=5, lambda_=0.1)
    predictions = model.predictAll(valid_data.select('fv', 'sku').rdd).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = valid_data.rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
    print(error)

1.6183618802967397
1.7074869368155823
1.4520415029913245


In [169]:
predictions.take(3)

[((903500, 320127), -0.297609535386314),
 ((903500, 174229), -0.10394798458488858),
 ((903500, 337057), 0.03640408823616548)]

In [105]:
t1 = sc.parallelize([('a', 'a', 1), ('b', 'c', 2)])
t2 = sc.parallelize([('b', 3)])

In [108]:
r = t1.union?

In [None]:
r = t1.union

[('b', ('c', 3))]

In [168]:
import sys
sys.maxsize

9223372036854775807

In [187]:
t1 = sc.parallelize([('a', 1), ('b', 2)]).toDF(['name', 'value'])

In [181]:
t1.toDF?

In [None]:
t1 = sc.parallelize([('a', 1), ('b', 2)]).toDF

In [188]:
t1.createOrReplaceTempView('t2')

In [189]:
r = spark.sql('select * from t2')

In [190]:
r.take(3)

[Row(name='a', value=1), Row(name='b', value=2)]

In [191]:
r = spark.sql('select array(name) from t2')

In [192]:
r.take(3)

[Row(array(name)=['a']), Row(array(name)=['b'])]

In [194]:
sc = SparkContext.getOrCreate()

In [197]:
jvm = sc._jvm

In [198]:
o = jvm.PythonMLLibAPI()

In [200]:
m = o.trainALSModel?

In [None]:
m = o.trainALSModel

In [232]:
a = np.array([[1], [2], [3]])
b = np.array([1, 2, 3])

In [233]:
n = np.multiply(a, b)
print(n)

[[1 2 3]
 [2 4 6]
 [3 6 9]]


In [227]:
np.dot?