In [None]:
!pip install pyspark
!pip install findspark

In [1]:
import os
import gc
import sys
import pandas as pd
os.environ['PYSPARK_PYTHON'] = sys.executable 
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ["PYARROW_IGNORE_TIMEZONE"] = '1' 

import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import IntegerType,StructField,StructType,StringType,LongType,MapType
import pyspark.pandas as ps
ps.options.compute.ops_on_diff_frames = True


# Iniciamos sesión de spark.
# SparkSession.stop(spk)
spk = SparkSession.builder.appName("PySpark Transformations to Populate our Data Warehouse").master("local[1]").getOrCreate()
spk

In [14]:
df = pd.read_json('../data/Google Maps/estados/review-Texas/11.json',lines=True)
df.dtypes, df.count()

(user_id    float64
 name        object
 time         int64
 rating       int64
 text        object
 pics        object
 resp        object
 gmap_id     object
 dtype: object,
 user_id    150000
 name       150000
 time       150000
 rating     150000
 text        88255
 pics         4637
 resp        22406
 gmap_id    150000
 dtype: int64)

In [15]:
psdf = ps.read_json('../data/Google Maps/estados/review-Texas/11.json',index_col='gmap_id').reset_index(drop=True)
psdf.dtypes, psdf.count()

(name       object
 pics       object
 rating      int64
 resp       object
 text       object
 time        int64
 user_id    object
 dtype: object,
 name       150000
 pics         4637
 rating     150000
 resp        22406
 text        88255
 time       150000
 user_id    150000
 dtype: int64)

In [24]:
schema = StructType([
    StructField('user_id',StringType(),False),
    StructField('name',StringType(),True),
    StructField('time',LongType(),True),
    StructField('rating',IntegerType(),True),
    StructField('text',StringType(),True),
    StructField('resp',MapType(StringType(),StringType()),True),
    StructField('gmap_id',StringType(),False)
])

sdf = spk.read.schema(schema).json(r'C:\Users\tinma\OneDrive\Escritorio\HENRY\Proyecto_Grupal_HENRY\data\Google Maps\estados\review-Texas\11.json')[['user_id','name','time','rating','text','resp','gmap_id']]
sdf.selectExpr('cast(user_id as int) user_id')
# sdf.count()
psdf = sdf.pandas_api()
# psdf['time'] = ps.to_datetime(psdf['time'],unit='ms')
psdf.spark.cache().head(5)

Unnamed: 0,user_id,name,time,rating,text,resp,gmap_id
0,102540505680898147322,Sergio Orjuela,1569461702556,5,,"{'time': '1569937557198', 'text': 'We greatly ...",0x864c23ee6b5d9d09:0x3cc9cba7f179b2ee
1,114879155270428006890,olivia,1620775162751,5,,"{'time': '1622864960517', 'text': 'Thank you f...",0x864c23ee6b5d9d09:0x3cc9cba7f179b2ee
2,106118137879389957889,Sid,1596227461108,5,,,0x864c23ee6b5d9d09:0x3cc9cba7f179b2ee
3,113695676953931571516,Mohit Bhole,1598127862268,5,,,0x864c23ee6b5d9d09:0x3cc9cba7f179b2ee
4,114399976182662326161,Chris luu,1580609932048,5,,,0x864c23ee6b5d9d09:0x3cc9cba7f179b2ee


In [None]:
from google.oauth2 import service_account

SERVICE_ACCOUNT_FILE = '../credentials/fiery-protocol-399500-f2566dd92ef4.json'

creds = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE)

from google.cloud import translate_v2 as translate

translate_client = translate.Client(credentials=creds)

language = translate_client.detect_language(psdf.resp[66])

translation = translate_client.translate('现在接线员是很好的英语', target_language='en')

language, translation

In [None]:
import ast

respuestas_negocio = psdf[psdf.resp != 'nan'][['user_id','gmap_id','resp']].reset_index(drop=True)
respuestas_negocio
for i in range(len(respuestas_negocio)):
    respuestas_negocio['resp'].apply(lambda x: ast.literal_eval(x))
    respuestas_negocio.loc[i,'resp_time'] = respuestas_negocio.loc[i,'resp']['time']
    respuestas_negocio.loc[i,'resp_text'] = respuestas_negocio.loc[i,'resp']['text']
respuestas_negocio
languages_psdf = ps.DataFrame(translate_client.get_languages())

In [None]:
PROJECT_ID = 'fiery-protocol-399500'
STATES = ['California'] #,'Texas'] # ,'New_York','Colorado','Georgia']
schema = StructType([
    StructField('user_id',StringType(),False),
    StructField('name',StringType(),True),
    StructField('time',LongType(),True),
    StructField('rating',IntegerType(),True),
    StructField('text',StringType(),True),
    StructField('resp',StringType(),False),
    StructField('gmap_id',StringType(),False)
])



psdfx = ps.DataFrame(columns=['gmap_id','user_id','name','time','text','rating','resp_time','resp_text'])
for state in STATES:
    i = 1
    df_list = []
    while True:
        try:
            # Leemos los archivos en un SPARK Data Frame para poder acceder directamente a GCS
            sdf = spk.read.schema(schema).json(f'../data/Google Maps/estados/review-{state}/{i}.json')[['user_id','name','time','rating','text','resp','gmap_id']].cache()
            sdf.selectExpr('cast(user_id as int) user_id')
            sdf.selectExpr('cast(null as map<string,string>) resp')
            # PANDAS API Data Frame: Paso intermedio para trabajar con los métodos de pandas pero con la potencia de spark, posteriormente guardaremos los datos en BQ después de 
            # las transformaciones...
            # sdf.count()
            psdf = sdf.pandas_api().spark.cache()
            sdf.unpersist()
            del sdf
            gc.collect()
            # psdf['time'] = ps.to_datetime(psdf['time'],unit='ms')
            psdf['estado'] = state
            psdf['resp'] = psdf.resp.fillna('nan')
            psdf['text'] = psdf.text.fillna('nan')
            df_list.append(psdf)
            i += 1
        except AnalysisException:
            break

    psdf = ps.concat(df_list,axis=0)
    psdf = psdf.reset_index(drop=True)
    del df_list
    psdf.spark.cache()
    print(f'pyspark.pandas data frame persisted - {state}')





#     # Generamos el primer grupo de transformaciones para los datos de las reviews de Maps en PANDAS API. Queda la metadata y los archivos de Yelp.
#     psdf['resp_time'] = ps.Series([],dtype='int64')
#     print('serie resp_time creada')
#     psdf['resp_text'] = ps.Series([],dtype='str')
#     psdf.spark.cache()
#     print('serie resp_text creada')
#     for i in range(len(psdf)):
#         print(i)
#         if type(psdf.loc[i,'resp']) == dict:
#             psdf.loc[i,'resp_time'] = psdf.loc[i,'resp']['time']
#             psdf.loc[i,'resp_text'] = psdf.loc[i,'resp']['text']
#         else:
#             pass
#     psdf.resp_time = psdf.resp_time.fillna(0).astype('int64')
#     psdf.resp_text = psdf.resp_text.fillna('')
#     psdf = psdf[['gmap_id','user_id','name','time','text','rating','resp_time','resp_text']]
#     psdf.spark.cache()
    
#     # Aquí concatenamos todos los archivos del estado en curso a los demás estados, para obtener una tabla total de estados.
#     psdfx = ps.concat(psdf,axis=0)
#     psdf.spark.unpersist()
#     del psdf
#     gc.collect()
#     print('pyspark.pandas data frame unpersisted and deleted')

# # Convertimos el dataframe de Pandas API on Spark a un dataframe de Spark
# sdf = psdfx.to_spark()
# del psdfx
# gc.collect()

# # Guardamos las tablas concatenadas en archivos .json en GCS.
# sdf.write.mode('overwrite').format('csv').save(f'../data/Google Maps/clean_test/estados/all_tables')