# To Script

In [32]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="credentials/credentials-gs.json"

# https://kashif-sohail.medium.com/read-files-from-google-cloud-storage-bucket-using-local-pyspark-and-jupyter-notebooks-f8bd43f4b42e

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import StructType,StructField, StringType, IntegerType, TimestampType, ArrayType

from google.cloud import storage

spark = SparkSession \
  .builder \
  .appName('spark-ETL-Tweets') \
  .config('spark.jars', '../gcs-connector-hadoop2-latest.jar') \
  .getOrCreate()

spark._jsc.hadoopConfiguration().set('fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')

BUCKET = 'cloud-computing-2122-bjr'
BUCKET_LINK = 'gs://cloud-computing-2122-bjr'

## 0: Check which CSV's havent been read

https://cloud.google.com/storage/docs/downloading-objects#storage-download-object-portion-python

Need: 
* To Have such .txt in bucket
* Pull .txt
* Check against list of csv's in bucket

In [2]:
# Read txt file of already processed CSV's
def read_txt_blob(bucket_name, destination_file_name):
    # https://stackoverflow.com/questions/48279061/gcs-read-a-text-file-from-google-cloud-storage-directly-into-python
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.get_blob(destination_file_name)
    read_file = blob.download_as_text(encoding="utf-8")
    # Log instead of print
    print(
        "Loaded from bucket {} to local file {}.".format(
            bucket_name, destination_file_name
        )
    )
    return read_file.split('\n')

FILES_ALREADY_READ = read_txt_blob(BUCKET, 'control/read_files.txt')

Loaded from bucket cloud-computing-2122-bjr to local file control/read_files.txt.


In [3]:
FILES_ALREADY_READ

['']

In [4]:
# List all available data files
def list_blobs_with_prefix(bucket_name, prefix):
    # https://cloud.google.com/storage/docs/listing-objects#storage-list-objects-python
    storage_client = storage.Client()

    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(bucket_name, prefix=prefix)
    list_of_csvs = []
    """
    print("Blobs:")
    """
    for blob in blobs:
        list_of_csvs.append(blob.name.split("/")[-1])
        # print(blob.name)
    return list_of_csvs

DATA_IN_BUCKET = list_blobs_with_prefix(BUCKET, 'data/')

In [5]:
FILES_TO_PROCESS = [f'data/{FILE}' for FILE in DATA_IN_BUCKET if FILE not in FILES_ALREADY_READ]

In [6]:
FILES_TO_PROCESS

['data/220312.csv']

## 1: Read CSV into Spark

In [7]:
## print(f"{BUCKET_LINK}/{csv_file}")

### 1.1: Define Schema of Global Data

https://sparkbyexamples.com/pyspark/pyspark-structtype-and-structfield/   
https://spark.apache.org/docs/latest/sql-ref-datatypes.html

In [36]:
GLOBAL_SCHEMA = StructType([ \
    StructField("RowID",IntegerType(),False), \
    StructField("userid",IntegerType(),False), \
    StructField("acctdesc",StringType(),True), \
    StructField("location", StringType(), True), \
    StructField("following", IntegerType(), True), \
    StructField("followers", IntegerType(), True), \
    StructField("totaltweets", IntegerType(), True), \
    StructField("usercreatedts", TimestampType(), False), \
    StructField("tweetid", IntegerType(), False), \
    StructField("tweetcreatedts", TimestampType(), False), \
    StructField("retweetcount", IntegerType(), False), \
    StructField("text", StringType(), True), \
    StructField("hashtags", StringType(), False), \
    StructField("language", StringType(), True), \
    StructField("coordinates", StringType(), True), \
    StructField("favorite_count", IntegerType(), True), \
    StructField("extractedts", TimestampType(), True) \
  ])

# StructField("hashtags", StructType([StructField('text', StringType(), False), \
# StructField('indices', ArrayType(IntegerType()), False)]),False), \

In [23]:
csv_file.head(2)

[Row(_c0=None, _c1='userid', _c2='username', _c3='acctdesc', _c4='location', _c5='following', _c6='followers', _c7='totaltweets', _c8='usercreatedts', _c9='tweetid', _c10='tweetcreatedts', _c11='retweetcount', _c12='text', _c13='hashtags', _c14='language', _c15='coordinates', _c16='favorite_count', _c17='extractedts'),
 Row(_c0='5428025', _c1='1421921881528016896', _c2='PFComCon', _c3='Timely, concise, and comprehensive source of information and analysis on Indo-Pacific bilateral relations, published by @pacificforum', _c4='Honolulu, Hawaii', _c5='96', _c6='53', _c7='706', _c8='2021-08-01 19:53:49.000000', _c9='1502434194462412804', _c10='2022-03-12 00:00:00', _c11='1', _c12='https://t.co/6vzbiMRRbP', _c13=None, _c14=None, _c15=None, _c16=None, _c17=None)]

In [22]:
csv_file.where(csv_file['_c15'] != 0).head(10)
#csv_file.head(10)

[Row(_c0='5431702', _c1='1007709126460309504', _c2='Adam80644464', _c3='"""Jeśli MatkaBoska przemówiła w języku polskim do dzieci', _c4='jeśli moce niebieskie nie gardzą tym językiem', _c5='jakże my', _c6='ludzki', _c7='szary proch możemy się wstydzić polskości"""', _c8=None, _c9='2109', _c10='1780', _c11='25367', _c12='2018-06-15 19:39:13.000000', _c13='1502437181343903744', _c14='2022-03-12 00:11:52', _c15='348', _c16='Oto co mówił prof. Lech Kaczyński o gazociągu #NordStream1 w 2005 r. ', _c17=None),
 Row(_c0='5432273', _c1='2694895650', _c2='leszekfurtakcom', _c3='"Redaktor. Prowadzący: ""Przesłuchanie Polityczne""', _c4=' ""Przesłuchanie Historyczne""', _c5=' ""Przesłuchanie Klimatyczne""', _c6=' ""Co z tą Rosją?""', _c7=' ""Co z tą Ukrainą?""."', _c8=None, _c9='1154', _c10='1525', _c11='58873', _c12='2014-07-31 06:21:20.000000', _c13='1502437818492203010', _c14='2022-03-12 00:14:24', _c15='493', _c16="Good to speak to @ZelenskyyUa again today to update him on the tough sanctions 

In [16]:
csv_file.tail(4)

[Row(_c0='no more blood."', _c1='kyoto', _c2='323', _c3='108', _c4='614', _c5='2021-09-30 14:13:41.000000', _c6='1502796580143136768', _c7='2022-03-12 23:59:59', _c8='714', _c9='#BREAKING ', _c10=None, _c11=None, _c12=None, _c13=None, _c14=None, _c15=None, _c16=None, _c17=None),
 Row(_c0='The moment of missile attack on US consulate in #Erbil https://t.co/5u7tpsCaWy"', _c1="[{'text': 'BREAKING', 'indices': [17, 26]}, {'text': 'Erbil', 'indices': [76, 82]}]", _c2='en', _c3=None, _c4='0', _c5='2022-03-13 00:01:13.561375', _c6=None, _c7=None, _c8=None, _c9=None, _c10=None, _c11=None, _c12=None, _c13=None, _c14=None, _c15=None, _c16=None, _c17=None),
 Row(_c0='5825969', _c1='1494227064408346634', _c2='Deramba1', _c3='ትግራይ እያ ዓለመይ!!', _c4=None, _c5=None, _c6=None, _c7=None, _c8=None, _c9=None, _c10=None, _c11=None, _c12=None, _c13=None, _c14=None, _c15=None, _c16=None, _c17=None),
 Row(_c0='#StopTigrayGenocide"', _c1=None, _c2='272', _c3='278', _c4='4616', _c5='2022-02-17 08:29:07.000000', 

### 1.2 Define Schema of Tweets DB

Has to be in accordance with BigQuery Table

### 1.3 Define Schema of Users DB

Has to be in accordance with BigQuery Table

### Iterate over CSV's

* Control flow (Pull user data from BigQuery)
* Update / Add Users
* Add Tweets

In [42]:
# https://stackoverflow.com/questions/51751852/dataproc-reading-from-google-cloud-storage
# https://stackoverflow.com/questions/61197811/can-i-read-csv-files-from-google-storage-using-spark-in-more-than-one-executor

for csv_file in FILES_TO_PROCESS:
    ## DOES NOT WORK
    ##sc = SparkContext.getOrCreate()
    ##rdd_csv = sc.wholeTextFiles(f"{BUCKET_LINK}/{csv_file}")
    ##rdd_csv.collect()
    ## WORKS - BUT DOES IT WORK FOR MULTIPLE FILES READ?
    csv_file = (spark
                .read
                .format("csv")
                .options(header='True')
                .
                #.schema(GLOBAL_SCHEMA)
                .load(f"{BUCKET_LINK}/{csv_file}", compression='gzip'))

In [43]:
csv_file.head(3)

[Row(_c0='5428025', userid='1421921881528016896', username='PFComCon', acctdesc='Timely, concise, and comprehensive source of information and analysis on Indo-Pacific bilateral relations, published by @pacificforum', location='Honolulu, Hawaii', following='96', followers='53', totaltweets='706', usercreatedts='2021-08-01 19:53:49.000000', tweetid='1502434194462412804', tweetcreatedts='2022-03-12 00:00:00', retweetcount='1', text='https://t.co/6vzbiMRRbP', hashtags=None, language=None, coordinates=None, favorite_count=None, extractedts=None),
 Row(_c0='China\'s online comments show that pro-Moscow posturing is a veil for expressing a deeper critique of U.S. influence. #ComparativeConnections #Ukraine #RussiaUkraineWar #China #US"', userid="[{'text': 'ComparativeConnections', 'indices': [142, 165]}, {'text': 'Ukraine', 'indices': [166, 174]}, {'text': 'RussiaUkraineWar', 'indices': [175, 192]}, {'text': 'China', 'indices': [193, 199]}, {'text': 'US', 'indices': [200, 203]}]", username='e

In [None]:
# Use the Cloud Storage bucket for temporary BigQuery export data used
# by the connector.
spark.conf.set('temporaryGcsBucket', BUCKET)

#

words = spark.read.format('bigquery') \
  .option('table', 'bigquery-public-data:samples.shakespeare') \
  .load()
words.createOrReplaceTempView('words')


In [None]:
dfTweet = spark.read.option("header",True).csv("./input/data.csv")

In [None]:
dfTweet.printSchema()

In [None]:
dfTweet.select('userid', 'username').collect()[0]

# Write Read CSV's to control file

In [None]:
## Join FILES_TO_PROCESS and FILES_ALREADY_READ in PROCESSED_FILES
PROCESSED_FILES = []

In [None]:
# Update txt file after data is processed
# https://stackoverflow.com/questions/43682521/writing-data-to-google-cloud-storage-using-python
def write_txt_to_bucket(bucket_name, destination_file_name, files_list):
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(destination_file_name) 
    ## Use bucket.get_blob('path/to/existing-blob-name.txt') to write to existing blobs
    with blob.open(mode='w') as f:
        f.write('\n'.join(files_list))
            
write_txt_to_bucket('cloud-computing-2122-bjr', 'control/read_files.txt', PROCESSED_FILES)