## Import

In [1]:
import findspark
findspark.init()
from pyspark.context import SparkContext, SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, ArrayType, StringType, ShortType, FloatType
from pyspark.sql import SparkSession
from spark_functions import *
from datetime import datetime

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Reuse the running SparkContext if one exists, otherwise start a new one.
# getOrCreate is called on the class: instantiating SparkContext() first would
# raise ValueError when this cell is re-run while a context is already active.
sc = SparkContext.getOrCreate()

In [3]:
# Obtain the session through the class-level builder instead of constructing a
# throwaway SparkSession(sc) just to reach `.builder`; getOrCreate() reuses the
# SparkContext that is already running.
spark = (
    SparkSession.builder
    .appName("medicitalia")
    # .master("local[1]")
    .getOrCreate()
)
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001C360F0EFE0>


## UDFs

In [4]:
# The wrapped helpers are imported from spark_functions.py, except
# convert_to_iso, which is defined further down in this notebook.
# Each helper is called through a lambda so its name is resolved when the UDF
# is invoked, not when this cell runs.

# Splits a text into a list of string chunks (300 is forwarded to chunking).
chunking_udf = udf(
    lambda text: chunking(text, 300),
    ArrayType(StringType()),
)

# Maps a text to a list of floats via embed.
embeddings_udf = udf(
    lambda text: embed(text),
    ArrayType(FloatType()),
)

# Maps a value to a list of floats via get_coordinates, always passing "IT"
# as second argument (presumably a country code - confirm in spark_functions).
locations_udf = udf(
    lambda place: get_coordinates(place, "IT"),
    ArrayType(FloatType()),
)

# Normalises a date string to the ISO-like string produced by convert_to_iso.
convert_to_iso_udf = udf(
    lambda raw_date: convert_to_iso(raw_date),
    StringType(),
)

## Funzioni

In [6]:
def split_dataframe(df):
    """Project one DataFrame into the two column subsets used downstream.

    Args:
        df: Object exposing ``select(list_of_column_names)``; it must contain
            the columns 'URL', 'Category', 'Question', 'Answer',
            'Answer Date' and 'Question Date'.

    Returns:
        tuple: ``(df_rag, df_analytics)`` where ``df_rag`` holds
        URL/Category/Question/Answer and ``df_analytics`` holds
        URL/Category/Answer Date/Question Date.
    """
    rag_columns = ['URL', 'Category', 'Question', 'Answer']
    analytics_columns = ['URL', 'Category', 'Answer Date', 'Question Date']
    return df.select(rag_columns), df.select(analytics_columns)

def convert_to_iso(data):
    """Convert a date string to an ISO-8601-style timestamp string.

    Two input layouts are tried, in this order:
      * ``'%d.%m.%Y %H:%M'`` (e.g. ``'25.12.2020 14:30'``)
      * ``'%d.%m.%y'``       (e.g. ``'25.12.20'``; time defaults to midnight)

    A literal ``'Z'`` suffix is appended; no timezone conversion is performed.

    Args:
        data: The date string to convert. Non-string values such as ``None``
            are treated as unparseable.

    Returns:
        str: ``'YYYY-MM-DDTHH:MM:SSZ'`` on success, or the sentinel
        ``'0000-00-00T00:00:00Z'`` when neither layout matches.
    """
    for layout in ('%d.%m.%Y %H:%M', '%d.%m.%y'):
        try:
            parsed = datetime.strptime(data, layout)
        except (ValueError, TypeError):
            # ValueError: the text does not match this layout.
            # TypeError: the input is not a string (e.g. a null cell).
            # Anything else (KeyboardInterrupt, ...) is allowed to propagate.
            continue
        return parsed.strftime('%Y-%m-%dT%H:%M:%SZ')
    return "0000-00-00T00:00:00Z"

## Trasformazioni generali

In [7]:
# Load the raw data, then keep a single row per URL and, after that,
# a single row per Question text.
medicitalia_df = (
    load_dataframe(spark, '../medicitalia/data')
    .dropDuplicates(['URL'])
    .dropDuplicates(['Question'])
)

../medicitalia/data loaded
root
 |-- URL: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Question: string (nullable = true)
 |-- Answer: string (nullable = true)
 |-- Question Date: string (nullable = true)
 |-- Answer Date: string (nullable = true)



## Splitting

In [8]:
# Split the data into one DataFrame per target database:
# df_rag (URL, Category, Question, Answer) and
# df_analytics (URL, Category, Answer Date, Question Date).
df_rag, df_analytics = split_dataframe(medicitalia_df)

## Trasformazioni di df_rag

In [8]:
# Keep only rows whose Question and Answer are both longer than 30 characters,
# then drop any row where either of the two columns is null.
df_rag = (
    df_rag
    .where((length(col('Question')) > 30) & (length(col('Answer')) > 30))
    .na.drop(how='any', subset=['Question', 'Answer'])
)

In [9]:
# Replace each Question with its list of chunks, then emit one row per chunk:
# posexplode yields the chunk's position (Chunk_number) and its text, which
# finally takes over the 'Question' column name.
df_rag = (
    df_rag
    .withColumn('Question', chunking_udf(df_rag['Question']))
    .select('*', posexplode('Question').alias('Chunk_number', 'Chunked_Question'))
    .drop('Question')
    .withColumnRenamed('Chunked_Question', 'Question')
)
#df_rag = df_rag.withColumn('embeddings', embeddings_udf(df_rag['Question']))

## Trasformazioni di df_analytics

In [9]:
# Drop rows without a Category, then rewrite both date columns in place
# using the convert_to_iso UDF.
df_analytics = (
    df_analytics
    .na.drop(how='any', subset=['Category'])
    .withColumn('Question Date', convert_to_iso_udf(col('Question Date')))
    .withColumn('Answer Date', convert_to_iso_udf(col('Answer Date')))
)

## Salvataggio

In [11]:
# Persist both DataFrames as JSON, replacing any previous output
# at the target paths.
df_rag.write.mode("overwrite").json('../medicitalia/json_medicitalia/rag')
df_analytics.write.mode("overwrite").json('../medicitalia/json_medicitalia/analytics')