## **BATCH - OpenWeatherMap**

In [None]:
import datetime
import json
import minio
import pyspark
import requests

In [None]:
# Load the notebook settings from /variables.json.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which would garble non-ASCII
# values on a non-UTF-8 locale).
with open("/variables.json", "r", encoding="utf-8") as file :
    data = json.load(file)

# API key sent as the "appid" query parameter of every request.
apikey = data["apikey"]
# Indexed by place name (e.g. "nantes"); each entry is read through its
# "lat" and "lon" keys.
locations = data["locations"]
# Loaded from the same file; not used by the cells visible in this notebook.
plants = data["plants"]

# Base URL of the One Call 3.0 "timemachine" endpoint queried by url().
URL_timemachine = "https://api.openweathermap.org/data/3.0/onecall/timemachine"

In [None]:
def url(lat, lon, dt, units="metric") :
    """Build the timemachine request URL for one point and one timestamp.

    Args:
        lat: Latitude, rendered with ``%f`` (six decimal places).
        lon: Longitude, rendered with ``%f`` (six decimal places).
        dt: Timestamp, rendered with ``%d`` (integer).
        units: Value of the ``units`` query parameter.

    Returns:
        ``URL_timemachine`` followed by the query string, including the
        module-level ``apikey`` as ``appid``.
    """
    query = "&".join((
        "lat=%f" % (lat,),
        "lon=%f" % (lon,),
        "dt=%d" % (dt,),
        "appid=%s" % (apikey,),
        "units=%s" % (units,),
    ))
    return URL_timemachine + "?" + query

def preprocess(data) :
    """Flatten one API response into a single record.

    The record holds the response's ``lat`` and ``lon``, every field of the
    first entry of ``data["data"]`` except ``weather``, and the calendar
    components (``day``, ``month``, ``year``, ``hour``, ``minute``) of the
    entry's ``dt`` timestamp.

    Args:
        data: Decoded response; must provide ``lat``, ``lon`` and a non-empty
            ``data`` list whose first entry has a ``dt`` field.

    Returns:
        A new dict; the input is not modified.
    """
    record = dict(lat=data["lat"], lon=data["lon"])
    record.update(data["data"][0])
    if "weather" in record :
        del record["weather"]

    # fromtimestamp() without a tz argument yields the machine's local time.
    moment = datetime.datetime.fromtimestamp(record["dt"])
    for part in ("day", "month", "year", "hour", "minute") :
        record[part] = getattr(moment, part)
    return record

def fetch(url, timeout=10) :
    """Download one API response and return it as a preprocessed record.

    Every failure mode is reported the same way (``None``) so that a single
    bad request does not abort a long collection run.

    Args:
        url: Fully built request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The dict produced by ``preprocess`` on success; ``None`` when the
        request fails (connection error, timeout), the status code is not
        200, or the body is not valid JSON.
    """
    try :
        response = requests.get(url, timeout=timeout)
    except requests.RequestException :
        # Connection problems and timeouts are treated like an API error.
        return None

    if response.status_code != 200 :
        return None

    try :
        payload = response.json()
    except ValueError :
        # requests raises a ValueError subclass when the body is not JSON.
        return None
    return preprocess(payload)

def collect(location = locations["nantes"], hours = range(8, 19), duration = 1, day_step = 1) :
    """Gather weather records for a location, walking backwards from today.

    For each of ``duration`` sampled days, one request is made per entry of
    ``hours`` at HH:30. After each sampled day the date moves ``day_step``
    days into the past.

    Args:
        location: Mapping with ``lat`` and ``lon`` entries.
        hours: Hours of the day to sample.
        duration: Number of days to sample.
        day_step: Gap, in days, between two sampled days.

    Returns:
        List of the records returned by ``fetch``; requests for which
        ``fetch`` returned a falsy value are skipped.
    """
    records = []
    current_day = datetime.date.today()
    step_back = datetime.timedelta(days = day_step)
    for _ in range(duration) :
        for hour in hours :
            # Naive datetime: timestamp() interprets it in the machine's
            # local time zone.
            moment = datetime.datetime.combine(current_day, datetime.time(hour, 30))
            request_url = url(location["lat"], location["lon"], int(moment.timestamp()))
            record = fetch(request_url)
            if not record :
                continue
            records.append(record)
        current_day = current_day - step_back
    return records

In [None]:
# Spark application settings: standalone master at spark:7077, the hadoop-aws
# package for s3a:// access, and 10 shuffle partitions.
conf = pyspark.SparkConf()
conf.setAppName('Naolib')
conf.setMaster('spark://spark:7077')
conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
conf.set("spark.sql.shuffle.partitions", "10")

spark_context = pyspark.SparkContext.getOrCreate(conf=conf)
sql_context = pyspark.sql.SQLContext(spark_context)

# Point the s3a filesystem at the MinIO server (plain HTTP, path-style URLs).
# NOTE(review): the access/secret keys are hard-coded here and below.
hadoop_conf = spark_context._jsc.hadoopConfiguration()
for hadoop_key, hadoop_value in (
    ("fs.s3a.endpoint", "http://minio:9000"),
    ("fs.s3a.access.key", "root"),
    ("fs.s3a.secret.key", "password"),
    ("fs.s3a.path.style.access", "true"),
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.connection.ssl.enabled", "false"),
) :
    hadoop_conf.set(hadoop_key, hadoop_value)

# Direct MinIO client, used to make sure the "weather" bucket exists.
minio_client = minio.Minio("minio:9000", access_key="root", secret_key="password", secure=False)

weather_bucket_exists = minio_client.bucket_exists("weather")
if not weather_bucket_exists :
    minio_client.make_bucket("weather")

In [None]:
from pyspark.sql.functions import col, when

In [None]:
# Half-day averages over the last 5 weeks (35 sampled days).
data = collect(duration = 7*5)
if data :
    # read.json() expects an RDD of JSON *strings*. Serialise each record
    # explicitly instead of relying on str(dict), which is only JSON-like and
    # breaks on None / True / False values.
    json_rdd = spark_context.parallelize([json.dumps(record) for record in data], 7)
    df = sql_context.read.json(json_rdd)
    df.select("*").show()

    # 1 = morning (hour <= 13), 2 = afternoon (hour > 13), so that sorting on
    # the key lists the morning before the afternoon of the same day.
    half_day = when(col("hour") > 13, 2).otherwise(1)
    # Sortable key YYYYMMDDh (h = half-day index), built the same way as the
    # YYYYMM key of the monthly aggregation.
    half_day_key = (col("year")*10000 + col("month")*100 + col("day"))*10 + col("half_day")

    df.select("day", "month", "year", "hour", "clouds", "temp", "feels_like", "humidity", "visibility") \
        .withColumn("half_day", half_day) \
        .withColumn("hash", half_day_key) \
        .select("hash", "clouds", "temp", "feels_like", "humidity", "visibility") \
        .groupBy(["hash"]) \
        .mean() \
        .select("hash", "avg(clouds)", "avg(temp)", "avg(feels_like)", "avg(humidity)", "avg(visibility)") \
        .orderBy(["hash"]) \
        .show()
else :
    print("API Error")

In [None]:
# Monthly averages: one sampled day every 7 days, 365 sampled days in total.
# NOTE(review): with day_step = 7 this reaches back 365 weeks (about 7 years)
# and issues one request per sampled hour - confirm that 365 (rather than
# about 52 for a single year) is intended.
data = collect(duration = 365, day_step = 7)
if data :
    # read.json() expects an RDD of JSON *strings*. Serialise each record
    # explicitly instead of relying on str(dict), which is only JSON-like and
    # breaks on None / True / False values.
    json_rdd = spark_context.parallelize([json.dumps(record) for record in data], 7)
    df = sql_context.read.json(json_rdd)

    # Sortable key YYYYMM identifying the calendar month of each record.
    month_key = col("year")*100 + col("month")

    df.select("month", "year", "clouds", "temp", "feels_like", "humidity", "visibility") \
        .withColumn("hash", month_key) \
        .select("hash", "clouds", "temp", "feels_like", "humidity", "visibility") \
        .groupBy(["hash"]) \
        .mean() \
        .select("hash", "avg(clouds)", "avg(temp)", "avg(feels_like)", "avg(humidity)", "avg(visibility)") \
        .orderBy(["hash"]) \
        .show()
else :
    print("API Error")