In [1]:
#Run Once
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar xf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark
#Run Once
import os
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse) the notebook's Spark session: local master, app name "Colab",
# Spark UI bound to port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)
# Last expression of the cell: renders the session summary.
spark

In [3]:
from pyspark.sql.functions import to_timestamp
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import SQLContext
from datetime import datetime

In [4]:
# Load the X (feature) dataset from Drive; the first row is the header and column
# types are inferred by Spark.
df1 = spark.read.csv(
    '/content/drive/MyDrive/Colab datasets/Final/Final_Xdataset_v2.csv',
    sep=",",
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the raw X dataset (show()'s defaults, spelled out: 20 rows, long cells truncated).
df1.show(n=20, truncate=True)

+----------------+-------------+-------------+-------------+------------+-------------+--------------+--------------------+--------------------+--------------------+-------------------+--------------------+---------------------+------------+------------+------------+-----------+------------+-------------+--------------+--------------+--------------+-------------+--------------+---------------+-------------+-------------+-------------+------------+-------------+--------------+----------------+----------------+----------------+---------------+----------------+-----------------+-----------------+-----------------+-----------------+----------------+-----------------+------------------+-------------------+-------------------+-------------------+------------------+-------------------+--------------------+-----------+-----------+-----------+----------+-----------+------------+--------------+--------------+--------------+-------------+--------------+---------------+------------------+---------

In [6]:
# Replace the textual "date" column with a parsed timestamp (day/month/year hour:minute).
parsed_date = to_timestamp(df1["date"], 'dd/MM/yyyy HH:mm')
Final_Xdataset = df1.withColumn("date", parsed_date)
Final_Xdataset.show()

+-------------------+-------------+-------------+-------------+------------+-------------+--------------+--------------------+--------------------+--------------------+-------------------+--------------------+---------------------+------------+------------+------------+-----------+------------+-------------+--------------+--------------+--------------+-------------+--------------+---------------+-------------+-------------+-------------+------------+-------------+--------------+----------------+----------------+----------------+---------------+----------------+-----------------+-----------------+-----------------+-----------------+----------------+-----------------+------------------+-------------------+-------------------+-------------------+------------------+-------------------+--------------------+-----------+-----------+-----------+----------+-----------+------------+--------------+--------------+--------------+-------------+--------------+---------------+------------------+------

In [7]:
# Column name -> Spark type constructor, in the order the columns are declared for the CSV.
# NOTE(review): dewpt and vappr are declared as integers; if the file holds decimal
# values for them they may be read as null — confirm against the CSV.
_column_types = {
    'date': StringType,
    'ind1': DoubleType,
    'rain': DoubleType,
    'ind3': DoubleType,
    'temp': DoubleType,
    'ind5': DoubleType,
    'wetb': DoubleType,
    'dewpt': IntegerType,
    'vappr': IntegerType,
    'rhum': IntegerType,
    'msl': DoubleType,
    'ind11': DoubleType,
    'wdsp': IntegerType,
    'ind13': DoubleType,
    'wddir': IntegerType,
}

# List of (column name, type instance) pairs consumed when the schema is built.
labels = [(column, spark_type()) for column, spark_type in _column_types.items()]

In [8]:
# Explicit schema handed to spark.read.csv: one nullable field per (name, type) pair in `labels`.
schema = StructType([
    StructField(column_name, column_type, nullable=True)
    for column_name, column_type in labels
])


In [9]:
# Suffix appended to every Markree measurement column.
name = 'sligo_Y3H'


def _show_null_counts(frame):
    """Display a single row holding the number of null values in each column of `frame`."""
    # `c` is a plain string, so when() emits it as a literal marker for null rows;
    # count() then counts exactly those markers.
    frame.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in frame.columns]).show()


# Read the station file into its own name. Reassigning `df1` here would replace the
# X dataset loaded earlier, so re-running the cell that derives Final_Xdataset from
# `df1` would silently use Markree data instead.
markree_raw = spark.read.csv(
    '/content/drive/MyDrive/Colab datasets/weatherdata2/Markree.csv',
    header=True,
    sep=",",
    schema=schema,
)

# Parse the textual date into a timestamp column and drop the original string.
df2 = markree_raw.withColumn("TS_date", to_timestamp(markree_raw["date"], 'dd/MM/yyyy HH:mm'))
df3 = df2.drop('date')

# Keep the four measurements of interest, restricted to 2011-01-01 .. 2021-01-01 (both inclusive).
finalDF12 = df3.select('TS_date', "rain", 'temp', 'rhum', 'msl').filter(
    (df3['TS_date'] >= "2011-01-01 00:00:00") & (df3['TS_date'] <= "2021-01-01 00:00:00")
)

# Rename: TS_date -> date, every measurement gets the `name` suffix.
finalDF13 = finalDF12.selectExpr(
    "TS_date as date",
    f"rain as rain_{name}",
    f"temp as temp_{name}",
    f"rhum as rhum_{name}",
    f"msl as msl_{name}",
)
finalDF13.show()

# Null counts before and after filling.
_show_null_counts(finalDF13)
# NOTE(review): this replaces nulls in the numeric columns with 0, which for
# temperature / pressure is an invented reading — confirm this is intended.
finalDF13 = finalDF13.na.fill(value=0)
_show_null_counts(finalDF13)
finalDF13.show()


+-------------------+--------------+--------------+--------------+-------------+
|               date|rain_sligo_Y3H|temp_sligo_Y3H|rhum_sligo_Y3H|msl_sligo_Y3H|
+-------------------+--------------+--------------+--------------+-------------+
|2011-01-01 00:00:00|           0.0|           5.2|            97|       1028.3|
|2011-01-01 01:00:00|           0.0|           5.1|            97|       1028.4|
|2011-01-01 02:00:00|           0.0|           5.0|            97|       1028.1|
|2011-01-01 03:00:00|           0.0|           4.9|            97|       1028.0|
|2011-01-01 04:00:00|           0.0|           4.8|            97|       1028.0|
|2011-01-01 05:00:00|           0.0|           4.8|            97|       1027.9|
|2011-01-01 06:00:00|           0.0|           4.9|            97|       1027.9|
|2011-01-01 07:00:00|           0.5|           5.2|            97|       1027.9|
|2011-01-01 08:00:00|           0.2|           5.5|            97|       1028.0|
|2011-01-01 09:00:00|       

In [10]:
# Re-apply to_timestamp to "date" using the 'yyyy-MM-dd HH:mm:ss' pattern.
# The column was already produced by to_timestamp when the station file was parsed.
date_as_timestamp = to_timestamp(finalDF13["date"], 'yyyy-MM-dd HH:mm:ss')
df20 = finalDF13.withColumn("date", date_as_timestamp)
df20.show(n=5)

+-------------------+--------------+--------------+--------------+-------------+
|               date|rain_sligo_Y3H|temp_sligo_Y3H|rhum_sligo_Y3H|msl_sligo_Y3H|
+-------------------+--------------+--------------+--------------+-------------+
|2011-01-01 00:00:00|           0.0|           5.2|            97|       1028.3|
|2011-01-01 01:00:00|           0.0|           5.1|            97|       1028.4|
|2011-01-01 02:00:00|           0.0|           5.0|            97|       1028.1|
|2011-01-01 03:00:00|           0.0|           4.9|            97|       1028.0|
|2011-01-01 04:00:00|           0.0|           4.8|            97|       1028.0|
+-------------------+--------------+--------------+--------------+-------------+
only showing top 5 rows



In [11]:
# Move every timestamp three hours earlier, so each station reading is keyed to the
# time 3 hours before it was taken.
three_hours = F.expr('INTERVAL 3 HOURS')
dffinalDF_Tadjust = df20.withColumn('date', df20['date'] - three_hours)
dffinalDF_Tadjust.show(n=5)

+-------------------+--------------+--------------+--------------+-------------+
|               date|rain_sligo_Y3H|temp_sligo_Y3H|rhum_sligo_Y3H|msl_sligo_Y3H|
+-------------------+--------------+--------------+--------------+-------------+
|2010-12-31 21:00:00|           0.0|           5.2|            97|       1028.3|
|2010-12-31 22:00:00|           0.0|           5.1|            97|       1028.4|
|2010-12-31 23:00:00|           0.0|           5.0|            97|       1028.1|
|2011-01-01 00:00:00|           0.0|           4.9|            97|       1028.0|
|2011-01-01 01:00:00|           0.0|           4.8|            97|       1028.0|
+-------------------+--------------+--------------+--------------+-------------+
only showing top 5 rows



In [12]:
# Attach the time-shifted station columns to the X dataset; only timestamps present
# in both frames survive the inner join.
Final_X_Ydataset = Final_Xdataset.join(
    dffinalDF_Tadjust,
    on='date',
    how='inner',
)

In [None]:
# Preview the first ten rows of the joined dataset.
Final_X_Ydataset.show(n=10)

+-------------------+-------------+-------------+-------------+------------+-------------+--------------+--------------------+--------------------+--------------------+-------------------+--------------------+---------------------+------------+------------+------------+-----------+------------+-------------+--------------+--------------+--------------+-------------+--------------+---------------+-------------+-------------+-------------+------------+-------------+--------------+----------------+----------------+----------------+---------------+----------------+-----------------+-----------------+-----------------+-----------------+----------------+-----------------+------------------+-------------------+-------------------+-------------------+------------------+-------------------+--------------------+-----------+-----------+-----------+----------+-----------+------------+--------------+--------------+--------------+-------------+--------------+---------------+------------------+------

In [None]:
# Collect the joined frame to the driver and write it through pandas so the result
# is one CSV file.
# index=False keeps pandas' row index out of the file; otherwise it is written as an
# extra, unnamed first column that shows up as a spurious feature when the CSV is read back.
Final_X_Ydataset.toPandas().to_csv(
    '/content/drive/MyDrive/Colab datasets/Final/Final_X_Y_dataset_v2.csv',
    index=False,
)