In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session that runs on 4 worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Schema")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Read the semicolon-separated CSV (first row is the header) and keep only
# the distinct values of the InvoiceDate column.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "True")
    .option("sep", ";")
    .csv("C:\\Users\\umuto\\OneDrive\\Masaüstü\\OnlineRetail.csv")
    .select("InvoiceDate")
    .distinct()
)

In [2]:
# Preview the first five distinct InvoiceDate values.
df.show(n=5)

+----------------+
|     InvoiceDate|
+----------------+
| 3.12.2010 16:50|
| 7.12.2010 12:28|
| 8.12.2010 15:02|
|10.12.2010 09:53|
|12.12.2010 13:32|
+----------------+
only showing top 5 rows



In [3]:
# Pattern describing how InvoiceDate is stored in the CSV (day.month.year hour:minute).
# Day and month use a single pattern letter because the data contains unpadded
# values such as "3.12.2010 16:50". The Spark 3+ datetime parser rejects such
# input for a two-letter "dd"/"MM" field, while "d"/"M" accept both one- and
# two-digit values (and behave the same on the legacy Spark 2.x parser).
mevcut_format = "d.M.yyyy HH:mm"

In [4]:
from pyspark.sql import functions as F

In [10]:
# Parse the raw InvoiceDate string twice using the source pattern:
#   normal_tarih -> date only
#   standart_ts  -> full timestamp (date + time)
df2 = (
    df
    .withColumn("normal_tarih", F.to_date(F.col("InvoiceDate"), mevcut_format))
    .withColumn("standart_ts", F.to_timestamp(F.col("InvoiceDate"), mevcut_format))
)

In [11]:
# Preview the parsed date and timestamp columns next to the raw string.
df2.show(n=5)

+----------------+------------+-------------------+
|     InvoiceDate|normal_tarih|        standart_ts|
+----------------+------------+-------------------+
| 3.12.2010 16:50|  2010-12-03|2010-12-03 16:50:00|
| 7.12.2010 12:28|  2010-12-07|2010-12-07 12:28:00|
| 8.12.2010 15:02|  2010-12-08|2010-12-08 15:02:00|
|10.12.2010 09:53|  2010-12-10|2010-12-10 09:53:00|
|12.12.2010 13:32|  2010-12-12|2010-12-12 13:32:00|
+----------------+------------+-------------------+
only showing top 5 rows



# Tarih formatı değiştirme

In [13]:
# Output patterns: day-first (Turkish style) and month-first (English style).
format_tr = "dd/MM/yyyy HH:mm:ss"
format_eng = "MM/dd/yyyy HH:mm:ss"

# Render the parsed timestamp back into strings using each pattern.
df3 = (
    df2
    .withColumn("TSTR", F.date_format(F.col("standart_ts"), format_tr))
    .withColumn("TSENG", F.date_format(F.col("standart_ts"), format_eng))
)

df3.show(n=10)

+----------------+------------+-------------------+-------------------+-------------------+
|     InvoiceDate|normal_tarih|        standart_ts|               TSTR|              TSENG|
+----------------+------------+-------------------+-------------------+-------------------+
| 3.12.2010 16:50|  2010-12-03|2010-12-03 16:50:00|03/12/2010 16:50:00|12/03/2010 16:50:00|
| 7.12.2010 12:28|  2010-12-07|2010-12-07 12:28:00|07/12/2010 12:28:00|12/07/2010 12:28:00|
| 8.12.2010 15:02|  2010-12-08|2010-12-08 15:02:00|08/12/2010 15:02:00|12/08/2010 15:02:00|
|10.12.2010 09:53|  2010-12-10|2010-12-10 09:53:00|10/12/2010 09:53:00|12/10/2010 09:53:00|
|12.12.2010 13:32|  2010-12-12|2010-12-12 13:32:00|12/12/2010 13:32:00|12/12/2010 13:32:00|
|15.12.2010 13:21|  2010-12-15|2010-12-15 13:21:00|15/12/2010 13:21:00|12/15/2010 13:21:00|
|16.12.2010 08:41|  2010-12-16|2010-12-16 08:41:00|16/12/2010 08:41:00|12/16/2010 08:41:00|
|17.12.2010 09:52|  2010-12-17|2010-12-17 09:52:00|17/12/2010 09:52:00|12/17/201

# Tarih ekleme, tarih farkı, timestamp içinden yılı alma

In [15]:
# Date arithmetic on the parsed timestamp:
#   bir_yıl    -> standart_ts plus 365 days (a fixed day count, not a calendar
#                 year, so it drifts by one day when the span crosses Feb 29)
#   yil        -> year component of standart_ts
#   tarih_fark -> number of days between bir_yıl and standart_ts
# The chain is wrapped in parentheses instead of ending with a dangling
# backslash, so the expression no longer depends on a following blank line.
df4 = (
    df2
    .withColumn("bir_yıl", F.date_add(F.col("standart_ts"), 365))
    .withColumn("yil", F.year(F.col("standart_ts")))
    .withColumn("tarih_fark", F.datediff(F.col("bir_yıl"), F.col("standart_ts")))
)

df4.show(5)


+----------------+------------+-------------------+----------+----+----------+
|     InvoiceDate|normal_tarih|        standart_ts|   bir_yıl| yil|tarih_fark|
+----------------+------------+-------------------+----------+----+----------+
| 3.12.2010 16:50|  2010-12-03|2010-12-03 16:50:00|2011-12-03|2010|       365|
| 7.12.2010 12:28|  2010-12-07|2010-12-07 12:28:00|2011-12-07|2010|       365|
| 8.12.2010 15:02|  2010-12-08|2010-12-08 15:02:00|2011-12-08|2010|       365|
|10.12.2010 09:53|  2010-12-10|2010-12-10 09:53:00|2011-12-10|2010|       365|
|12.12.2010 13:32|  2010-12-12|2010-12-12 13:32:00|2011-12-12|2010|       365|
+----------------+------------+-------------------+----------+----+----------+
only showing top 5 rows

