In [68]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

In [69]:
# Build (or reuse) the Spark session for this notebook; "spark.jars" points at
# the PostgreSQL JDBC driver jar that the JDBC reads rely on.
spark = SparkSession.builder.appName("Working with Strings").config(
    "spark.jars", "postgresql-42.2.20.jar"
).getOrCreate()

In [70]:
def connect_database_to_read_file(
    database, table, user, password, name_file, host="localhost", port=5432
):
    """Load a PostgreSQL table over JDBC and expose it to Spark SQL.

    Reads ``table`` from ``database`` through the notebook-level ``spark``
    session, registers the result as the temporary view ``name_file`` and
    returns the loaded DataFrame.

    Args:
        database: PostgreSQL database name, appended to the JDBC URL.
        table: Value passed to the JDBC ``dbtable`` option.
        user: Database user name.
        password: Password for ``user``.
        name_file: Name of the temporary view to create or replace.
        host: Database host used in the JDBC URL (default ``"localhost"``).
        port: Database port used in the JDBC URL (default ``5432``).

    Returns:
        The DataFrame produced by the JDBC read.
    """
    url = "jdbc:postgresql://{}:{}/{}".format(host, port, database)
    data = (
        spark.read.format("jdbc")
        .option("url", url)
        .option("dbtable", table)
        .option("user", user)
        .option("password", password)
        .option("driver", "org.postgresql.Driver")
        .load()
    )
    # createOrReplaceTempView() returns None, so register the view first and
    # return the DataFrame itself; callers that assign the result get a usable
    # object instead of None.
    data.createOrReplaceTempView(name_file)
    return data
# Register the emp table as the temp view "emp" so the spark.sql cells can query it.
# NOTE(review): the next cell repeats this exact call; one of the two is redundant.
emp_data = connect_database_to_read_file(
    database="sqla", table="emp", user="postgres", password="admin123", name_file="emp"
)


In [71]:
# Register the emp table as the temp view "emp" for the SQL cells that follow.
# NOTE(review): the database password is hard-coded here; consider reading it
# from the environment instead of committing it with the notebook.
emp_data = connect_database_to_read_file(
    database="sqla", table="emp", user="postgres", password="admin123", name_file="emp"
)


In [72]:
# Register the t10 and t1 tables as temp views under the same names.
t_10 = connect_database_to_read_file(
    database="sqla", table="t10", user="postgres", password="admin123", name_file="t10"
)
t_1 = connect_database_to_read_file(
    database="sqla", table="t1", user="postgres", password="admin123", name_file="t1"
)
  

In [73]:
# Walk a string: return each character of the name 'KING' as its own row.
# The comma join pairs the matching emp row with every id in t10 (aliased pos);
# the WHERE keeps positions up to the name's length and substr(ename, pos, 1)
# extracts the single character at that position.
# NOTE(review): presumably t10.id holds consecutive integers starting at 1 - confirm.
spark.sql(
    """
    select substr(e.ename,iter.pos,1) as C
    from (select ename from emp where ename = 'KING') e,
    (select id as pos from t10) iter
    where iter.pos <= length(e.ename)
    """
).show()

+---+
|  C|
+---+
|  K|
|  I|
|  N|
|  G|
+---+



In [74]:
# Same row-per-position walk over 'KING', but each row shows two slices:
#   A - substr(ename, pos): the rest of the name starting at position pos
#   B - substr(ename, length - pos + 1): the trailing pos characters
spark.sql(
    """
    select substr(e.ename, iter.pos) as A,
    substr(e.ename, length(e.ename) -iter.pos +1) as B
    from (select ename from emp where ename = 'KING') e, (select id pos from t10) iter
    where iter.pos <= length(e.ename)
    """
).show()

+----+----+
|   A|   B|
+----+----+
|KING|   G|
| ING|  NG|
|  NG| ING|
|   G|KING|
+----+----+



In [75]:
# You want to embed quote marks within string literals.
# The select list places three string literals side by side: 'g', then "'"
# (a single quote wrapped in double quotes), then 'day mate'. The recorded
# output shows them combined into the single value g'day mate.
# One output row is produced per row of t1.
spark.sql(
    """
    select 'g' "'" 'day mate' qmarks from t1 
    """
).show()

+----------+
|    qmarks|
+----------+
|g'day mate|
+----------+



In [76]:
# Display every row and column of the t1 temp view.
spark.sql(
    """
    select * from t1
    """
).show()

+---+-----+-------+
| id| name|    job|
+---+-----+-------+
| 10|CLARK|MANAGER|
+---+-----+-------+



In [77]:
# You want to remove specific characters from your data.
# strip1:    translate() rewrites the uppercase vowels in ename and replace()
#            then drops the '#' placeholder. Per the recorded output every
#            uppercase vowel is gone, while lowercase letters (e.g. 'Jonathan')
#            are left untouched.
# stripped2: the same pattern applied to sal with the characters '.' and '0',
#            so 3272.50 becomes 32725 and a null sal stays null.
spark.sql(
    """
    select ename, 
    replace(translate(ename, 'UEOAI', '#'),'#', '') as strip1, 
    sal,
    replace(translate(sal,'.0', '#'), '#','') as stripped2
    from emp
    """
).show()

+--------+--------+-------+---------+
|   ename|  strip1|    sal|stripped2|
+--------+--------+-------+---------+
|   ALLEN|     LLN|1600.00|       16|
|    WARD|     WRD|1250.00|      125|
|  MARTIN|    MRTN|1250.00|      125|
|   BLAKE|     BLK|2850.00|      285|
|   CLARK|    CLRK|2450.00|      245|
|    KING|     KNG|5000.00|        5|
|  TURNER|    TRNR|1500.00|       15|
|   JAMES|     JMS| 950.00|       95|
|  MILLER|    MLLR|1300.00|       13|
|Jonathan|Jonathan|   null|     null|
|   SMITH|    SMTH| 880.00|       88|
|   JONES|     JNS|3272.50|    32725|
|   SCOTT|    SCTT|3300.00|       33|
|   ADAMS|     DMS|1210.00|      121|
|    FORD|     FRD|3300.00|       33|
+--------+--------+-------+---------+



In [78]:
# Separate a mixed string back into its alphabetic and numeric parts.
# The inner query concatenates ename and sal into one column, data.
#   ename: data with the characters ".0123456789" stripped (translate to '#',
#          then remove the '#').
#   sal:   data with that alphabetic remainder removed, leaving the number.
# Per the recorded output, the row whose sal is null comes back as null/null.
spark.sql(
    """
    select 
    replace(translate(data, ".0123456789",'#'),'#','') as ename,
    replace(data, replace(translate(data, ".0123456789",'#'),'#',''), '') as sal
    from
    (
        select ename || sal as data
        from emp
    )
    """
).show()

+------+-------+
| ename|    sal|
+------+-------+
| ALLEN|1600.00|
|  WARD|1250.00|
|MARTIN|1250.00|
| BLAKE|2850.00|
| CLARK|2450.00|
|  KING|5000.00|
|TURNER|1500.00|
| JAMES| 950.00|
|MILLER|1300.00|
|  null|   null|
| SMITH| 880.00|
| JONES|3272.50|
| SCOTT|3300.00|
| ADAMS|1210.00|
|  FORD|3300.00|
+------+-------+



In [79]:
# Reduce the name 'Stewie Griffin' to its initials (recorded output: S.G.).
# Reading the expression inside out: remove any '.', translate the 26 lowercase
# letters using a 26-character string of '#' built with rpad, delete the '#',
# turn the remaining space into '.', and append a final '.'.
# One output row is produced per row of t1.
spark.sql(
    """
    select replace(
           replace(
           translate(replace('Stewie Griffin', '.', ''),
                            'abcdefghijklmnopqrstuvwxyz',
                            rpad('#',26,'#') ), '#','' ),' ','.' ) ||'.' as full
    from t1
    """
).show()

+----+
|full|
+----+
|S.G.|
+----+



In [80]:
# List the ename column of emp (show() prints the first rows of the result).
spark.sql(
    """
    select ename
    from emp
    """
).show()

+--------+
|   ename|
+--------+
|   ALLEN|
|    WARD|
|  MARTIN|
|   BLAKE|
|   CLARK|
|    KING|
|  TURNER|
|   JAMES|
|  MILLER|
|Jonathan|
|   SMITH|
|   JONES|
|   SCOTT|
|   ADAMS|
|    FORD|
+--------+



In [81]:
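# Creating a Delimited List from Table Rows
# The commented-out SQL below is the PostgreSQL custom-aggregate version, kept
# for reference. The Spark code at the end of this cell gathers the names per
# deptno with collect_set, which yields an array rather than a delimited string.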
# CREATE AGGREGATE textcat_all(
#   basetype    = text,
#   sfunc       = textcat,
#   stype       = text,
#   initcond    = ''
# );

# SELECT company_id, textcat_all(employee || ', ')
# FROM mytable
# GROUP BY company_id;
from pyspark.sql import functions as F
def connect_database_to_read_file(
    database, table, user, password, name_file, host="localhost", port=5432
):
    """Load a PostgreSQL table over JDBC and expose it to Spark SQL.

    Reads ``table`` from ``database`` through the notebook-level ``spark``
    session, registers the result as the temporary view ``name_file`` and
    returns the loaded DataFrame.

    This redefinition shadows the function of the same name defined earlier in
    the notebook, so it keeps that function's view-registering side effect
    instead of ignoring ``name_file``.

    Args:
        database: PostgreSQL database name, appended to the JDBC URL.
        table: Value passed to the JDBC ``dbtable`` option.
        user: Database user name.
        password: Password for ``user``.
        name_file: Name of the temporary view to create or replace.
        host: Database host used in the JDBC URL (default ``"localhost"``).
        port: Database port used in the JDBC URL (default ``5432``).

    Returns:
        The DataFrame produced by the JDBC read.
    """
    url = "jdbc:postgresql://{}:{}/{}".format(host, port, database)
    data = (
        spark.read.format("jdbc")
        .option("url", url)
        .option("dbtable", table)
        .option("user", user)
        .option("password", password)
        .option("driver", "org.postgresql.Driver")
        .load()
    )
    # Register the view so spark.sql cells keep working after this cell runs,
    # then hand back the DataFrame for DataFrame-API use.
    data.createOrReplaceTempView(name_file)
    return data
# Load emp as a DataFrame and collect the distinct employee names per deptno.
# NOTE(review): the database password is hard-coded in this call.
emp_data = connect_database_to_read_file(
    database="sqla", table="emp", user="postgres", password="admin123", name_file="emp"
)
(
    emp_data
    .groupby("deptno")
    .agg(F.collect_set("ename"))
    .show()
)


+------+--------------------+
|deptno|  collect_set(ename)|
+------+--------------------+
|  null|          [Jonathan]|
|    10|[MILLER, KING, CL...|
|    30|[MARTIN, BLAKE, J...|
|    20|[SMITH, SCOTT, JO...|
+------+--------------------+



In [96]:

# NOTE(review): this builds a list of two-element *sets* such as {'name', 0}
# (set literals, not dicts), one per character of the string. No visible cell
# reads it; it is kept only in case a later cell outside this view does.
data = [{'name', i} for i in range(len('7654,7698,7782,7788'))]

# Table of the integers 0..99, registered as the temp view "t100".
# With a bare IntegerType schema the single column is named "value", which is
# the name the following query relies on. The former third positional argument
# (['number']) was bound to samplingRatio and never named the column, so it is
# dropped here. IntegerType comes from pyspark.sql.types.
data_df = spark.createDataFrame(list(range(100)), IntegerType())
data_df.createOrReplaceTempView("t100")

In [108]:
# Enumerate the delimited list '7654,7698,7782,7788' position by position:
# one row per t100.value, showing the substring that starts at that position.
# t1 holds a single row in the recorded data, so joining it does not multiply rows.
# NOTE(review): t100 starts at 0 and the recorded output shows position 0
# returning the same text as position 1; adding "t_100.value >= 1" to the
# WHERE clause would drop that duplicate row.
spark.sql(
    """
    select t_100.value, substr('7654,7698,7782,7788',t_100.value) as number_split
    from t1 as t_1,t100 as t_100
    where t_100.value <= length('7654,7698,7782,7788')
    """
).show()

+-----+-------------------+
|value|       number_split|
+-----+-------------------+
|    0|7654,7698,7782,7788|
|    1|7654,7698,7782,7788|
|    2| 654,7698,7782,7788|
|    3|  54,7698,7782,7788|
|    4|   4,7698,7782,7788|
|    5|    ,7698,7782,7788|
|    6|     7698,7782,7788|
|    7|      698,7782,7788|
|    8|       98,7782,7788|
|    9|        8,7782,7788|
|   10|         ,7782,7788|
|   11|          7782,7788|
|   12|           782,7788|
|   13|            82,7788|
|   14|             2,7788|
|   15|              ,7788|
|   16|               7788|
|   17|                788|
|   18|                 88|
|   19|                  8|
+-----+-------------------+

