<a href="https://colab.research.google.com/github/Vasugi2003/Big-Data-Analytics/blob/main/PySparkSQL_String_manipulations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install and import PySpark
!pip install pyspark
from pyspark.sql import SparkSession



In [None]:
spark = SparkSession.builder.appName("StringManip").getOrCreate()

In [None]:
# Updated sample data for a "cars" DataFrame
data = [("Toyota Camry", "Sedan", "2022"),
        ("Honda Civic", "Sedan", "2022"),
        ("Ford F-150", "Truck", "2022"),
        ("Tesla Model 3", "Electric", "2022"),
        ("Skoda", "Rapid", "2022")]
columns = ["model", "type", "year"]

# Create a DataFrame with the updated car-related data
df = spark.createDataFrame(data, columns)

# Register the DataFrame as a temporary SQL table
df.createOrReplaceTempView("cars")


In [None]:
#1)concat_ws(separator, *cols):
#Combines multiple columns into a single column using the specified separator#
# Example queries using Spark SQL string functions

# 1. Concatenating strings with a separator
concatenated_df = spark.sql("SELECT * ,concat_ws(' : ', model, type, year) AS details FROM cars")
print("1. Concatenated Strings:")
concatenated_df.show(truncate=False)


1. Concatenated Strings:
+-------------+--------+----+-------------------------------+
|model        |type    |year|details                        |
+-------------+--------+----+-------------------------------+
|Toyota Camry |Sedan   |2022|Toyota Camry : Sedan : 2022    |
|Honda Civic  |Sedan   |2022|Honda Civic : Sedan : 2022     |
|Ford F-150   |Truck   |2022|Ford F-150 : Truck : 2022      |
|Tesla Model 3|Electric|2022|Tesla Model 3 : Electric : 2022|
|Skoda        |Rapid   |2022|Skoda : Rapid : 2022           |
+-------------+--------+----+-------------------------------+



In [None]:
#length(col)

#Computes the character length of string data or number of bytes of binary data.

# 2. Calculating the length of strings
length_df = spark.sql("SELECT *, length(type) AS type_length FROM cars")

print("2. Length of Types:")
length_df.show()

2. Length of Types:
+-------------+--------+----+-----------+
|        model|    type|year|type_length|
+-------------+--------+----+-----------+
| Toyota Camry|   Sedan|2022|          5|
|  Honda Civic|   Sedan|2022|          5|
|   Ford F-150|   Truck|2022|          5|
|Tesla Model 3|Electric|2022|          8|
|        Skoda|   Rapid|2022|          5|
+-------------+--------+----+-----------+



In [None]:
# 3. Extracting substrings
#Parameters
#str Column or str
    #target column to work on.

#pos int
    #starting position in str.

#len int
    #length of chars.

#Returns
#Column
#substring of given value.

substring_df = spark.sql("SELECT *, substring(type, 3, 3) AS type_abbr FROM cars")

print("3. Substring of Types:")
substring_df.show()

3. Substring of Types:
+-------------+--------+----+---------+
|        model|    type|year|type_abbr|
+-------------+--------+----+---------+
| Toyota Camry|   Sedan|2022|      dan|
|  Honda Civic|   Sedan|2022|      dan|
|   Ford F-150|   Truck|2022|      uck|
|Tesla Model 3|Electric|2022|      ect|
|        Skoda|   Rapid|2022|      pid|
+-------------+--------+----+---------+



In [None]:
#Parameters

#colColumn or str
    #target column to work on.

#Returns Column upper case values.

# 4. Converts a string expression to uppercase
uppercase_df = spark.sql("SELECT *, type, upper(type) AS uppercase_type FROM cars")
print("4. Uppercase Types:")
uppercase_df.show()

4. Uppercase Types:
+-------------+--------+----+--------+--------------+
|        model|    type|year|    type|uppercase_type|
+-------------+--------+----+--------+--------------+
| Toyota Camry|   Sedan|2022|   Sedan|         SEDAN|
|  Honda Civic|   Sedan|2022|   Sedan|         SEDAN|
|   Ford F-150|   Truck|2022|   Truck|         TRUCK|
|Tesla Model 3|Electric|2022|Electric|      ELECTRIC|
|        Skoda|   Rapid|2022|   Rapid|         RAPID|
+-------------+--------+----+--------+--------------+



In [None]:
# 5. Checking for lowercase
lowercase_df = spark.sql("SELECT model, type, lower(type) AS lowercase_type FROM cars")
print("5. Lowercase Types:")
lowercase_df.show()

5. Lowercase Types:
+-------------+--------+--------------+
|        model|    type|lowercase_type|
+-------------+--------+--------------+
| Toyota Camry|   Sedan|         sedan|
|  Honda Civic|   Sedan|         sedan|
|   Ford F-150|   Truck|         truck|
|Tesla Model 3|Electric|      electric|
|        Skoda|   Rapid|         rapid|
+-------------+--------+--------------+



In [None]:
# Show the results of each transformation
# Stop the Spark session
spark.stop()

base64(e: Column): Column

Explanation: Encodes a binary or string column using Base64 encoding.


#The Base64 method of encoding is used when binary data, such as images or video, is transmitted over systems that are designed to transmit data in a plain-text (ASCII) format

In [None]:
from pyspark.sql.functions import base64
from pyspark.sql.functions import col

# Step 2: Create a Spark session
spark = SparkSession.builder.appName("StringFunctionsExample").getOrCreate()

# Updated sample data for a "cars" DataFrame
data = [("Coyota Camry", "Sedan", "2022"),
        ("Honda Civic", "Sedan", "2022"),
        ("Ford F-150", "Truck", "2022"),
        ("Tesla Model 3", "Electric", "2022"),
        ("Skoda", "Rapid", "2022")]

columns = ["model", "type", "year"]


# Create a DataFrame with the updated car-related data
df = spark.createDataFrame(data, columns)

# Encode the "model" column using Base64

encoded_df = df.select("model", base64(col("model")))
encoded_df.show()


encoded_df = df.withColumn("model_base64", base64(col("model")))
encoded_df.show()



# Register the DataFrame as a temporary SQL table
df.createOrReplaceTempView("cars")
encoded_df=spark.sql("SELECT model, type, base64(model) AS model_type FROM cars")
encoded_df.show()

+-------------+--------------------+
|        model|       base64(model)|
+-------------+--------------------+
| Coyota Camry|    Q295b3RhIENhbXJ5|
|  Honda Civic|    SG9uZGEgQ2l2aWM=|
|   Ford F-150|    Rm9yZCBGLTE1MA==|
|Tesla Model 3|VGVzbGEgTW9kZWwgMw==|
|        Skoda|            U2tvZGE=|
+-------------+--------------------+

+-------------+--------+----+--------------------+
|        model|    type|year|        model_base64|
+-------------+--------+----+--------------------+
| Coyota Camry|   Sedan|2022|    Q295b3RhIENhbXJ5|
|  Honda Civic|   Sedan|2022|    SG9uZGEgQ2l2aWM=|
|   Ford F-150|   Truck|2022|    Rm9yZCBGLTE1MA==|
|Tesla Model 3|Electric|2022|VGVzbGEgTW9kZWwgMw==|
|        Skoda|   Rapid|2022|            U2tvZGE=|
+-------------+--------+----+--------------------+

+-------------+--------+--------------------+
|        model|    type|          model_type|
+-------------+--------+--------------------+
| Coyota Camry|   Sedan|    Q295b3RhIENhbXJ5|
|  Honda Civic|   Seda

ascii(e: Column): Column

Explanation: Returns the ASCII value of the first character in a string column.

In [None]:
from pyspark.sql.functions import ascii

#Calculate the ASCII value of the first character in the "model" column
ascii_df = df.select("model", ascii(col("model")))
ascii_df.show()



# Calculate the ASCII value of the first character in the "model" column
ascii_df = df.withColumn("model_ascii", ascii(col("model")))
ascii_df.show()



ascii_df=spark.sql("SELECT model, type, ascii(model) AS model_ascii FROM cars")
ascii_df.show()

+-------------+------------+
|        model|ascii(model)|
+-------------+------------+
| Coyota Camry|          67|
|  Honda Civic|          72|
|   Ford F-150|          70|
|Tesla Model 3|          84|
|        Skoda|          83|
+-------------+------------+

+-------------+--------+----+-----------+
|        model|    type|year|model_ascii|
+-------------+--------+----+-----------+
| Coyota Camry|   Sedan|2022|         67|
|  Honda Civic|   Sedan|2022|         72|
|   Ford F-150|   Truck|2022|         70|
|Tesla Model 3|Electric|2022|         84|
|        Skoda|   Rapid|2022|         83|
+-------------+--------+----+-----------+

+-------------+--------+-----------+
|        model|    type|model_ascii|
+-------------+--------+-----------+
| Coyota Camry|   Sedan|         67|
|  Honda Civic|   Sedan|         72|
|   Ford F-150|   Truck|         70|
|Tesla Model 3|Electric|         84|
|        Skoda|   Rapid|         83|
+-------------+--------+-----------+



concat_ws(sep: String, exprs: Column): Column*

Explanation: Concatenates multiple columns with a specified separator.

Working Code:

In [None]:
from pyspark.sql.functions import concat_ws

# Concatenate "model," "type," and "year" columns with a '-' separator
concatenated_df = df.withColumn("details", concat_ws("-", col("model"), col("type"), col("year")))
concatenated_df.show(truncate=False)


ascii_df=spark.sql("SELECT model,type,concat_ws('-',model,type) AS model_ascii FROM cars")
ascii_df.show()


ascii_df = df.select("model", concat_ws("-", col("model"), col("type"), col("year")))
ascii_df.show()






+-------------+--------+----+---------------------------+
|model        |type    |year|details                    |
+-------------+--------+----+---------------------------+
|Coyota Camry |Sedan   |2022|Coyota Camry-Sedan-2022    |
|Honda Civic  |Sedan   |2022|Honda Civic-Sedan-2022     |
|Ford F-150   |Truck   |2022|Ford F-150-Truck-2022      |
|Tesla Model 3|Electric|2022|Tesla Model 3-Electric-2022|
|Skoda        |Rapid   |2022|Skoda-Rapid-2022           |
+-------------+--------+----+---------------------------+

+-------------+--------+--------------------+
|        model|    type|         model_ascii|
+-------------+--------+--------------------+
| Coyota Camry|   Sedan|  Coyota Camry-Sedan|
|  Honda Civic|   Sedan|   Honda Civic-Sedan|
|   Ford F-150|   Truck|    Ford F-150-Truck|
|Tesla Model 3|Electric|Tesla Model 3-Ele...|
|        Skoda|   Rapid|         Skoda-Rapid|
+-------------+--------+--------------------+

+-------------+-------------------------------+
|        model

length(e: Column): Column

Explanation: Returns the length (number of characters) of a string column.

In [None]:
from pyspark.sql.functions import length

# Calculate the length of the "model" column
length_df = df.withColumn("model_length", length(col("model")))
length_df.show()

# Register the DataFrame as a temporary SQL table

length_df = df.select("type",length(col("type")) );
length_df.show()

df.createOrReplaceTempView("cars")
length_df=spark.sql("SELECT model, type,length(model) AS length_model FROM cars")
length_df.show()


+-------------+--------+----+------------+
|        model|    type|year|model_length|
+-------------+--------+----+------------+
| Coyota Camry|   Sedan|2022|          12|
|  Honda Civic|   Sedan|2022|          11|
|   Ford F-150|   Truck|2022|          10|
|Tesla Model 3|Electric|2022|          13|
|        Skoda|   Rapid|2022|           5|
+-------------+--------+----+------------+

+--------+------------+
|    type|length(type)|
+--------+------------+
|   Sedan|           5|
|   Sedan|           5|
|   Truck|           5|
|Electric|           8|
|   Rapid|           5|
+--------+------------+

+-------------+--------+------------+
|        model|    type|length_model|
+-------------+--------+------------+
| Coyota Camry|   Sedan|          12|
|  Honda Civic|   Sedan|          11|
|   Ford F-150|   Truck|          10|
|Tesla Model 3|Electric|          13|
|        Skoda|   Rapid|           5|
+-------------+--------+------------+



instr(str: Column, substring: String): Column

Explanation: Returns the position (1-based index) of the first occurrence of a substring in a string column.

In [None]:
#Locate the position of the first occurrence of substr column in the given string.
#Returns null if either of the arguments are nul


from pyspark.sql.functions import instr

# Find the position of "Camry" in the "model" column
position_df = df.withColumn("camry_position", instr(col("model"), "Civic"))
position_df.show()


# Register the DataFrame as a temporary SQL table
df.createOrReplaceTempView("cars")
encoded_df = spark.sql("SELECT  * ,instr(model ,'o') AS postion FROM cars");
encoded_df.show()


+-------------+--------+----+--------------+
|        model|    type|year|camry_position|
+-------------+--------+----+--------------+
| Coyota Camry|   Sedan|2022|             0|
|  Honda Civic|   Sedan|2022|             7|
|   Ford F-150|   Truck|2022|             0|
|Tesla Model 3|Electric|2022|             0|
|        Skoda|   Rapid|2022|             0|
+-------------+--------+----+--------------+

+-------------+--------+----+-------+
|        model|    type|year|postion|
+-------------+--------+----+-------+
| Coyota Camry|   Sedan|2022|      2|
|  Honda Civic|   Sedan|2022|      2|
|   Ford F-150|   Truck|2022|      2|
|Tesla Model 3|Electric|2022|      8|
|        Skoda|   Rapid|2022|      3|
+-------------+--------+----+-------+



levenshtein(l: Column, r: Column): Column

Explanation: Computes the Levenshtein distance between two string columns.

levenshtein(str1, str2[, threshold]) - Returns the Levenshtein distance between the two given strings. If threshold is set and distance more than it, return -1.


In [None]:
#Computes the Levenshtein distance of the two given strings
from pyspark.sql.functions import levenshtein
data = [("kitten", "sitting"),
        ("flaw", "lawn"),
        ("hello", "haihello")]
columns = ["string1", "string2"]

df = spark.createDataFrame(data, columns)
# Calculate the Levenshtein distance between "string1" and "string2" columns
distance_df =df.withColumn("levenshtein_distance", levenshtein(col("string1"), col("string2")))
distance_df.show()

distance_df = df.select(levenshtein(col("string1"),col("string2")))
distance_df.show()

# Register the DataFrame as a temporary SQL table
df.createOrReplaceTempView("cars")
encoded_df = spark.sql("SELECT string1,string2, levenshtein('string1','string2') AS distance FROM cars");
encoded_df.show()

+-------+--------+--------------------+
|string1| string2|levenshtein_distance|
+-------+--------+--------------------+
| kitten| sitting|                   3|
|   flaw|    lawn|                   2|
|  hello|haihello|                   3|
+-------+--------+--------------------+

+-----------------------------+
|levenshtein(string1, string2)|
+-----------------------------+
|                            3|
|                            2|
|                            3|
+-----------------------------+

+-------+--------+--------+
|string1| string2|distance|
+-------+--------+--------+
| kitten| sitting|       1|
|   flaw|    lawn|       1|
|  hello|haihello|       1|
+-------+--------+--------+



In [None]:
from pyspark.sql.functions import levenshtein

# Updated sample data for a "cars" DataFrame
data = [("   Coyota Camry", "Sedan", 2022,5),
        ("honda Civic2", "Sedan", 2022,6),
        ("Ford F-15000", "Truck", 2022,15),
        ("Tesla Mode", "Electric", 2022,24),
        ("Skoda45633", "Rapid", 2022,23)]

columns = ["model", "type", "year","driverage"]


# Create a DataFrame with the updated car-related data
df = spark.createDataFrame(data, columns)

# Calculate the Levenshtein distance between "model" and "type" columns
levenshtein_df = df.withColumn("levenshtein_distance", levenshtein(col("model"), col("type")))
levenshtein_df.show()

+---------------+--------+----+---------+--------------------+
|          model|    type|year|driverage|levenshtein_distance|
+---------------+--------+----+---------+--------------------+
|   Coyota Camry|   Sedan|2022|        5|                  14|
|   honda Civic2|   Sedan|2022|        6|                  10|
|   Ford F-15000|   Truck|2022|       15|                  11|
|     Tesla Mode|Electric|2022|       24|                   9|
|     Skoda45633|   Rapid|2022|       23|                   9|
+---------------+--------+----+---------+--------------------+



ltrim(e: Column): Column

Explanation: Removes leading whitespace from a string column.

In [None]:
from pyspark.sql.functions import ltrim

# Remove leading whitespace from the "model" column
trimmed_df = df.withColumn("model_trimmed", ltrim(df['model']))
trimmed_df.show()


trimmed_df=df.select( col("model") ,( ltrim(col("model")) ))
trimmed_df.show()


df.createOrReplaceTempView("cars")
trimmed_df = spark.sql("SELECT model ,ltrim(model) AS trimmed FROM cars");
trimmed_df.show()



+---------------+--------+----+---------+-------------+
|          model|    type|year|driverage|model_trimmed|
+---------------+--------+----+---------+-------------+
|   Coyota Camry|   Sedan|2022|        5| Coyota Camry|
|   honda Civic2|   Sedan|2022|        6| honda Civic2|
|   Ford F-15000|   Truck|2022|       15| Ford F-15000|
|     Tesla Mode|Electric|2022|       24|   Tesla Mode|
|     Skoda45633|   Rapid|2022|       23|   Skoda45633|
+---------------+--------+----+---------+-------------+

+---------------+------------+
|          model|ltrim(model)|
+---------------+------------+
|   Coyota Camry|Coyota Camry|
|   honda Civic2|honda Civic2|
|   Ford F-15000|Ford F-15000|
|     Tesla Mode|  Tesla Mode|
|     Skoda45633|  Skoda45633|
+---------------+------------+

+---------------+------------+
|          model|     trimmed|
+---------------+------------+
|   Coyota Camry|Coyota Camry|
|   honda Civic2|honda Civic2|
|   Ford F-15000|Ford F-15000|
|     Tesla Mode|  Tesla Mode

locate(substr: String, str: Column, pos: Int): Column

Explanation: Returns the position (1-based index) of the first occurrence of a substring in a string column, starting from a specified position.

In [None]:
#locate(substr, str[, pos]) -
#Returns the position of the first occurrence of substr in str after position pos.
# The given pos and return value are 1-based.
from pyspark.sql.functions import locate

# Find the position of "Sedan" in the "type" column, starting from position 3
position_df = df.withColumn("model", locate("Camry", col("model"), 1))
position_df.show()


df.createOrReplaceTempView("cars");

trimmed_df = spark.sql("SELECT model ,locate('Camry',model) AS out FROM cars");
trimmed_df.show()



+-----+--------+----+---------+
|model|    type|year|driverage|
+-----+--------+----+---------+
|   11|   Sedan|2022|        5|
|    0|   Sedan|2022|        6|
|    0|   Truck|2022|       15|
|    0|Electric|2022|       24|
|    0|   Rapid|2022|       23|
+-----+--------+----+---------+

+---------------+---+
|          model|out|
+---------------+---+
|   Coyota Camry| 11|
|   honda Civic2|  0|
|   Ford F-15000|  0|
|     Tesla Mode|  0|
|     Skoda45633|  0|
+---------------+---+



regexp_replace(e: Column, pattern: Column, replacement: Column): Column

Explanation: Replaces all substrings in a string column that match a specified regular expression pattern with a replacement string.

In [None]:
from pyspark.sql.functions import regexp_replace

# Replace "Sedan" with "Compact" in the "type" column
replaced_df =
df.withColumn("type_replaced", regexp_replace(col("type"), "Sedan", "Compact"))
replaced_df.show()


df.createOrReplaceTempView("cars");

trimmed_df =
spark.sql("SELECT model ,regexp_replace(type,'Sedan','SUV') AS out FROM cars");
trimmed_df.show()


SyntaxError: ignored

initcap(e: Column): Column

Explanation: Capitalizes the first letter of each word in a string column.

In [None]:
from pyspark.sql.functions import initcap

# Capitalize the "model" column
capitalized_df = df.withColumn("model_capitalized", initcap(col("model")))
capitalized_df.show()


df.createOrReplaceTempView("cars");

trimmed_df = spark.sql("SELECT model ,initcap(model) AS out FROM cars");
trimmed_df.show()


+---------------+--------+----+---------+-----------------+
|          model|    type|year|driverage|model_capitalized|
+---------------+--------+----+---------+-----------------+
|   Coyota Camry|   Sedan|2022|        5|     Coyota Camry|
|   honda Civic2|   Sedan|2022|        6|     Honda Civic2|
|   Ford F-15000|   Truck|2022|       15|     Ford F-15000|
|     Tesla Mode|Electric|2022|       24|       Tesla Mode|
|     Skoda45633|   Rapid|2022|       23|       Skoda45633|
+---------------+--------+----+---------+-----------------+

+---------------+---------------+
|          model|            out|
+---------------+---------------+
|   Coyota Camry|   Coyota Camry|
|   honda Civic2|   Honda Civic2|
|   Ford F-15000|   Ford F-15000|
|     Tesla Mode|     Tesla Mode|
|     Skoda45633|     Skoda45633|
+---------------+---------------+



regexp_replace(e: Column, pattern: String, replacement: String): Column

Explanation: Replaces all substrings in a string column that match a specified regular expression pattern with a replacement string.

In [None]:
from pyspark.sql.functions import regexp_replace

# Replace "Electric" with "EV" in the "type" column using a regular expression
replaced_df = df.withColumn("type_replaced", regexp_replace(col("type"), "Electric", "EV"))
replaced_df.show()

df.createOrReplaceTempView("cars");

trimmed_df = spark.sql("SELECT model ,regexp_replace(type,'Sedan','HATCHBACK') AS out FROM cars");
trimmed_df.show()



+---------------+--------+----+---------+-------------+
|          model|    type|year|driverage|type_replaced|
+---------------+--------+----+---------+-------------+
|   Coyota Camry|   Sedan|2022|        5|        Sedan|
|   honda Civic2|   Sedan|2022|        6|        Sedan|
|   Ford F-15000|   Truck|2022|       15|        Truck|
|     Tesla Mode|Electric|2022|       24|           EV|
|     Skoda45633|   Rapid|2022|       23|        Rapid|
+---------------+--------+----+---------+-------------+

+---------------+---------+
|          model|      out|
+---------------+---------+
|   Coyota Camry|HATCHBACK|
|   honda Civic2|HATCHBACK|
|   Ford F-15000|    Truck|
|     Tesla Mode| Electric|
|     Skoda45633|    Rapid|
+---------------+---------+



regexp_extract(e: Column, exp: String, groupIdx: Int): Column

Explanation: Extracts substrings from a string column using a regular expression and a specified capturing group index.

In [None]:
from pyspark.sql.functions import regexp_extract

# Extract the digits from the "model" column using a regular expression and group index 1
extracted_df = df.withColumn("model_digits", regexp_extract(col("model"), "(\d+)", 1))
extracted_df.show()


df.createOrReplaceTempView("cars");

trimmed_df = spark.sql("SELECT model ,regexp_extract(type,'(\d+)',1) AS out FROM cars");
trimmed_df.show()


+---------------+--------+----+---------+------------+
|          model|    type|year|driverage|model_digits|
+---------------+--------+----+---------+------------+
|   Coyota Camry|   Sedan|2022|        5|            |
|   honda Civic2|   Sedan|2022|        6|           2|
|   Ford F-15000|   Truck|2022|       15|       15000|
|     Tesla Mode|Electric|2022|       24|            |
|     Skoda45633|   Rapid|2022|       23|       45633|
+---------------+--------+----+---------+------------+

+---------------+---+
|          model|out|
+---------------+---+
|   Coyota Camry|  d|
|   honda Civic2|  d|
|   Ford F-15000|   |
|     Tesla Mode|   |
|     Skoda45633|  d|
+---------------+---+



encode(value: Column, charset: String): Column

Explanation: Encodes a string column using a specified character encoding.

In [None]:
from pyspark.sql.functions import encode

# Encode the "model" column using UTF-8 encoding
encoded_df = df\
  .withColumn("model_encoded", encode(col("model"), "UTF-8"))
encoded_df.show()

df.createOrReplaceTempView("cars");

trimmed_df = spark\
.sql("SELECT model ,encode(type,'UTF-8') AS out FROM cars");
trimmed_df.show()




+---------------+--------+----+---------+--------------------+
|          model|    type|year|driverage|       model_encoded|
+---------------+--------+----+---------+--------------------+
|   Coyota Camry|   Sedan|2022|        5|[20 20 20 43 6F 7...|
|   honda Civic2|   Sedan|2022|        6|[68 6F 6E 64 61 2...|
|   Ford F-15000|   Truck|2022|       15|[46 6F 72 64 20 4...|
|     Tesla Mode|Electric|2022|       24|[54 65 73 6C 61 2...|
|     Skoda45633|   Rapid|2022|       23|[53 6B 6F 64 61 3...|
+---------------+--------+----+---------+--------------------+

+---------------+--------------------+
|          model|                 out|
+---------------+--------------------+
|   Coyota Camry|    [53 65 64 61 6E]|
|   honda Civic2|    [53 65 64 61 6E]|
|   Ford F-15000|    [54 72 75 63 6B]|
|     Tesla Mode|[45 6C 65 63 74 7...|
|     Skoda45633|    [52 61 70 69 64]|
+---------------+--------------------+



decode(value: Column, charset: String): Column

Explanation: Decodes a string column using a specified character encoding.

In [None]:
from pyspark.sql.functions import decode

# Decode the "model_encoded" column using UTF-8 encoding
decoded_df = encoded_df\
  .withColumn("model_decoded", decode(col("model_encoded"), "UTF-8"))
decoded_df.show()



encoded_df.createOrReplaceTempView("cars_encoded");

trimmed_df = spark\
  .sql("SELECT model_encoded ,decode(type,'UTF-8') AS out FROM cars_encoded");
trimmed_df.show()



+---------------+--------+----+---------+--------------------+---------------+
|          model|    type|year|driverage|       model_encoded|  model_decoded|
+---------------+--------+----+---------+--------------------+---------------+
|   Coyota Camry|   Sedan|2022|        5|[20 20 20 43 6F 7...|   Coyota Camry|
|   honda Civic2|   Sedan|2022|        6|[68 6F 6E 64 61 2...|   honda Civic2|
|   Ford F-15000|   Truck|2022|       15|[46 6F 72 64 20 4...|   Ford F-15000|
|     Tesla Mode|Electric|2022|       24|[54 65 73 6C 61 2...|     Tesla Mode|
|     Skoda45633|   Rapid|2022|       23|[53 6B 6F 64 61 3...|     Skoda45633|
+---------------+--------+----+---------+--------------------+---------------+

+--------------------+--------+
|       model_encoded|     out|
+--------------------+--------+
|[20 20 20 43 6F 7...|   Sedan|
|[68 6F 6E 64 61 2...|   Sedan|
|[46 6F 72 64 20 4...|   Truck|
|[54 65 73 6C 61 2...|Electric|
|[53 6B 6F 64 61 3...|   Rapid|
+--------------------+--------+


format_number(x: Column, d: Int): Column

Explanation: Formats a numeric column to a specific number of decimal places.

In [None]:
from pyspark.sql.functions import format_number

# Format the "age" column to two decimal places
formatted_df = df\
.withColumn("age_formatted", format_number(col("year").cast("double"), 2))
formatted_df.show()

df.createOrReplaceTempView("cars");

trimmed_df = spark\
  .sql("SELECT model,format_number(driverage,5) AS out FROM cars");
trimmed_df.show()

+---------------+--------+----+---------+-------------+
|          model|    type|year|driverage|age_formatted|
+---------------+--------+----+---------+-------------+
|   Coyota Camry|   Sedan|2022|        5|     2,022.00|
|   honda Civic2|   Sedan|2022|        6|     2,022.00|
|   Ford F-15000|   Truck|2022|       15|     2,022.00|
|     Tesla Mode|Electric|2022|       24|     2,022.00|
|     Skoda45633|   Rapid|2022|       23|     2,022.00|
+---------------+--------+----+---------+-------------+

+---------------+--------+
|          model|     out|
+---------------+--------+
|   Coyota Camry| 5.00000|
|   honda Civic2| 6.00000|
|   Ford F-15000|15.00000|
|     Tesla Mode|24.00000|
|     Skoda45633|23.00000|
+---------------+--------+



format_string(format: String, arguments: Column): Column*

Explanation: Formats a string column using a format string and optional arguments.

In [None]:
from pyspark.sql.functions import format_string

# Create a formatted string using the "model," "type," and "year" columns
formatted_string_df = df.withColumn(
    "formatted_details",
    format_string("%s (%s) - %s", col("model"), col("type"), col("year"))
)
formatted_string_df.show(truncate=False)

df.createOrReplaceTempView("cars");

trimmed_df = spark.sql("SELECT model,type,year,format_string('%s-(%s)-%s' ,model,type,year) AS out FROM cars");
trimmed_df.show(truncate=False)

 # Coyota Camry (Sedan) - 2022

+---------------+--------+----+---------+------------------------------+
|model          |type    |year|driverage|formatted_details             |
+---------------+--------+----+---------+------------------------------+
|   Coyota Camry|Sedan   |2022|5        |   Coyota Camry (Sedan) - 2022|
|honda Civic2   |Sedan   |2022|6        |honda Civic2 (Sedan) - 2022   |
|Ford F-15000   |Truck   |2022|15       |Ford F-15000 (Truck) - 2022   |
|Tesla Mode     |Electric|2022|24       |Tesla Mode (Electric) - 2022  |
|Skoda45633     |Rapid   |2022|23       |Skoda45633 (Rapid) - 2022     |
+---------------+--------+----+---------+------------------------------+

+---------------+--------+----+----------------------------+
|model          |type    |year|out                         |
+---------------+--------+----+----------------------------+
|   Coyota Camry|Sedan   |2022|   Coyota Camry-(Sedan)-2022|
|honda Civic2   |Sedan   |2022|honda Civic2-(Sedan)-2022   |
|Ford F-15000   |Truck   |2022|Ford F

In [None]:
from pyspark.sql.functions import upper

# Find the position of "Camry" in the "model" column
position_df = df.withColumn("upper_model",upper(col("model")))
position_df.show()

df.createOrReplaceTempView("cars");

trimmed_df = spark.sql("SELECT model,upper(model) AS out FROM cars");
trimmed_df.show()

+---------------+--------+----+---------+---------------+
|          model|    type|year|driverage|    upper_model|
+---------------+--------+----+---------+---------------+
|   Coyota Camry|   Sedan|2022|        5|   COYOTA CAMRY|
|   honda Civic2|   Sedan|2022|        6|   HONDA CIVIC2|
|   Ford F-15000|   Truck|2022|       15|   FORD F-15000|
|     Tesla Mode|Electric|2022|       24|     TESLA MODE|
|     Skoda45633|   Rapid|2022|       23|     SKODA45633|
+---------------+--------+----+---------+---------------+

+---------------+---------------+
|          model|            out|
+---------------+---------------+
|   Coyota Camry|   COYOTA CAMRY|
|   honda Civic2|   HONDA CIVIC2|
|   Ford F-15000|   FORD F-15000|
|     Tesla Mode|     TESLA MODE|
|     Skoda45633|     SKODA45633|
+---------------+---------------+



In [None]:
from pyspark.sql.functions import lower

# Find the position of "Camry" in the "model" column
position_df = df.withColumn("lower_model",lower(col("model")))
position_df.show()

df.createOrReplaceTempView("cars");

trimmed_df = spark.sql("SELECT model,lower(model) AS out FROM cars");
trimmed_df.show()

+---------------+--------+----+---------+---------------+
|          model|    type|year|driverage|    lower_model|
+---------------+--------+----+---------+---------------+
|   Coyota Camry|   Sedan|2022|        5|   coyota camry|
|   honda Civic2|   Sedan|2022|        6|   honda civic2|
|   Ford F-15000|   Truck|2022|       15|   ford f-15000|
|     Tesla Mode|Electric|2022|       24|     tesla mode|
|     Skoda45633|   Rapid|2022|       23|     skoda45633|
+---------------+--------+----+---------+---------------+

+---------------+---------------+
|          model|            out|
+---------------+---------------+
|   Coyota Camry|   coyota camry|
|   honda Civic2|   honda civic2|
|   Ford F-15000|   ford f-15000|
|     Tesla Mode|     tesla mode|
|     Skoda45633|     skoda45633|
+---------------+---------------+



https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.concat_ws.html


https://medium.com/@deepa.account/spark-dataframes-select-vs-withcolumn-31388cecbca9


https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html

