In [3]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [31]:
spark = SparkSession.builder.appName("My_Spark_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "10g") \
.getOrCreate()

In [5]:
file_path = r"C:\Users\abdel\Downloads\books_data.csv\books_data.csv"
# Read the CSV file into a DataFrame
books_data_df = spark.read.csv(file_path, header=True, inferSchema=True)
books_data_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               Title|         description|             authors|               image|         previewLink|           publisher|       publishedDate|            infoLink|          categories|        ratingsCount|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Its Only Art If I...|                null|    ['Julie Strain']|http://books.goog...|http://books.goog...|                null|                1996|http://books.goog...|['Comics & Graphi...|                null|
|Dr. Seuss: Americ...|"Philip Nel takes...| like that of Lew...| has changed lang...| giving us new wo...| inspiring artist...|      ['Philip Nel']|http

In [6]:
books_data_df.printSchema()
print("Count of dataframe:",books_data_df.count())

root
 |-- Title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- image: string (nullable = true)
 |-- previewLink: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- publishedDate: string (nullable = true)
 |-- infoLink: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- ratingsCount: string (nullable = true)

Count of dataframe: 212404


In [7]:

books_data_df.describe().show()

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|summary|               Title|         description|             authors|               image|         previewLink|           publisher|       publishedDate|            infoLink|          categories|        ratingsCount|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  count|              212403|              144047|              181153|              161213|              188349|              139274|              186560|              188103|              171880|               63852|
|   mean|   3823.672941176471|  1.4285714285714286|              1578.4|              1184.0|            Infinity|      

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = books_data_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in books_data_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in books_data_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Title: 1
description: 68357
authors: 31251
image: 51191
previewLink: 24055
publisher: 73130
publishedDate: 25844
infoLink: 24301
categories: 40524
ratingsCount: 148552


In [9]:
# List of columns to find most frequent values
cols_to_check = ["Title", "description", "authors", "image", "previewLink", "publisher", "publishedDate", "infoLink", "categories", "ratingsCount"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = books_data_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Title': """Please
Most frequent value in column 'description': None
Most frequent value in column 'authors': None
Most frequent value in column 'image': None
Most frequent value in column 'previewLink': None
Most frequent value in column 'publisher': None
Most frequent value in column 'publishedDate': None
Most frequent value in column 'infoLink': None
Most frequent value in column 'categories': None
Most frequent value in column 'ratingsCount': None


In [10]:
# Fill missing values with a default value
default_value = "Unknown"
books_data_df = books_data_df.fillna(default_value, subset=['Title', 'description','publisher','authors','categories'])
default_value2 = "Not Available"
books_data_df= books_data_df.fillna(default_value2, subset=['image', 'previewLink','infoLink'])
books_data_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               Title|         description|             authors|               image|         previewLink|           publisher|       publishedDate|            infoLink|          categories|        ratingsCount|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Its Only Art If I...|             Unknown|    ['Julie Strain']|http://books.goog...|http://books.goog...|             Unknown|                1996|http://books.goog...|['Comics & Graphi...|                null|
|Dr. Seuss: Americ...|"Philip Nel takes...| like that of Lew...| has changed lang...| giving us new wo...| inspiring artist...|      ['Philip Nel']|http

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = books_data_df.count()
null_percentages = []
for col_name in books_data_df.columns:
    null_count = books_data_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Title': 0.00%
Null percentage in column 'description': 0.00%
Null percentage in column 'authors': 0.00%
Null percentage in column 'image': 0.00%
Null percentage in column 'previewLink': 0.00%
Null percentage in column 'publisher': 0.00%
Null percentage in column 'publishedDate': 12.17%
Null percentage in column 'infoLink': 0.00%
Null percentage in column 'categories': 0.00%
Null percentage in column 'ratingsCount': 69.94%


In [12]:
threshold = 0.5  # Drop columns with more than 50% null values
books_data_df = books_data_df.drop(*[c for c in books_data_df.columns if books_data_df.where(col(c).isNull()).count() / books_data_df.count() > threshold])

In [13]:
from pyspark.sql.functions import col, to_date

# Convert 'publishedDate' column to DateType
books_data_df = books_data_df.withColumn('publishedDate', to_date(col('publishedDate'), 'yyyy-MM-dd'))

# Show the DataFrame after conversion
print("After conversion:")
books_data_df.show()

After conversion:
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+
|               Title|         description|             authors|               image|         previewLink|           publisher|publishedDate|            infoLink|          categories|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+
|Its Only Art If I...|             Unknown|    ['Julie Strain']|http://books.goog...|http://books.goog...|             Unknown|         null|http://books.goog...|['Comics & Graphi...|
|Dr. Seuss: Americ...|"Philip Nel takes...| like that of Lew...| has changed lang...| giving us new wo...| inspiring artist...|         null|http://books.goog...|http://books.goog...|
|Wonderful Worship...|This resource inc...|    ['David R. Ray'

In [14]:

books_data_df.printSchema()

root
 |-- Title: string (nullable = false)
 |-- description: string (nullable = false)
 |-- authors: string (nullable = false)
 |-- image: string (nullable = false)
 |-- previewLink: string (nullable = false)
 |-- publisher: string (nullable = false)
 |-- publishedDate: date (nullable = true)
 |-- infoLink: string (nullable = false)
 |-- categories: string (nullable = false)



In [15]:
from pyspark.sql.functions import when
default_value = "2022-06-01"
# Fill null values in 'publishedDate' column with the default value
books_data_df = books_data_df.withColumn('publishedDate', when(books_data_df.publishedDate.isNull(), default_value).otherwise(books_data_df.publishedDate))
# Show the DataFrame after filling
books_data_df.show()


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+
|               Title|         description|             authors|               image|         previewLink|           publisher|publishedDate|            infoLink|          categories|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+
|Its Only Art If I...|             Unknown|    ['Julie Strain']|http://books.goog...|http://books.goog...|             Unknown|   2022-06-01|http://books.goog...|['Comics & Graphi...|
|Dr. Seuss: Americ...|"Philip Nel takes...| like that of Lew...| has changed lang...| giving us new wo...| inspiring artist...|   2022-06-01|http://books.goog...|http://books.goog...|
|Wonderful Worship...|This resource inc...|    ['David R. Ray']|http://books.goo

In [16]:

books_data_df = books_data_df.withColumnRenamed('previewLink', 'preview_Link') \
                             .withColumnRenamed('publishedDate', 'published_Date')\
                              .withColumnRenamed('infoLink', 'info_Link')

books_data_df.show()


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+
|               Title|         description|             authors|               image|        preview_Link|           publisher|published_Date|           info_Link|          categories|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+
|Its Only Art If I...|             Unknown|    ['Julie Strain']|http://books.goog...|http://books.goog...|             Unknown|    2022-06-01|http://books.goog...|['Comics & Graphi...|
|Dr. Seuss: Americ...|"Philip Nel takes...| like that of Lew...| has changed lang...| giving us new wo...| inspiring artist...|    2022-06-01|http://books.goog...|http://books.goog...|
|Wonderful Worship...|This resource inc...|    ['David R. Ray']|http://book

In [17]:
print("Before drop dublication:",books_data_df.count())
books_data_df.dropDuplicates()
print("After drop dublication:",books_data_df.count())

Before drop dublication: 212404
After drop dublication: 212404


In [18]:
from pyspark.sql import functions as F
# Count the number of unique states
unique_states_df = books_data_df.agg(F.countDistinct("Title").alias("unique_Titles"))
unique_states_df.show()
# Count the number of unique cities
unique_cities_df = books_data_df.agg(F.countDistinct("categories").alias("unique_categories"))
unique_cities_df.show()

+-------------+
|unique_Titles|
+-------------+
|       212400|
+-------------+

+-----------------+
|unique_categories|
+-----------------+
|            28362|
+-----------------+



In [19]:
books_data_df.groupBy("authors").count().show()
books_data_df.groupBy("publisher").count().show()
books_data_df.groupBy("authors").count().show()
books_data_df.groupBy("publisher").count().show()

+--------------------+-----+
|             authors|count|
+--------------------+-----+
|  ['Barbara Melosh']|    1|
|    ['Donald Cline']|    1|
|     ['Dian Layton']|    2|
| ['Sergius Golowin']|    1|
|       ['Kotoyama,']|    1|
|   ['Joseph Kerman']|    1|
|     ['Kay Flowers']|    1|
|     ['John Rewald']|    3|
|"" To a very stro...|    1|
|['I. Ristic', 'Ia...|    1|
|['Andrew P. Tobias']|    3|
|['Judith Ennamora...|    2|
|['Jamgon Kongtrul...|    1|
|['Frank Miller', ...|    1|
|['National Resear...|    1|
| all of whom were...|    1|
|['William B. Park...|    1|
|      ['Max Fogiel']|    3|
|      ['Jules Bass']|    1|
|  ['Rebecca Harvin']|    1|
+--------------------+-----+
only showing top 20 rows

+--------------------+-----+
|           publisher|count|
+--------------------+-----+
|        Lorenz Books|   22|
|       The New Press|   15|
|Janes Information...|    7|
|National Committe...|    1|
|            Capstone|   82|
|          Soma Books|    5|
| perhaps the mos

In [20]:
from pyspark.sql.functions import regexp_replace

# Replace any links with an empty string in the publisher column
cleaned_df = books_data_df.withColumn("publisher", regexp_replace("publisher", "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", ""))

# Show the cleaned DataFrame
cleaned_df.show()


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+
|               Title|         description|             authors|               image|        preview_Link|           publisher|published_Date|           info_Link|          categories|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+
|Its Only Art If I...|             Unknown|    ['Julie Strain']|http://books.goog...|http://books.goog...|             Unknown|    2022-06-01|http://books.goog...|['Comics & Graphi...|
|Dr. Seuss: Americ...|"Philip Nel takes...| like that of Lew...| has changed lang...| giving us new wo...| inspiring artist...|    2022-06-01|http://books.goog...|http://books.goog...|
|Wonderful Worship...|This resource inc...|    ['David R. Ray']|http://book

In [21]:
from pyspark.sql import functions as F
# Count the number of unique states
unique_description_df =books_data_df.agg(F.countDistinct("description").alias("unique_descriptiones"))
unique_description_df.show()
# Count the number of unique cities
unique_publisher_df = books_data_df.agg(F.countDistinct("publisher").alias("unique_publishers"))
unique_publisher_df.show()

+--------------------+
|unique_descriptiones|
+--------------------+
|              133257|
+--------------------+

+-----------------+
|unique_publishers|
+-----------------+
|            34265|
+-----------------+



In [22]:
from pyspark.sql import functions as F

# Define the URL pattern for checking
url_pattern = "%http%"

# Filter the DataFrame based on the URL pattern in the publisher column
url_publisher_df = books_data_df.filter(F.col("publisher").like(url_pattern))
url_publisher_df.show()


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+
|               Title|         description|             authors|               image|        preview_Link|           publisher|published_Date|           info_Link|          categories|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+
|Open marriage;: A...|Advocates the imp...|  "[""Nena O'Neill""| ""George O'Neill...|http://books.goog...|http://books.goog...|    2022-06-01|                1984|http://books.goog...|
|Alternative Chica...|"""""Alternative ...|         adventurous| or slightly offb...|      ['Bill Franz']|http://books.goog...|    2022-06-01|Cumberland House ...|          2004-03-01|
|   The Unquiet Grave|"This enduring cl...| no matter how ma...| will never

In [23]:
from pyspark.sql import functions as F

# Define the URL pattern for checking
url_pattern = "%http%"

# Filter the DataFrame based on the URL pattern in the publisher column
url_publisher_df = books_data_df.filter(F.col("publisher").like(url_pattern))

# Count the number of URL values in the publisher column
url_count = url_publisher_df.count()
print("Number of URL values in the 'publisher' column:", url_count)


Number of URL values in the 'publisher' column: 5099


In [24]:
from pyspark.sql import functions as F

# Define the URL pattern for checking
url_pattern = "%http%"

# Create a new column with the "publisher" column filled with "Unknown" for URL values
filled_publisher_df = books_data_df.withColumn(
    "publisher_filled",
    F.when(F.col("publisher").like(url_pattern), "Unknown").otherwise(F.col("publisher"))
)

# Show the DataFrame with the filled values
filled_publisher_df.show()


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+
|               Title|         description|             authors|               image|        preview_Link|           publisher|published_Date|           info_Link|          categories|    publisher_filled|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+
|Its Only Art If I...|             Unknown|    ['Julie Strain']|http://books.goog...|http://books.goog...|             Unknown|    2022-06-01|http://books.goog...|['Comics & Graphi...|             Unknown|
|Dr. Seuss: Americ...|"Philip Nel takes...| like that of Lew...| has changed lang...| giving us new wo...| inspiring artist...|    2022-06-01|http://books.goog...|http://books.

In [25]:
from pyspark.sql import functions as F

# Define the URL pattern for checking
url_pattern = "%http%"

# Create a new column with the "publisher" column filled with "Unknown" for URL values
filled_publisher_df = books_data_df.withColumn(
    "publisher_filled",
    F.when(F.col("publisher").like(url_pattern), "Unknown").otherwise(F.col("publisher"))
)

# Group by the filled publisher column and count the occurrences of each distinct value
publisher_counts_df = filled_publisher_df.groupBy("publisher_filled").count()

# Show the DataFrame with counts of each distinct value in the filled publisher column
publisher_counts_df.show()


+--------------------+-----+
|    publisher_filled|count|
+--------------------+-----+
|        Lorenz Books|   22|
|       The New Press|   15|
|Janes Information...|    7|
|National Committe...|    1|
|            Capstone|   82|
|          Soma Books|    5|
| perhaps the most...|    1|
|University Roches...|   16|
| and through his ...|    2|
|      Celestial Arts|   39|
|French & European...|    2|
|Arcadia Publishin...|    3|
| and cultural iss...|    1|
|   Ssar Publications|    1|
|    Random House LLC|   11|
|['Joseph Henry Ja...|    2|
|  Instructional Fair|    8|
|['Erwin Schroding...|    2|
|John Benjamins Pu...|    8|
|          Helen Hunt|    1|
+--------------------+-----+
only showing top 20 rows



In [27]:
books_data_df.groupBy("Title").count().show()
books_data_df.groupBy("categories").count().show()

+--------------------+-----+
|               Title|count|
+--------------------+-----+
|Isaac Asimov: Mas...|    1|
|     White Rock Ways|    1|
|The Face of the T...|    1|
|Your Signature Li...|    1|
|     Iridescent Soul|    1|
|L'Alchimiste (Cof...|    1|
|  The Book of Garlic|    1|
|A Jesse Stuart Ha...|    1|
|Raymond Chandler:...|    1|
|      Badenheim 1939|    1|
|        Pagan Babies|    1|
|The Self and its ...|    1|
|The Educated Chil...|    1|
|Future Perfect - ...|    1|
|The cornet of hor...|    1|
|Basic Arabic Work...|    1|
|Organizational Th...|    1|
|Oz and Beyond: Th...|    1|
|Fundamentals of I...|    1|
|We Love Baseball!...|    1|
+--------------------+-----+
only showing top 20 rows

+----------------------+-----+
|            categories|count|
+----------------------+-----+
|   Toronto Globe an...|    1|
|       ['Arboviruses']|    1|
|  "[""Children's so...|   11|
|   Gopnik shows tha...|    1|
|  ['Melanchthon, Ph...|    1|
|   and always naked...|    1|

In [28]:
from pyspark.sql import functions as F

# Define the URL pattern for checking in the categories column
url_pattern = "%http%"

# Filter out rows with URL values in the categories column
filtered_categories_df = books_data_df.filter(~F.col("categories").like(url_pattern))

# Show the DataFrame without rows containing URL values in the categories column
filtered_categories_df.show()


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+
|               Title|         description|             authors|               image|        preview_Link|           publisher|published_Date|           info_Link|          categories|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+
|Its Only Art If I...|             Unknown|    ['Julie Strain']|http://books.goog...|http://books.goog...|             Unknown|    2022-06-01|http://books.goog...|['Comics & Graphi...|
|Wonderful Worship...|This resource inc...|    ['David R. Ray']|http://books.goog...|http://books.goog...|             Unknown|    2022-06-01|http://books.goog...|        ['Religion']|
|Whispers of the W...|Julia Thomas find...| ['Veronica Haddon']|http://book

In [29]:
distinct_counts_df = books_data_df.agg(
    F.countDistinct("publisher").alias("distinct_publishers"),
    F.countDistinct("categories").alias("distinct_categories")
)

distinct_counts_df.show()


+-------------------+-------------------+
|distinct_publishers|distinct_categories|
+-------------------+-------------------+
|              34265|              28362|
+-------------------+-------------------+



In [33]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("YourApp") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()

published_per_day = books_data_df.groupBy("published_Date").count().orderBy("published_Date")
published_per_day.show() 

+--------------+-----+
|published_Date|count|
+--------------+-----+
|    0101-01-01|    6|
|    1016-10-11|    1|
|    1869-01-01|    1|
|    1894-01-01|    1|
|    1900-01-01|    2|
|    1908-01-01|    2|
|    1909-09-01|    1|
|    1911-11-01|    1|
|    1913-04-15|    1|
|    1916-10-13|    1|
|    1920-03-31|    1|
|    1920-06-15|    1|
|    1921-12-15|    1|
|    1924-01-31|    1|
|    1926-01-01|    1|
|    1927-01-01|    1|
|    1929-01-01|    4|
|    1930-05-27|    1|
|    1931-12-12|    1|
|    1932-01-01|    1|
+--------------+-----+
only showing top 20 rows



In [None]:
df_customers = df_customers.drop('country')
df_customers.show()

# Clean is done