In [None]:
pip install pyspark

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, expr, array_union, collect_list
from pyspark.ml.fpm import FPGrowth
import pandas as pd

In [19]:
spark = SparkSession.builder.appName("AssociationRules").getOrCreate()

# Replace 'your_dataset.csv' with the actual filename
df = spark.read.csv("/kaggle/input/recommendation-system/books.csv", header=True, inferSchema=True)


                                                                                

In [20]:
df.columns

['bookId',
 'title',
 'series',
 'author',
 'rating',
 'description',
 'language',
 'isbn',
 'genres',
 'characters',
 'bookFormat',
 'edition',
 'pages',
 'publisher',
 'publishDate',
 'firstPublishDate',
 'awards',
 'numRatings',
 'ratingsByStars',
 'likedPercent',
 'setting',
 'coverImg',
 'bbeScore',
 'bbeVotes',
 'price']

In [21]:
# Define a UDF to split columns by delimiter and clean up data
def split_and_clean(column):
    return split(column, "[',\\[\\]]")

# Apply UDF to preprocess columns
df = df.withColumn("genres", split_and_clean(df["genres"])) \
       .withColumn("characters", split_and_clean(df["characters"])) \
       .withColumn("awards", split_and_clean(df["awards"])) \
       .withColumn("setting", split_and_clean(df["setting"]))

# Combine all lists into a single column
df = df.withColumn("items", array_union(array_union(array_union("genres", "characters"), "awards"), "setting"))

# Remove null or empty items
df = df.withColumn("items", expr("filter(items, x -> x is not null and x != '')"))

# Remove duplicate rows
df = df.dropDuplicates()

In [22]:
# Assuming 'bookId' or 'title' can be used as a unique identifier
transactions = df.groupBy("bookId").agg(collect_list("items").alias("items"))

In [25]:
transactions.head(1)

                                                                                

[Row(bookId=None, items=[[' his companion for many years) is an absorbing', ' highly entertaining', ' Roderigo', 's oldest son', ' the Renaissance is in full swing as religion competes against humanism and the Church seeks autonomous control of what will one day become a united Italy. As in E. L. Doctrow', 's Ragtime and Glen David Gold', 's Carter Beats the Devil'], [' the manifestation of evil in the Lord of the Rings.Six thousand years before the One Ring is destroyed', ' Middle-earth lies under the shadow of the Dark Lord Morgoth. The greatest warriors among elves and men have perished', 'Fantasy', ' ', 'Fiction', 'Classics', 'High Fantasy', 'Epic Fantasy', 'Science Fiction Fantasy', 'Adventure', 'Novels', 'Literature', 'Epic', '313'], ['Action'], [' pretentious cheese whiz', ' pompous bestselling author', ' and unforgiving enemies to find a killer before Lara’s past casts a tainted pall on the festival’s future.RECIPES INCLUDED"'], [' freedom can be a burden. A frank defense of ho

In [28]:
fp_growth = FPGrowth(itemsCol="items", minSupport=0.02, minConfidence=0.5)
model = fp_growth.fit(transactions)

# Display frequent itemsets
model.freqItemsets.show(truncate=False)

# Display association rules
model.associationRules.show(truncate=False)

# Display predictions
model.transform(transactions).show(truncate=False)


In [30]:
## ************************** COMBINDED CODE *******************************

from pyspark.sql import SparkSession
from pyspark.sql.functions import split, collect_list, array_union, expr

# Initialize Spark session
spark = SparkSession.builder.appName("AssociationRules").getOrCreate()

# Load data into DataFrame
df = spark.read.csv("/kaggle/input/recommendation-system/books.csv", header=True, inferSchema=True)

# Select relevant columns
selected_columns = ['isbn', 'genres', 'characters', 'awards', 'setting']
df = df.select(selected_columns)

# Define a UDF to split columns by delimiter and clean up data
def split_and_clean(column):
    return split(column, "[',\\[\\]]")

# Apply UDF to preprocess columns
for column in selected_columns[1:]:  # excluding 'isbn'
    df = df.withColumn(column, split_and_clean(df[column]))

# Combine all lists into a single column
df = df.withColumn("items", array_union(array_union(array_union("genres", "characters"), "awards"), "setting"))

# Remove null or empty items
df = df.withColumn("items", expr("filter(items, x -> x is not null and x != '')"))

# Remove duplicate rows
df = df.dropDuplicates()

# Limit DataFrame to 100 rows
df = df.limit(100)

# Group by the unique identifier 'isbn' to create transactions
transactions = df.groupBy("isbn").agg(collect_list("items").alias("items"))

# Apply FP-Growth
fp_growth = FPGrowth(itemsCol="items", minSupport=0.02, minConfidence=0.5)
model = fp_growth.fit(transactions)

# Set Spark configuration for better display
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.conf.set("spark.sql.repl.eagerEval.maxNumRows", 100)
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100)

# Display frequent itemsets
print("Frequent Itemsets:")
model.freqItemsets.show(truncate=False)

# Display association rules
print("Association Rules:")
model.associationRules.show(truncate=False)

# Display predictions
print("Predictions:")
model.transform(transactions).show(truncate=False)


                                                                                

Frequent Itemsets:
+-----+----+
|items|freq|
+-----+----+
+-----+----+

Association Rules:
+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

Predictions:




+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                