## Setup

In [84]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
openjdk-8-jdk-headless is already the newest version (8u352-ga-1~18.04).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


## Import Packages

In [85]:
from itertools import combinations

import numpy
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [86]:
# Build (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .appName("Spark_Processor")
    .master("local[*]")
    .getOrCreate()
)

# Handle to the low-level RDD API used throughout the notebook.
sc = spark.sparkContext

## Reading Data

In [87]:
# Load the raw text file as an RDD; the path is relative to the working directory.
records = sc.textFile('records.txt')

In [88]:
# Split each line on whitespace so every record becomes a list of item ids.
raw_data = records.map(lambda line: line.split())
raw_data.take(5)

[['FRO11987', 'ELE17451', 'ELE89019', 'SNA90258', 'GRO99222'],
 ['GRO99222',
  'GRO12298',
  'FRO12685',
  'ELE91550',
  'SNA11465',
  'ELE26917',
  'ELE52966',
  'FRO90334',
  'SNA30755',
  'ELE17451',
  'FRO84225',
  'SNA80192'],
 ['ELE17451', 'GRO73461', 'DAI22896', 'SNA99873', 'FRO86643'],
 ['ELE17451', 'ELE37798', 'FRO86643', 'GRO56989', 'ELE23393', 'SNA11465'],
 ['ELE17451',
  'SNA69641',
  'FRO86643',
  'FRO78087',
  'SNA11465',
  'GRO39357',
  'ELE28573',
  'ELE11375',
  'DAI54444']]

## In the next cell we flatMap the data so that each item is a separate element of an RDD.

In [41]:
# Flatten all lines into a single RDD with one element per item occurrence.
items = records.flatMap(lambda line: line.split())
items.take(5)

['FRO11987', 'ELE17451', 'ELE89019', 'SNA90258', 'GRO99222']

## In the next cell we map each item to a pair (x, 1) and then count the occurrences of each item.

In [89]:
# Tag every item occurrence with a count of 1 ...
records_map = items.map(lambda token: (token, 1))
# ... then add the 1s up per item to get its total number of occurrences.
count_items = records_map.reduceByKey(lambda total, extra: total + extra)
count_items.take(5)

[('FRO11987', 104),
 ('SNA90258', 550),
 ('ELE91550', 23),
 ('ELE52966', 380),
 ('FRO90334', 63)]

## In the next cell we find the frequent single items, which are used to build the 2-member and 3-member baskets in the following parts.

In [90]:
# Minimum count an item (or itemset) needs in order to be considered frequent.
support = 100

# Keep the items that reach the threshold, ordered from most to least frequent.
sorted_frequent_single_items = (
    count_items
    .filter(lambda pair: pair[1] >= support)
    .sortBy(lambda pair: pair[1], ascending=False)
)
sorted_frequent_single_items.take(5)

[('DAI62779', 6667),
 ('FRO40251', 3881),
 ('ELE17451', 3875),
 ('GRO73461', 3602),
 ('SNA80324', 3044)]

## In this cell we print the frequent single items, most frequent first

In [91]:
# Print every frequent single item as an (item, count) tuple, one per line.
print("The most single frequent items: ")
# collect() brings the whole sorted RDD back to the driver before printing.
for item in sorted_frequent_single_items.collect():
    print(item)

The most single frequent items: 
('DAI62779', 6667)
('FRO40251', 3881)
('ELE17451', 3875)
('GRO73461', 3602)
('SNA80324', 3044)
('ELE32164', 2851)
('DAI75645', 2736)
('SNA45677', 2455)
('FRO31317', 2330)
('DAI85309', 2293)
('ELE26917', 2292)
('FRO80039', 2233)
('GRO21487', 2115)
('SNA99873', 2083)
('GRO59710', 2004)
('GRO71621', 1920)
('FRO85978', 1918)
('GRO30386', 1840)
('ELE74009', 1816)
('GRO56726', 1784)
('DAI63921', 1773)
('GRO46854', 1756)
('ELE66600', 1713)
('DAI83733', 1712)
('FRO32293', 1702)
('ELE66810', 1697)
('SNA55762', 1646)
('DAI22177', 1627)
('FRO78087', 1531)
('ELE99737', 1516)
('GRO94758', 1489)
('ELE34057', 1489)
('FRO35904', 1436)
('FRO53271', 1420)
('SNA93860', 1407)
('SNA90094', 1390)
('GRO38814', 1352)
('ELE56788', 1345)
('GRO61133', 1321)
('DAI88807', 1316)
('ELE74482', 1316)
('ELE59935', 1311)
('SNA96271', 1295)
('DAI43223', 1290)
('ELE91337', 1289)
('GRO15017', 1275)
('DAI31081', 1261)
('GRO81087', 1220)
('DAI22896', 1219)
('GRO85051', 1214)
('ELE92920', 1197

## In the next cell we find the most frequent 2-member itemsets.
To find them, we first remove the infrequent items from the dataset, then generate every 2-member itemset from each remaining basket, and finally keep the itemsets that meet the support threshold.

In [92]:
# Names of all frequent single items, broadcast read-only to every executor.
list_frequent = sc.broadcast(sorted_frequent_single_items.map(lambda x: x[0]).collect())
# The same names as a frozenset: membership tests become O(1) hash lookups
# instead of a linear scan of the list for every item of every basket.
frequent_lookup = sc.broadcast(frozenset(list_frequent.value))
# Drop infrequent items; each basket becomes a set, which also de-duplicates it.
raw_data_removed_infrequent_items = raw_data.map(
    lambda basket: {item for item in basket if item in frequent_lookup.value}
)
raw_data_removed_infrequent_items.take(5)

[{'ELE17451', 'FRO11987', 'GRO99222', 'SNA90258'},
 {'ELE17451',
  'ELE26917',
  'ELE52966',
  'GRO12298',
  'GRO99222',
  'SNA11465',
  'SNA30755',
  'SNA80192'},
 {'DAI22896', 'ELE17451', 'FRO86643', 'GRO73461', 'SNA99873'},
 {'ELE17451', 'ELE37798', 'FRO86643', 'GRO56989', 'SNA11465'},
 {'DAI54444',
  'ELE11375',
  'ELE17451',
  'FRO78087',
  'FRO86643',
  'GRO39357',
  'SNA11465',
  'SNA69641'}]

In [93]:
def generate_two_member(sets):
    """Emit a ``(pair, 1)`` record for every 2-item combination of a basket.

    Args:
        sets: Iterable of mutually comparable items (one basket).

    Returns:
        A list of ``((a, b), 1)`` tuples with ``a <= b``, in lexicographic
        order of the pairs. A basket with fewer than two items yields ``[]``.
    """
    # Sorting once up front means every combination comes out already ordered,
    # so a given pair always maps to the same key and the output order no
    # longer depends on set iteration order.
    return [(pair, 1) for pair in combinations(sorted(sets), 2)]

# Expand every pruned basket into its candidate pairs, one (pair, 1) record each.
frequent_two_member_items = raw_data_removed_infrequent_items.flatMap(generate_two_member)

# Add up the 1s per pair to get the pair's total count.
frequent_two_member_items_count = frequent_two_member_items.reduceByKey(
    lambda left, right: left + right
)

# Keep only the pairs that reach the support threshold.
frequent_two_member_items_filter = frequent_two_member_items_count.filter(
    lambda pair_count: pair_count[1] >= support
)

# Order from most to least frequent.
frequent_two_member_items_sorted = frequent_two_member_items_filter.sortBy(
    lambda pair_count: pair_count[1], ascending=False
)
frequent_two_member_items_sorted.take(5)

[(('DAI62779', 'ELE17451'), 1592),
 (('FRO40251', 'SNA80324'), 1412),
 (('DAI75645', 'FRO40251'), 1254),
 (('FRO40251', 'GRO85051'), 1213),
 (('DAI62779', 'GRO73461'), 1139)]

In [94]:
# Print every frequent 2-member itemset as a ((a, b), count) tuple, one per line.
print("The most 2-member frequent itemsets: ")
# collect() brings the whole sorted RDD back to the driver before printing.
for item in frequent_two_member_items_sorted.collect():
    print(item)

The most 2-member frequent itemsets: 
(('DAI62779', 'ELE17451'), 1592)
(('FRO40251', 'SNA80324'), 1412)
(('DAI75645', 'FRO40251'), 1254)
(('FRO40251', 'GRO85051'), 1213)
(('DAI62779', 'GRO73461'), 1139)
(('DAI75645', 'SNA80324'), 1130)
(('DAI62779', 'FRO40251'), 1070)
(('DAI62779', 'SNA80324'), 923)
(('DAI62779', 'DAI85309'), 918)
(('ELE32164', 'GRO59710'), 911)
(('FRO40251', 'GRO73461'), 882)
(('DAI62779', 'DAI75645'), 882)
(('DAI62779', 'ELE92920'), 877)
(('FRO40251', 'FRO92469'), 835)
(('DAI62779', 'ELE32164'), 832)
(('DAI75645', 'GRO73461'), 712)
(('DAI43223', 'ELE32164'), 711)
(('DAI62779', 'GRO30386'), 709)
(('ELE17451', 'FRO40251'), 697)
(('DAI85309', 'ELE99737'), 659)
(('DAI62779', 'ELE26917'), 650)
(('GRO21487', 'GRO73461'), 631)
(('DAI62779', 'SNA45677'), 604)
(('ELE17451', 'SNA80324'), 597)
(('DAI62779', 'GRO71621'), 595)
(('DAI62779', 'SNA55762'), 593)
(('DAI62779', 'DAI83733'), 586)
(('ELE17451', 'GRO73461'), 580)
(('GRO73461', 'SNA80324'), 562)
(('DAI62779', 'GRO59710'), 

## In the next cell we find the most frequent 3-member itemsets.
To find them, we combine items three at a time to create 3-member itemsets, using a function defined for this purpose. The result is shown below:

In [96]:
def generate_three_member(sets):
    """Emit a ``(triple, 1)`` record for every 3-item combination of a basket.

    Args:
        sets: Iterable of mutually comparable items (one basket).

    Returns:
        A list of ``((a, b, c), 1)`` tuples with ``a <= b <= c``, in
        lexicographic order of the triples. A basket with fewer than three
        items yields ``[]``.
    """
    # Sorting once up front means every combination comes out already ordered,
    # so a given triple always maps to the same key and the output order no
    # longer depends on set iteration order.
    return [(triple, 1) for triple in combinations(sorted(sets), 3)]


# Expand every pruned basket into its candidate triples, one (triple, 1) record each.
frequent_three_member_items = raw_data_removed_infrequent_items.flatMap(generate_three_member)

# Add up the 1s per triple to get the triple's total count.
frequent_three_member_items_count = frequent_three_member_items.reduceByKey(
    lambda left, right: left + right
)

# Keep only the triples that reach the support threshold.
frequent_three_member_items_filter = frequent_three_member_items_count.filter(
    lambda triple_count: triple_count[1] >= support
)

# Order from most to least frequent.
frequent_three_member_items_sorted = frequent_three_member_items_filter.sortBy(
    lambda triple_count: triple_count[1], ascending=False
)
frequent_three_member_items_sorted.take(5)

[(('DAI75645', 'FRO40251', 'SNA80324'), 550),
 (('DAI62779', 'FRO40251', 'SNA80324'), 476),
 (('FRO40251', 'GRO85051', 'SNA80324'), 471),
 (('DAI62779', 'ELE92920', 'SNA18336'), 432),
 (('DAI62779', 'DAI75645', 'SNA80324'), 421)]

In [97]:
# Print every frequent 3-member itemset as a ((a, b, c), count) tuple, one per line.
print("The most 3-member frequent itemsets: ")
# collect() brings the whole sorted RDD back to the driver before printing.
for item in frequent_three_member_items_sorted.collect():
    print(item)

The most 3-member frequent itemsets: 
(('DAI75645', 'FRO40251', 'SNA80324'), 550)
(('DAI62779', 'FRO40251', 'SNA80324'), 476)
(('FRO40251', 'GRO85051', 'SNA80324'), 471)
(('DAI62779', 'ELE92920', 'SNA18336'), 432)
(('DAI62779', 'DAI75645', 'SNA80324'), 421)
(('DAI62779', 'ELE17451', 'SNA80324'), 417)
(('DAI62779', 'DAI75645', 'FRO40251'), 412)
(('DAI62779', 'ELE17451', 'FRO40251'), 406)
(('DAI75645', 'FRO40251', 'GRO85051'), 395)
(('DAI62779', 'FRO40251', 'GRO85051'), 381)
(('ELE17451', 'FRO40251', 'SNA80324'), 353)
(('DAI62779', 'ELE17451', 'ELE92920'), 345)
(('FRO40251', 'FRO92469', 'SNA80324'), 343)
(('DAI62779', 'DAI85309', 'ELE17451'), 339)
(('DAI62779', 'DAI75645', 'ELE17451'), 328)
(('DAI62779', 'FRO40251', 'GRO73461'), 315)
(('DAI62779', 'ELE32164', 'GRO59710'), 301)
(('DAI75645', 'ELE17451', 'SNA80324'), 300)
(('DAI75645', 'FRO40251', 'GRO73461'), 293)
(('DAI75645', 'ELE17451', 'FRO40251'), 292)
(('DAI43223', 'ELE32164', 'GRO59710'), 287)
(('DAI43223', 'DAI62779', 'ELE32164'),