In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running in local mode, then grab its
# SparkContext, which is the entry point for the RDD API used in this notebook.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
# Turn a local Python list into an RDD.
data = sc.parallelize([1, 2, 3, 4, 5])
# Printing the RDD shows its descriptor, not its elements.
print(data)

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274


In [5]:
# Rebind `data` to an RDD of (key, value) pairs; key 3 occurs three times.
data = sc.parallelize([
    (1, 2),
    (3, 4),
    (3, 6),
    (3, 4),
])
# collect() brings every element back as a Python list.
data.collect()

[(1, 2), (3, 4), (3, 6), (3, 4)]

In [6]:
# Confirm that parallelize() handed back an RDD object.
type(data)

pyspark.rdd.RDD

In [13]:
# Number of elements in the RDD (each (key, value) pair counts as one).
data.count()

4

In [8]:
# Request five partitions explicitly rather than relying on the default.
num = sc.parallelize([5, 5, 4, 3, 2, 9, 2], numSlices=5)

# Check how many partitions the RDD ended up with.
num.getNumPartitions()


5

In [9]:
# map() applies the function to every element: here, double each number.
num.map(lambda value: value * 2).collect()

[10, 10, 8, 6, 4, 18, 4]

In [10]:
# Cube each number.
num.map(lambda value: value ** 3).collect()

[125, 125, 64, 27, 8, 729, 8]

In [11]:
# A small RDD of strings.
names = sc.parallelize(["Bills", "Mark", "Brain", "Mick"])
# map() works on any element type: prefix each name with a title.
names.map(lambda name: "Mr. " + name).collect()

['Mr. Bills', 'Mr. Mark', 'Mr. Brain', 'Mr. Mick']

In [14]:
# Input for the map() vs. flatMap() comparison in the following cells.
rdd = sc.parallelize([2, 3, 4])
rdd.collect()

[2, 3, 4]

In [15]:
# map() yields exactly one result per input, so each element becomes a
# whole range object.
rdd.map(lambda upper: range(1, upper)).collect()

[range(1, 2), range(1, 3), range(1, 4)]

In [16]:
# flatMap() flattens the iterable returned for each input into one sequence
# of individual elements.
rdd.flatMap(lambda upper: range(1, upper)).collect()

[1, 1, 2, 1, 2, 3]

In [17]:
a = sc.parallelize([1, 2, 3])
# Same function under both transformations: map() keeps each returned tuple
# as a single element ...
print(a.map(lambda n: (n, n * 10, 57)).collect())
# ... while flatMap() unpacks every tuple into separate elements.
print(a.flatMap(lambda n: (n, n * 10, 57)).collect())

[(1, 10, 57), (2, 20, 57), (3, 30, 57)]
[1, 10, 57, 2, 20, 57, 3, 30, 57]


In [18]:
# Rebind `rdd` to a new dataset for the map() vs. mapPartitions() comparison.
rdd = sc.parallelize([1, 2, 3, 4, 5])
# Element-wise doubling: the lambda is called once per element.
result_map = rdd.map(lambda value: value * 2).collect()
print("Résultat avec map:", result_map)

Résultat avec map: [2, 4, 6, 8, 10]


In [19]:
def multiply_by_two(iterator):
    """Yield each element of ``iterator`` multiplied by two.

    Written for use with ``mapPartitions``: it receives an iterable of
    elements rather than a single element and produces the doubled values
    lazily, one at a time.
    """
    for value in iterator:
        yield value * 2

# mapPartitions() passes the function an iterator over the elements rather
# than a single element; the collected result matches the map() version.
result_mapPartitions = (
    rdd
    .mapPartitions(multiply_by_two)
    .collect()
)

print("Résultat avec mapPartitions:", result_mapPartitions)

Résultat avec mapPartitions: [2, 4, 6, 8, 10]


In [20]:
# Show the contents of `num` again before filtering it in the next cell.
num.collect()

[5, 5, 4, 3, 2, 9, 2]

In [21]:
# Keep only the numbers whose remainder modulo 3 is 1.
num.filter(lambda value: value % 3 == 1).collect()

[4]

In [22]:
# Keep only the names that contain the letter "a".
# (A bare `names.collect()` used to precede this line; its result was discarded
# because only a cell's last expression is displayed, so it only cost an
# extra Spark job.)
names.filter(lambda name: "a" in name).collect()

['Mark', 'Brain']

In [23]:
# Sort on the first letter only: ascending first, then descending.
print(names.sortBy(lambda name: name[0], ascending=True).collect())
print(names.sortBy(lambda name: name[0], ascending=False).collect())

['Bills', 'Brain', 'Mark', 'Mick']
['Mark', 'Mick', 'Bills', 'Brain']


In [24]:
# Second numeric RDD, used below for the set-style operations against `num`.
num2 = sc.parallelize([1, 1, 7, 9, 4, 10, 15])
num2.collect()

[1, 1, 7, 9, 4, 10, 15]

In [25]:
# union() concatenates the two RDDs; duplicates are kept.
num3 = num2.union(num)
num3.collect()

[1, 1, 7, 9, 4, 10, 15, 5, 5, 4, 3, 2, 9, 2]

In [26]:
# Sort the unioned values themselves, largest first.
num3.sortBy(lambda value: value, ascending=False).collect()

[15, 10, 9, 9, 7, 5, 5, 4, 4, 3, 2, 2, 1, 1]

In [27]:
# distinct() drops the repeated values from the unioned RDD.
# NOTE(review): the order of the returned elements is presumably not
# guaranteed — confirm before relying on it.
num3.distinct().collect()

[1, 2, 3, 4, 5, 7, 9, 10, 15]

In [28]:
# RDD holding the integers 1 through 19 (the range end is exclusive).
parallel = sc.parallelize(range(1, 20))
parallel.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [29]:
# Draw a random sample with replacement, with an expected fraction of 0.2.
# A fixed seed makes the draw repeatable across re-runs (for the same
# partitioning of `parallel`); without it every execution returns a
# different sample. The sampled elements depend on the seed chosen.
parallel.sample(withReplacement=True, fraction=0.2, seed=42).collect()

[1, 4, 15]

In [31]:
# Group the names by their first letter and bring the (letter, group) pairs
# back to the driver.
# (A bare `names.collect()` used to precede this line; its result was neither
# stored nor displayed, so it only cost an extra Spark job.)
names_gr = names.groupBy(lambda name: name[0]).collect()

In [32]:
# Each group value is a ResultIterable, so the raw repr shows object
# addresses rather than the grouped names; convert to list to see them.
names_gr

[('M', <pyspark.resultiterable.ResultIterable at 0x7f138021a770>),
 ('B', <pyspark.resultiterable.ResultIterable at 0x7f138024bd60>)]

In [33]:
# Materialize every group as a list so its members can be printed.
for initial, group in names_gr:
    print(initial, list(group))

M ['Mark', 'Mick']
B ['Bills', 'Brain']


In [35]:
# NOTE(review): the output recorded under this cell (lists keyed 0/1/2) does
# not look like an intersection result; the identical call in the next cell
# shows [4, 9]. The recorded output is presumably stale from an earlier
# version of this cell — re-run it or delete the duplicate.
num.intersection(num2).collect()

0 [3]
1 [1, 1]
2 [2, 5, 8]


In [36]:
# Elements that occur in both `num` and `num2`.
num.intersection(num2).collect()

[4, 9]

In [37]:
# Same intersection with the operands swapped; the recorded result is the same.
num2.intersection(num).collect()

[4, 9]

In [38]:
# subtract() keeps the elements of the left RDD that do not occur in the
# right one, so the result depends on operand order. Repeated values in the
# left RDD are kept (see 2, 2 / 5, 5 and 1, 1 in the output).
print(num.subtract(num2).collect())
print(num2.subtract(num).collect())

[2, 2, 3, 5, 5]
[1, 1, 7, 10, 15]


In [39]:
# Pair RDD of (number, name) tuples; 'mike' appears under two different keys.
dataStr = sc.parallelize([
    (3, 'mike'),
    (2, 'john'),
    (3, 'rambo'),
    (4, 'bill'),
    (1, 'mike'),
])
dataStr.collect()

[(3, 'mike'), (2, 'john'), (3, 'rambo'), (4, 'bill'), (1, 'mike')]

In [40]:
# countByValue() counts occurrences of each whole element. Every
# (number, name) tuple here is unique, so all counts are 1.
dataStr.countByValue()

defaultdict(int,
            {(3, 'mike'): 1,
             (2, 'john'): 1,
             (3, 'rambo'): 1,
             (4, 'bill'): 1,
             (1, 'mike'): 1})

In [41]:
# Pair RDD with an unsorted key column.
data1 = sc.parallelize([
    (1, 2),
    (3, 6),
    (3, 1),
    (2, 4),
])
# Order the pairs by key only; the values play no part in the ordering.
data1.sortByKey().collect()

[(1, 2), (2, 4), (3, 6), (3, 1)]

In [42]:
# lookup(key) returns the list of values stored under that key.
data1.lookup(3)

[6, 1]

In [43]:
# NOTE: this and the following cells use `data` (the pair RDD built earlier
# from (1, 2), (3, 4), (3, 6), (3, 4)), not the `data1` defined just above.
# keys() extracts the first item of every pair.
data.keys().collect()

[1, 3, 3, 3]

In [44]:
# Square the value of every pair while leaving the key untouched.
data.mapValues(lambda value: value ** 2).collect()

[(1, 4), (3, 16), (3, 36), (3, 16)]

In [45]:
# Show the input pairs first, for comparison.
print(data.collect())
# Sum the values that share a key.
print(data.reduceByKey(lambda total, value: total + value).collect())

[(1, 2), (3, 4), (3, 6), (3, 4)]
[(1, 2), (3, 14)]


In [46]:
# Passing the built-in max as the reducer keeps the largest value per key.
data.reduceByKey(max).collect()

[(1, 2), (3, 6)]

In [47]:
# Gather all values of each key, then materialize every group as a list so it
# can be printed.
result = data.groupByKey().collect()
for key, values in result:
    print(key, list(values))

1 [2]
3 [4, 6, 4]


In [48]:
# Rebind `names` to a larger list that contains repeated entries
# ('Mark' occurs three times) for the counting examples below.
names = sc.parallelize([
    'Adam', 'Cray', 'Shaun', 'Brain', 'Mark', 'Christ', 'Shail',
    'Satya', 'Mark', 'Norby', 'Frans', 'Mark', 'Bill',
])
print(names.collect())

['Adam', 'Cray', 'Shaun', 'Brain', 'Mark', 'Christ', 'Shail', 'Satya', 'Mark', 'Norby', 'Frans', 'Mark', 'Bill']


In [49]:
# Count how many times each name occurs; only 'Mark' appears more than once.
names.countByValue()

defaultdict(int,
            {'Adam': 1,
             'Cray': 1,
             'Shaun': 1,
             'Brain': 1,
             'Mark': 3,
             'Christ': 1,
             'Shail': 1,
             'Satya': 1,
             'Norby': 1,
             'Frans': 1,
             'Bill': 1})

In [50]:
# take(n) returns only the first n elements instead of the whole RDD.
names.take(5)

['Adam', 'Cray', 'Shaun', 'Brain', 'Mark']

In [51]:
# Total number of elements, repeated names included.
names.count()

13

In [52]:
# Number of unique names: two fewer than the total because 'Mark' occurs
# three times.
names.distinct().count()

11