In [0]:
!wget -q http://mirror.its.dal.ca/apache/spark/spark-2.3.4/spark-2.3.4-bin-hadoop2.7.tgz 

In [0]:
!ls /content/
!tar xf /content/spark-2.3.4-bin-hadoop2.7.tgz
!pip -qq install findspark
!pip -qq install numpy
!pip -qq install pandas
!pip -qq install matplotlib

sample_data  spark-2.3.4-bin-hadoop2.7.tgz


In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.4-bin-hadoop2.7/"
print (os.listdir('/content/spark-2.3.4-bin-hadoop2.7/'))

import findspark
findspark.init()

['bin', 'sbin', 'kubernetes', 'yarn', 'python', 'data', 'licenses', 'NOTICE', 'jars', 'examples', 'conf', 'LICENSE', 'R', 'README.md', 'RELEASE']


In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!ls /content/drive/My\ Drive/bigFiles

AOL-01.txt  AOL_README.txt  author-large.txt


In [0]:
cp -r /content/drive/My\ Drive/bigFiles /content/

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Learning_Spark") \
    .getOrCreate()

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("Author", StringType(), True),
    StructField("Field", StringType(), True),
    StructField("BookName", StringType(), True),
    StructField("Year", IntegerType(), True)])


data = spark.read.csv('bigFiles/author-large.txt',inferSchema=True, header=False,sep='\t',schema=schema)
data.count(), len(data.columns)

(2225370, 4)

In [0]:
rdd = spark.sparkContext.textFile('bigFiles/AOL_README.txt')

def splitLine(line):
  return line.split(" ")

def StopWords(listWords):
  stopWords = ['this','is','of',"","for"]
  for word in listWords:
    if word and word.casefold() in stopWords:
      listWords.remove(word)
  return listWords

def StopWords2(word):
  stopWords = ['this','is','of',"","for"]
  return word if word and word.casefold() not in stopWords else ''

rdd2 = rdd.flatMap(splitLine)

rdd3 = rdd2.map(StopWords2)

rdd3.filter(lambda x: x not in '' and '--' not in x).distinct().take(6)



['500k', 'User', 'Session', 'Collection', 'collection', 'NON-COMMERCIAL']

In [0]:
data.show(5)
data.printSchema()

+------------------+--------------------+--------------------+----+
|            Author|               Field|            BookName|Year|
+------------------+--------------------+--------------------+----+
| Jurgen Annevelink|Modern Database S...|Object SQL - A La...|1995|
|       Rafiul Ahad|Modern Database S...|Object SQL - A La...|1995|
|    Amelia Carlson|Modern Database S...|Object SQL - A La...|1995|
| Daniel H. Fishman|Modern Database S...|Object SQL - A La...|1995|
|Michael L. Heytens|Modern Database S...|Object SQL - A La...|1995|
+------------------+--------------------+--------------------+----+
only showing top 5 rows

root
 |-- Author: string (nullable = true)
 |-- Field: string (nullable = true)
 |-- BookName: string (nullable = true)
 |-- Year: integer (nullable = true)



In [0]:
data.select('*').show(15, truncate=False)

+-----------------------+-----------------------+------------------------------------------------------------------------------+----+
|Author                 |Field                  |BookName                                                                      |Year|
+-----------------------+-----------------------+------------------------------------------------------------------------------+----+
|Jurgen Annevelink      |Modern Database Systems|Object SQL - A Language for the Design and Implementation of Object Databases.|1995|
|Rafiul Ahad            |Modern Database Systems|Object SQL - A Language for the Design and Implementation of Object Databases.|1995|
|Amelia Carlson         |Modern Database Systems|Object SQL - A Language for the Design and Implementation of Object Databases.|1995|
|Daniel H. Fishman      |Modern Database Systems|Object SQL - A Language for the Design and Implementation of Object Databases.|1995|
|Michael L. Heytens     |Modern Database Systems|Object SQL - 

In [0]:
data.describe(['Year']).show()

+-------+------------------+
|summary|              Year|
+-------+------------------+
|  count|           2225362|
|   mean|2002.8766793896903|
| stddev| 6.021949010636045|
|    min|              1959|
|    max|              2010|
+-------+------------------+



In [0]:
cond = data.Year == '' 
data.filter(cond).select('*').show(20, truncate=False)

+------+-----+--------+----+
|Author|Field|BookName|Year|
+------+-----+--------+----+
+------+-----+--------+----+



In [0]:
from pyspark.sql.functions import col
notNulYearCount = data.withColumn("is_year_null",  col("Year").isNull()).filter(col('is_year_null') == False).count()

nulYearCount = data.withColumn("is_year_null",  col("Year").isNull()).filter(col('is_year_null') == True).count()

nulAuthorCount = data.withColumn("is_author_null",  col("Author").isNull()).filter(col('is_author_null') == True).count()

nulBookNameCount = data.withColumn("is_BookName_null",  col("BookName").isNull()).filter(col('is_BookName_null') == True).count()

nulFieldCount = data.withColumn("is_Field_null",  col("Field").isNull()).filter(col('is_Field_null') == True).count()

nulYearCount, nulAuthorCount, nulBookNameCount, nulFieldCount

(8, 0, 0, 0)

In [0]:
(data.count() - notNulYearCount)

2225362

In [0]:
yearsCount = data.groupBy("Year").count().sort(col("Year"))
yearsCount.show(15)

+----+-----+
|Year|count|
+----+-----+
|null|    8|
|1959|  123|
|1960|   10|
|1961|   38|
|1962|  204|
|1963|   16|
|1964|   33|
|1965|   52|
|1966|   44|
|1967|   59|
|1968|  562|
|1969|  191|
|1970|  170|
|1971|  719|
|1972|  257|
+----+-----+
only showing top 15 rows



In [0]:
arr = list(range(1,10))
rd = spark.sparkContext.parallelize(arr)


In [0]:
def factoriel(number):
  if number <= 1:
    return 1
  else:
    return number*factoriel(number-1)

rdd_fact = rd.map(factoriel)

sum = rdd_fact.reduce(lambda x,y: x+y)

# count = rdd_fact.reduce(lambda x,y: x[1]+y[1])

count = rdd_fact.count()
print(sum/count)

rdd_fact.take(10)

# rdd_fact.reduce(lambda x,y: x+y)




45457.0


[1, 2, 6, 24, 120, 720, 5040, 40320, 362880]