In [34]:
from pyspark.sql import SparkSession

# (8 cores, 16gb per machine) x 5 = 40 cores

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.87:7077") \
        .appName("Part_A_Raheel_Ali")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",4)\
        .getOrCreate()


# Old API (RDD)
spark_context = spark_session.sparkContext

In [35]:
### A1.1

#counting lines
def countLines(myrdd):
    lineCount = myrdd.map(lambda s: 1)
    lineCount.persist()
    totalLines = lineCount.reduce(lambda x,y:x+y)
    return totalLines

eng_1 = spark_context.textFile("hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.en")
totalLines_eng = countLines(eng_1)
print(totalLines_eng)

1862234


In [36]:
### A1.2

sv_1 = spark_context.textFile("hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.sv")
totalLines_sv = countLines(sv_1)
print(totalLines_sv)

1862234


In [37]:
### A1.3

print("Is the line count same for english and swedish language? "+ str(totalLines_eng == totalLines_sv) )

Is the line count same for english and swedish language? True


In [38]:
### A1.4

print("Partitions for English transcripts : "+ str(eng_1.getNumPartitions()))
print("Partitions for Swedish transcripts : "+ str(sv_1.getNumPartitions()))

Partitions for English transcripts : 2
Partitions for Swedish transcripts : 3


In [39]:
### A2.1

# Function for Text Preprocessing
def PreProcessText(myrdd):
    myrdd = myrdd.lower()
    myrdd = myrdd.split(' ')
    return myrdd

eng_2 = eng_1.map(PreProcessText)
sv_2 = sv_1.map(PreProcessText)

In [40]:
### A2.2 (Part 1)

eng_2.take(10)

[['resumption', 'of', 'the', 'session'],
 ['i',
  'declare',
  'resumed',
  'the',
  'session',
  'of',
  'the',
  'european',
  'parliament',
  'adjourned',
  'on',
  'friday',
  '17',
  'december',
  '1999,',
  'and',
  'i',
  'would',
  'like',
  'once',
  'again',
  'to',
  'wish',
  'you',
  'a',
  'happy',
  'new',
  'year',
  'in',
  'the',
  'hope',
  'that',
  'you',
  'enjoyed',
  'a',
  'pleasant',
  'festive',
  'period.'],
 ['although,',
  'as',
  'you',
  'will',
  'have',
  'seen,',
  'the',
  'dreaded',
  "'millennium",
  "bug'",
  'failed',
  'to',
  'materialise,',
  'still',
  'the',
  'people',
  'in',
  'a',
  'number',
  'of',
  'countries',
  'suffered',
  'a',
  'series',
  'of',
  'natural',
  'disasters',
  'that',
  'truly',
  'were',
  'dreadful.'],
 ['you',
  'have',
  'requested',
  'a',
  'debate',
  'on',
  'this',
  'subject',
  'in',
  'the',
  'course',
  'of',
  'the',
  'next',
  'few',
  'days,',
  'during',
  'this',
  'part-session.'],
 ['in',
  

In [41]:
### A2.2 (Part 2)

sv_2.take(10)

[['återupptagande', 'av', 'sessionen'],
 ['jag',
  'förklarar',
  'europaparlamentets',
  'session',
  'återupptagen',
  'efter',
  'avbrottet',
  'den',
  '17',
  'december.',
  'jag',
  'vill',
  'på',
  'nytt',
  'önska',
  'er',
  'ett',
  'gott',
  'nytt',
  'år',
  'och',
  'jag',
  'hoppas',
  'att',
  'ni',
  'haft',
  'en',
  'trevlig',
  'semester.'],
 ['som',
  'ni',
  'kunnat',
  'konstatera',
  'ägde',
  '"den',
  'stora',
  'år',
  '2000-buggen"',
  'aldrig',
  'rum.',
  'däremot',
  'har',
  'invånarna',
  'i',
  'ett',
  'antal',
  'av',
  'våra',
  'medlemsländer',
  'drabbats',
  'av',
  'naturkatastrofer',
  'som',
  'verkligen',
  'varit',
  'förskräckliga.'],
 ['ni',
  'har',
  'begärt',
  'en',
  'debatt',
  'i',
  'ämnet',
  'under',
  'sammanträdesperiodens',
  'kommande',
  'dagar.'],
 ['till',
  'dess',
  'vill',
  'jag',
  'att',
  'vi,',
  'som',
  'ett',
  'antal',
  'kolleger',
  'begärt,',
  'håller',
  'en',
  'tyst',
  'minut',
  'för',
  'offren',
  'f

In [42]:
### A2.3

totalLines_eng2 = countLines(eng_2)
print("Total line counts for english :" + str(totalLines_eng2))
totalLines_sv2 = countLines(sv_2)
print("Total line counts for swedish :" + str(totalLines_sv2))
print("Is the line count same for the engish and swedish language after pre processing? "+ str(totalLines_eng2 == totalLines_sv2) )

Total line counts for english :1862234
Total line counts for swedish :1862234
Is the line count same for the engish and swedish language after pre processing? True


In [44]:
## A3.1 (English)

eng_2.flatMap(lambda x: x).map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y).sortBy(lambda x: x[1],ascending=False).take(10)


[('the', 3498375),
 ('of', 1659758),
 ('to', 1539760),
 ('and', 1288401),
 ('in', 1085993),
 ('that', 797516),
 ('a', 773522),
 ('is', 758050),
 ('for', 534242),
 ('we', 522849)]

In [48]:
### A3.1 (Swedish)

sv_2.flatMap(lambda x: x).map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y).sortBy(lambda x: x[1],ascending=False).take(10)

[('att', 1706293),
 ('och', 1344830),
 ('i', 1050774),
 ('det', 924866),
 ('som', 913276),
 ('för', 908680),
 ('av', 738068),
 ('är', 694381),
 ('en', 620310),
 ('vi', 539797)]

In [51]:
### A4.1 and 4.2

eng_3 = eng_2.zipWithIndex().map(lambda z : (z[1],z[0]))
sv_3 = sv_2.zipWithIndex().map(lambda z : (z[1],z[0]))

In [52]:
#English lines by their line numbers
eng_3.take(2)

[(0, ['resumption', 'of', 'the', 'session']),
 (1,
  ['i',
   'declare',
   'resumed',
   'the',
   'session',
   'of',
   'the',
   'european',
   'parliament',
   'adjourned',
   'on',
   'friday',
   '17',
   'december',
   '1999,',
   'and',
   'i',
   'would',
   'like',
   'once',
   'again',
   'to',
   'wish',
   'you',
   'a',
   'happy',
   'new',
   'year',
   'in',
   'the',
   'hope',
   'that',
   'you',
   'enjoyed',
   'a',
   'pleasant',
   'festive',
   'period.'])]

In [53]:
#Swedish lines by their line numbers
sv_3.take(2)

[(0, ['återupptagande', 'av', 'sessionen']),
 (1,
  ['jag',
   'förklarar',
   'europaparlamentets',
   'session',
   'återupptagen',
   'efter',
   'avbrottet',
   'den',
   '17',
   'december.',
   'jag',
   'vill',
   'på',
   'nytt',
   'önska',
   'er',
   'ett',
   'gott',
   'nytt',
   'år',
   'och',
   'jag',
   'hoppas',
   'att',
   'ni',
   'haft',
   'en',
   'trevlig',
   'semester.'])]

In [54]:
### A4.3
joinedLang = eng_3.join(sv_3)

In [55]:
### A4.4
filterTheMissing = joinedLang.filter(lambda x: x[1][1] and x[1][0])

In [56]:
### A4.4
filterTheMissing.take(4)

[(96010,
  (['in',
    'barcelona,',
    'the',
    'vital',
    'contribution',
    'of',
    'civil',
    'society',
    'to',
    'the',
    'development',
    'of',
    'partnerships',
    'was',
    'recognised',
    'for',
    'the',
    'first',
    'time',
    'in',
    'a',
    'text',
    'to',
    'which',
    'the',
    'european',
    'union',
    'and',
    '15',
    'countries',
    'of',
    'the',
    'southern',
    'mediterranean',
    'subscribed.'],
   ['i',
    'barcelona',
    'erkändes',
    'för',
    'första',
    'gången',
    'i',
    'en',
    'text',
    'som',
    'innebär',
    'åtaganden',
    'för',
    'europeiska',
    'unionen',
    'och',
    'de',
    'femton',
    'länderna',
    'söder',
    'om',
    'medelhavsområdet',
    'det',
    'avsevärda',
    'bidrag',
    'som',
    'det',
    'civila',
    'samhället',
    'kan',
    'lämna',
    'till',
    'utvecklingen',
    'av',
    'partnerskapet.'])),
 (97090,
  (['this',
    'has',
    'nothi

In [57]:
### A4.5 and 4.6
filterWordsPerSent = filterTheMissing.filter(lambda x: ((len(x[1][1]) < 5) and (len(x[1][0]) < 5)) and (len(x[1][1])==len(x[1][0])))

In [58]:
filterWordsPerSent.take(10)

[(746710,
  (['this', 'failure', 'was', 'incendiary.'],
   ['denna', 'underlåtenhet', 'orsakade', 'bränderna.'])),
 (823715, (['-', 'report:', 'gauzès'], ['-', 'betänkande:', 'gauzès'])),
 (811800,
  (['they', 'need', 'our', 'help.'], ['de', 'behöver', 'vår', 'hjälp.'])),
 (767510, (['7.'], ['7.'])),
 (339730,
  (['questions', 'to', 'commissioner', 'diamantopoulou'],
   ['frågor', 'till', 'kommissionsledamot', 'diamantopoulou'])),
 (750605, (['but', 'how?'], ['men', 'hur?'])),
 (58675, (['.'], ['.'])),
 (230775,
  (['thank', 'you,', 'commissioner', 'diamantopoulou.'],
   ['tack,', 'fru', 'kommissionär', 'diamantopoulou.'])),
 (494290, (['no.'], ['nej.'])),
 (879535,
  (['written', 'statements', '(rule', '142)'],
   ['skriftliga', 'förklaringar', '(artikel', '142)']))]

In [59]:
### A4.7
sentencePairs = filterWordsPerSent.map(lambda x: list(zip(x[1][0],x[1][1]))).flatMap(lambda x: x).map(lambda word: (word, 1))

In [60]:
### A4.7
sentencePairs.take(8)

[(('.', '.'), 1),
 (('tashi', 'tashi'), 1),
 (('delek!', 'delek!'), 1),
 (('.', 'skriftlig.'), 1),
 (('the', 'jag'), 1),
 (('debate', 'förklarar'), 1),
 (('is', 'debatten'), 1),
 (('closed.', 'avslutad.'), 1)]

In [61]:
### A4.8
wordTranPairCount = sentencePairs.reduceByKey(lambda x, y: x + y).sortBy(lambda x: x[1],ascending=False)

In [62]:
### A4.8
wordTranPairCount.take(25)

[(('(applause)', '(applåder)'), 2546),
 (('closed.', 'avslutad.'), 2534),
 (('is', 'är'), 2380),
 (('', '.'), 2223),
 (('.', '.'), 2082),
 (('the', 'jag'), 1324),
 (('is', 'debatten'), 1324),
 (('debate', 'förklarar'), 1317),
 (('the', 'debatten'), 1225),
 (('is', 'härmed'), 1215),
 (('debate', 'är'), 1187),
 (('(rule', '(artikel'), 893),
 (('that', 'det'), 852),
 (('written', 'skriftliga'), 847),
 (('\xa0\xa0', '\xa0\xa0'), 842),
 (('statements', 'förklaringar'), 801),
 (('we', 'vi'), 636),
 (('i', 'jag'), 629),
 (('this', 'detta'), 582),
 (('142)', '142)'), 557),
 (('it', 'det'), 515),
 (('applause', 'applåder'), 461),
 (('2.', '2.'), 438),
 (('1.', '1.'), 438),
 (('there', 'det'), 429)]

In [63]:
### A4.9
wordTranPairCount2 = wordTranPairCount.top(10, lambda x: x[1])
for word, freq in wordTranPairCount2:
    print(word, freq)

('(applause)', '(applåder)') 2546
('closed.', 'avslutad.') 2534
('is', 'är') 2380
('', '.') 2223
('.', '.') 2082
('is', 'debatten') 1324
('the', 'jag') 1324
('debate', 'förklarar') 1317
('the', 'debatten') 1225
('is', 'härmed') 1215


In [64]:
# release the cores for another application!
spark_context.stop()