# PySpark Part of Speech (POS) analysis
Text taken from [Reuters](https://www.reuters.com/business/finance/banks-beware-outsiders-are-cracking-code-finance-2021-09-17/).

In [1]:
import nltk
from pyspark import SparkContext

In [2]:
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to C:\Users\Alfredo
[nltk_data]     Salazar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Alfredo Salazar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Entry point for working with RDD
sc = SparkContext(appName = "pyspark-pos-analysis")

In [4]:
# Loading a text file
rdd_reuters = sc.textFile("./data/reuters.txt")

In [5]:
# Line counter inside text file
rdd_reuters.count()

87

In [6]:
# Retrieve first words
rdd_reuters.flatMap(lambda x: x.split(" ")).take(15)

['Banks',
 'beware,',
 'Amazon',
 'and',
 'Walmart',
 'are',
 'cracking',
 'the',
 'code',
 'for',
 'finance',
 '',
 'LONDON,',
 'Sept',
 '17']

In [7]:
# Data frame
rdd_reuters.map(lambda x: x.word_tokenize().take(87))

PythonRDD[4] at RDD at PythonRDD.scala:53

In [8]:
# Classified data frame
lines = sc.textFile("./data/reuters.txt")
lineLengths = lines.map(lambda s: len(s))
totalLength = lineLengths.reduce(lambda a, b: a + b)
lineLengths.persist()

PythonRDD[8] at RDD at PythonRDD.scala:53

In [9]:
# We recover lines of text
rdd_reuters.take(10)

['Banks beware, Amazon and Walmart are cracking the code for finance',
 '',
 'LONDON, Sept 17 (Reuters) - Anyone can be a banker these days, you just need the right code.',
 '',
 'Global brands from Mercedes and Amazon (AMZN.O) to IKEA and Walmart (WMT.N) are cutting out the traditional financial middleman and plugging in software from tech startups to offer customers everything from banking and credit to insurance.',
 '',
 '',
 'So-called embedded finance - a fancy term for companies integrating software to offer financial services - means Amazon can let customers "buy now pay later" when they check out and Mercedes drivers can get their cars to pay for their fuel.',
 '']

In [10]:
# We collect the text
rdd_reuters.collect()

['Banks beware, Amazon and Walmart are cracking the code for finance',
 '',
 'LONDON, Sept 17 (Reuters) - Anyone can be a banker these days, you just need the right code.',
 '',
 'Global brands from Mercedes and Amazon (AMZN.O) to IKEA and Walmart (WMT.N) are cutting out the traditional financial middleman and plugging in software from tech startups to offer customers everything from banking and credit to insurance.',
 '',
 '',
 'So-called embedded finance - a fancy term for companies integrating software to offer financial services - means Amazon can let customers "buy now pay later" when they check out and Mercedes drivers can get their cars to pay for their fuel.',
 '',
 'To be sure, banks are still behind most of the transactions but investors and analysts say the risk for traditional lenders is that they will get pushed further away from the front end of the finance chain.',
 '',
 "And that means they'll be further away from the mountains of data others are hoovering up about the 

In [11]:
# Word frequency
rdd_reuters = sc.textFile("./data/reuters.txt")
rdd_reuters.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda x: (x[1], x[0])) \
    .sortByKey(ascending = False) \
    .map(lambda x: (x[1], x[0])) \
    .take(200)

[('', 43),
 ('the', 41),
 ('to', 41),
 ('and', 34),
 ('of', 25),
 ('in', 18),
 ('for', 18),
 ('a', 16),
 ('is', 14),
 ('are', 12),
 ('financial', 12),
 ('that', 12),
 ('banks', 11),
 ('said', 10),
 ('be', 10),
 ('their', 10),
 ('at', 9),
 ('from', 9),
 ('they', 9),
 ('finance', 8),
 ('-', 8),
 ('with', 8),
 ('as', 7),
 ('we', 7),
 ('will', 7),
 ('on', 7),
 ('companies', 6),
 ('have', 6),
 ('more', 6),
 ('services', 6),
 ('data', 6),
 ('by', 6),
 ("don't", 6),
 ('out', 5),
 ('embedded', 5),
 ('this', 5),
 ('Amazon', 5),
 ('can', 5),
 ('offer', 5),
 ('customers', 5),
 ('said.', 5),
 ('digital', 5),
 ('payments', 5),
 ('firm', 5),
 ('traditional', 4),
 ('banking', 4),
 ('but', 4),
 ('he', 4),
 ('technology', 4),
 ('into', 4),
 ('last', 4),
 ('other', 4),
 ('ask', 4),
 ('billion', 4),
 ('products', 4),
 ('which', 4),
 ('means', 3),
 ('now', 3),
 ('pay', 3),
 ('lenders', 3),
 ('end', 3),
 ('an', 3),
 ('consumer', 3),
 ('Capital', 3),
 ('go', 3),
 ('bank', 3),
 ('month', 3),
 ('BNPL', 3),
 (

In [12]:
# Txt Collect Array
rdd_reuters = sc.textFile("./data/reuters.txt").collect()
tokens = []
for i in rdd_reuters : 
    tokens += nltk.word_tokenize(i)
print(tokens)



In [13]:
# Txt tokenization
tokens = nltk.pos_tag(tokens)
print(tokens)



In [14]:
# Word NLTK frequency
VB = 0
VBG = 0
VBZ = 0 
VBP = 0
NN = 0
NNS = 0 
NNP = 0
NNPS = 0
JJ = 0 
JJR = 0
JJS = 0
DT = 0
WDT = 0 
RB = 0
RBS = 0
RBR = 0
WP = 0
IN = 0
CC = 0


for i in tokens:
    if (i[1] == "VB"):
        VB +=1
    elif (i[1] == "VBG"):
        VBG +=1
    elif (i[1] == "VBZ"):
        VBZ +=1
    elif  (i[1] == "VBP"):
        VBP +=1
    elif (i[1] == "NN"):
        NN +=1
    elif (i[1] == "NNS"):
        NNS +=1
    elif (i[1] == "NNP"):
        NNP +=1
    elif (i[1] == "NNPS"):
        NNPS +=1
    elif (i[1] == "JJ"):
        JJ +=1
    elif (i[1] == "JJR"):
        JJR +=1
    elif (i[1] == "JJS"):
        JJS +=1
    elif (i[1] == "DT"):
        DT +=1
    elif (i[1] == "WDT"):
        WDT +=1
    elif (i[1] == "RB"):
        RB +=1
    elif (i[1] == "RBS"):
        RBS +=1
    elif (i[1]=="RBR"):
        RBR +=1
    elif (i[1]=="WP"):
        WP +=1
    elif i[1] == "IN":
        IN +=1
    elif (i[1] == "CC"):
        CC +=1
            
print("VB = "+str(VB))
print("VBG = "+str(VBG))
print("VBZ = "+str(VBZ))
print("VBP = "+str(VBP))
print("NN = "+str(NN))
print("NNS = "+str(NNS))
print("NNP = "+str(NNP))
print("NNPS = "+str(NNPS))
print("JJ = "+str(JJ))
print("JJR = "+str(JJR))
print("JJS = "+str(JJS))
print("DT = "+str(DT))
print("WDT = "+str(WDT))
print("RB = "+str(RB))
print("RBS = "+str(RBS))
print("RBR = "+str(RBR))
print("IN = "+str(IN))
print("CC = "+str(CC))


VB = 61
VBG = 28
VBZ = 39
VBP = 41
NN = 165
NNS = 123
NNP = 133
NNPS = 1
JJ = 87
JJR = 5
JJS = 3
DT = 81
WDT = 8
RB = 51
RBS = 1
RBR = 1
IN = 143
CC = 45


In [15]:
sc.stop()