In [48]:
#Dataframe manipulation 
import pandas as pd
import numpy as np

#NLTK library
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

# country library
import pycountry

#plotting library
import matplotlib.pyplot as plt

In [2]:
#import csv
df_1 = pd.read_csv('articles_1.csv')
df_2 = pd.read_csv('articles_2.csv')

In [3]:
# concatenating the two dataframes
df = pd.concat([df_1, df_2], ignore_index=True)


In [6]:
#convert the data to datetime
df.article_date = pd.to_datetime(df.article_date)


In [84]:
#Select only the articles from 2019
df = df[(df['article_date'].dt.year == 2019)]

In [85]:
# Create a dataframe of only the articles  
df_articles = pd.DataFrame(df.article_body)

In [86]:
df_articles = df_articles.dropna(how='all')

In [96]:
articles_list =  []
for article in df_articles.article_body:
    articles_list.append((word_tokenize(article)))

In [51]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [112]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [114]:
for index,title in zip(df.index,df.article_title):
    for country in pycountry.countries:
        if country.name in title:
#             print(index,country.name)
            pass

In [117]:
# experimenting with array of texts
text_array = []
for article in df.article_body:
    text_array.append(article)

In [119]:
len(text_array)

1730

In [141]:
sent = preprocess(text_array[0])

In [151]:
pattern = 'NNP: {<DT>?<NN>*<JS>}'

In [152]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  Trina/NNP
  Solar/NNP
  recently/RB
  announced/VBD
  that/IN
  it/PRP
  has/VBZ
  begun/VBN
  mass/JJ
  production/NN
  of/IN
  N-type/JJ
  i-TOPCon/JJ
  double-glass/NN
  bifacial/JJ
  modules/NNS
  ./.
  Image/NN
  :/:
  Trina/NNP
  Solar/NNP
  's/POS
  new/JJ
  high/JJ
  performanceTSM-NEG15MC.20/NN
  (/(
  II/NNP
  )/)
  module/NN
  incorporates/VBZ
  i-TOPCon/JJ
  cell/NN
  technology/NN
  ./.
  Photo/NNP
  courtesy/NN
  of/IN
  Trina/NNP
  Solar/NNP
  Co./NNP
  ,/,
  Ltd.The/NNP
  best/JJS
  front/JJ
  side/NN
  power/NN
  output/NN
  of/IN
  a/DT
  module/NN
  with/IN
  144/CD
  half-cut/JJ
  i-TOPCon/JJ
  cells/NNS
  reaches/VBZ
  425/CD
  Wp/NNP
  ,/,
  and/CC
  the/DT
  best/JJS
  module/NN
  efficiency/NN
  reaches/VBZ
  20.7/CD
  %/NN
  .The/JJ
  new/JJ
  i-TOPCon/JJ
  double/JJ
  glass/NN
  PV/NNP
  modules/VBZ
  integrate/VB
  these/DT
  N-type/JJ
  bifacial/JJ
  i-TOPCon/JJ
  cells/NNS
  with/IN
  over/IN
  80/CD
  %/NN
  bifaciality/NN
  ,/,
  multi-busbar/NN
  (/

In [153]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

In [154]:
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('Trina', 'NNP', 'O'),
 ('Solar', 'NNP', 'O'),
 ('recently', 'RB', 'O'),
 ('announced', 'VBD', 'O'),
 ('that', 'IN', 'O'),
 ('it', 'PRP', 'O'),
 ('has', 'VBZ', 'O'),
 ('begun', 'VBN', 'O'),
 ('mass', 'JJ', 'O'),
 ('production', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('N-type', 'JJ', 'O'),
 ('i-TOPCon', 'JJ', 'O'),
 ('double-glass', 'NN', 'O'),
 ('bifacial', 'JJ', 'O'),
 ('modules', 'NNS', 'O'),
 ('.', '.', 'O'),
 ('Image', 'NN', 'O'),
 (':', ':', 'O'),
 ('Trina', 'NNP', 'O'),
 ('Solar', 'NNP', 'O'),
 ("'s", 'POS', 'O'),
 ('new', 'JJ', 'O'),
 ('high', 'JJ', 'O'),
 ('performanceTSM-NEG15MC.20', 'NN', 'O'),
 ('(', '(', 'O'),
 ('II', 'NNP', 'O'),
 (')', ')', 'O'),
 ('module', 'NN', 'O'),
 ('incorporates', 'VBZ', 'O'),
 ('i-TOPCon', 'JJ', 'O'),
 ('cell', 'NN', 'O'),
 ('technology', 'NN', 'O'),
 ('.', '.', 'O'),
 ('Photo', 'NNP', 'O'),
 ('courtesy', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('Trina', 'NNP', 'O'),
 ('Solar', 'NNP', 'O'),
 ('Co.', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Ltd.The', 'NNP', 'O'),
 ('

In [157]:
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(text_array[0])))
print(ne_tree)

(S
  (GPE Trina/NNP)
  (PERSON Solar/NNP)
  recently/RB
  announced/VBD
  that/IN
  it/PRP
  has/VBZ
  begun/VBN
  mass/JJ
  production/NN
  of/IN
  N-type/JJ
  i-TOPCon/JJ
  double-glass/NN
  bifacial/JJ
  modules/NNS
  ./.
  Image/NN
  :/:
  (PERSON Trina/NNP Solar/NNP)
  's/POS
  new/JJ
  high/JJ
  performanceTSM-NEG15MC.20/NN
  (/(
  II/NNP
  )/)
  module/NN
  incorporates/VBZ
  i-TOPCon/JJ
  cell/NN
  technology/NN
  ./.
  (PERSON Photo/NNP)
  courtesy/NN
  of/IN
  (GPE Trina/NNP)
  Solar/NNP
  Co./NNP
  ,/,
  Ltd.The/NNP
  best/JJS
  front/JJ
  side/NN
  power/NN
  output/NN
  of/IN
  a/DT
  module/NN
  with/IN
  144/CD
  half-cut/JJ
  i-TOPCon/JJ
  cells/NNS
  reaches/VBZ
  425/CD
  Wp/NNP
  ,/,
  and/CC
  the/DT
  best/JJS
  module/NN
  efficiency/NN
  reaches/VBZ
  20.7/CD
  %/NN
  .The/JJ
  new/JJ
  i-TOPCon/JJ
  double/JJ
  glass/NN
  PV/NNP
  modules/VBZ
  integrate/VB
  these/DT
  N-type/JJ
  bifacial/JJ
  i-TOPCon/JJ
  cells/NNS
  with/IN
  over/IN
  80/CD
  %/NN
  bifaci

In [None]:
import nltk
import re
import time

exampleArray = ['The incredibly intimidating NLP scares people away who are sissies.']


contentArray =['Starbucks is not doing very well lately.',
               'Overall, while it may seem there is already a Starbucks on every corner, Starbucks still has a lot of room to grow.',
               'They just began expansion into food products, which has been going quite well so far for them.',
               'I can attest that my own expenditure when going to Starbucks has increased, in lieu of these food products.',
               'Starbucks is also indeed expanding their number of stores as well.',
               'Starbucks still sees strong sales growth here in the united states, and intends to actually continue increasing this.',
               'Starbucks also has one of the more successful loyalty programs, which accounts for 30%  of all transactions being loyalty-program-based.',
               'As if news could not get any more positive for the company, Brazilian weather has become ideal for producing coffee beans.',
               'Brazil is the world\'s #1 coffee producer, the source of about 1/3rd of the entire world\'s supply!',
               'Given the dry weather, coffee farmers have amped up production, to take as much of an advantage as possible with the dry weather.',
               'Increase in supply... well you know the rules...',]



##let the fun begin!##
def processLanguage():
    try:
        for item in contentArray:
            tokenized = nltk.word_tokenize(item)
            tagged = nltk.pos_tag(tokenized)
            print (tagged)

            namedEnt = nltk.ne_chunk(tagged)
            namedEnt.draw()

            time.sleep(1)

    except (Exception, e):
        print (e)
        

processLanguage()

[('Starbucks', 'NNS'), ('is', 'VBZ'), ('not', 'RB'), ('doing', 'VBG'), ('very', 'RB'), ('well', 'RB'), ('lately', 'RB'), ('.', '.')]
[('Overall', 'JJ'), (',', ','), ('while', 'IN'), ('it', 'PRP'), ('may', 'MD'), ('seem', 'VB'), ('there', 'EX'), ('is', 'VBZ'), ('already', 'RB'), ('a', 'DT'), ('Starbucks', 'NNS'), ('on', 'IN'), ('every', 'DT'), ('corner', 'NN'), (',', ','), ('Starbucks', 'NNP'), ('still', 'RB'), ('has', 'VBZ'), ('a', 'DT'), ('lot', 'NN'), ('of', 'IN'), ('room', 'NN'), ('to', 'TO'), ('grow', 'VB'), ('.', '.')]
[('They', 'PRP'), ('just', 'RB'), ('began', 'VBD'), ('expansion', 'NN'), ('into', 'IN'), ('food', 'NN'), ('products', 'NNS'), (',', ','), ('which', 'WDT'), ('has', 'VBZ'), ('been', 'VBN'), ('going', 'VBG'), ('quite', 'RB'), ('well', 'RB'), ('so', 'RB'), ('far', 'RB'), ('for', 'IN'), ('them', 'PRP'), ('.', '.')]
[('I', 'PRP'), ('can', 'MD'), ('attest', 'VB'), ('that', 'IN'), ('my', 'PRP$'), ('own', 'JJ'), ('expenditure', 'NN'), ('when', 'WRB'), ('going', 'VBG'), ('to