In [3]:
import json
import os
import string

import data_parse
from nltk_verb_tense import get_verb_tenses

### File Parsing

In [4]:
# Parse the files under 'data'. Judging by the log and the keys printed below,
# the result is a dict keyed by file stem (e.g. 'weather_news', 'politics1').
# NOTE(review): values are presumably lists of article dicts - confirm in data_parse.
news = data_parse.parse_files('data')

Parsing file: weather_news.txt
Parsed weather_news to: data/weather_news.json
Parsing file: technology_news1.txt
Parsed technology_news1 to: data/technology_news1.json
Parsing file: politics1.txt
Parsed politics1 to: data/politics1.json
Parsing file: business_news.txt
Parsed business_news to: data/business_news.json
Parsing file: technology_news.txt
Parsed technology_news to: data/technology_news.json
Parsing file: politics.txt
Parsed politics to: data/politics.json
Parsing file: sports_news.txt
Parsed sports_news to: data/sports_news.json
Parsing file: sports_news1.txt
Parsed sports_news1 to: data/sports_news1.json
Parsing file: crime_news.txt
Parsed crime_news to: data/crime_news.json


In [5]:
print(news.keys())

dict_keys(['weather_news', 'technology_news1', 'politics1', 'business_news', 'technology_news', 'politics', 'sports_news', 'sports_news1', 'crime_news'])


Merge numbered files of the same category (e.g. `politics` and `politics1`) into a single category.

In [6]:
# Merge numbered splits of a category (e.g. 'politics1') into the base
# category name ('politics') by stripping trailing digits from the key.
appended_news = {}
for each_news, articles in news.items():
    category_news = each_news.rstrip(string.digits)
    # Always extend a fresh list owned by `appended_news`. Assigning the list
    # from `news` directly would alias it, and a later `+=` would then grow the
    # list inside `news` in place, duplicating articles whenever this cell is
    # re-run.
    appended_news.setdefault(category_news, []).extend(articles)
print(appended_news.keys())

dict_keys(['weather_news', 'technology_news', 'politics', 'business_news', 'sports_news', 'crime_news'])


### Analyse Verb Tenses

In [7]:
# Sample weather passage used to eyeball the output of get_verb_tenses.
# NOTE(review): "articleegins" in the text looks like a search-and-replace
# artifact of "begins" (the get_bigram_seq test cell spells it "begins");
# confirm before relying on this sample.
text1 = "The initial moisture will likely start as rain, and some thunderstorms could develop with heavy rain and small hail overnight, according to the National Weather Service in Boulder. But it will change over to a wintry mix and then to snow over the plains as we move into early Saturday morning. A series of storm systems will bring colder temps & snow beginning Friday night & continuing through the weekend. Precipitation articleegins as rain Friday & early Saturday for lower elevations. Roads are expected to become slick at times, especially in the mountains. #cowx pic.twitter.com/fRJlcOuWDs"

# x: tense summary dict with 'sequence', 'tenses' and 'words' keys (see output).
# y: sequence of (word, POS tag) pairs, as used by the following cells.
x, y = get_verb_tenses(text1)
print(x)

{'sequence': ['future', 'future', 'present', 'future', 'present', 'future', 'present', 'present', 'present', 'past'], 'tenses': {'future': 4, 'present': 5, 'past': 1}, 'words': [(3, 'will', 'future'), (12, 'could', 'future'), (22, 'according', 'present'), (33, 'will', 'future'), (49, 'move', 'present'), (60, 'will', 'future'), (70, 'continuing', 'present'), (76, 'articleegins', 'present'), (88, 'are', 'present'), (89, 'expected', 'past')]}


In [8]:
# Distinct POS tags in order of first appearance; dict keys preserve
# insertion order, so this keeps the ordering of the original scan.
tense_tags = list(dict.fromkeys(tagged[1] for tagged in y))
print(tense_tags)

['DT', 'JJ', 'NN', 'MD', 'RB', 'VB', 'IN', ',', 'CC', 'NNS', 'VBG', 'TO', 'NNP', '.', 'PRP', 'VBP', 'VBZ', 'JJR', 'VBN', '#']


In [9]:
# Show the position and (word, tag) pair of every token tagged "VBP".
for position, tagged_word in enumerate(y):
    if tagged_word[1] != "VBP":
        continue
    print(position, tagged_word)

49 ('move', 'VBP')
88 ('are', 'VBP')


### Bigram Tense Sequence Finder

In [12]:
def get_bigram_seq(article):
    """Compute relative frequencies of adjacent verb-tense pairs (bigrams).

    Parameters
    ----------
    article : str or iterable of str
        A single text, or a collection of texts (e.g. sentences). A string
        (including instances of ``str`` subclasses) is treated as one text,
        never as an iterable of characters.

    Returns
    -------
    tuple of (dict, list)
        ``(bigram_freq, tense_list)``. ``bigram_freq`` maps
        ``"<tense> <next tense>"`` to that bigram's share of all bigrams
        counted over every text, so the values sum to 1 when any bigram was
        found and the dict is empty otherwise. ``tense_list`` holds the first
        value returned by ``get_verb_tenses`` for each text whose tense
        sequence has at least two entries.
    """
    # isinstance (rather than an exact type comparison) so str subclasses are
    # wrapped too instead of being iterated character by character.
    if isinstance(article, str):
        article = [article]

    bigram_counts = {}
    tense_list = []
    for sentence in article:
        tense, _ = get_verb_tenses(sentence)
        tense_seq = tense['sequence']
        # A bigram needs at least two tenses; skip empty/single-tense texts.
        if not tense_seq or len(tense_seq) < 2:
            continue
        tense_list.append(tense)
        for first, second in zip(tense_seq, tense_seq[1:]):
            key = first + " " + second
            bigram_counts[key] = bigram_counts.get(key, 0) + 1

    # Normalise by the total number of bigrams across all texts. With no
    # bigrams the comprehension is empty, so no division takes place.
    total_bigrams = sum(bigram_counts.values())
    bigram_freq = {key: count / total_bigrams for key, count in bigram_counts.items()}
    return bigram_freq, tense_list

#### Testing 

In [13]:
# Smoke test: feed the weather passage as two separate texts. x receives the
# normalised bigram frequencies, y the per-text tense summaries.
x, y = get_bigram_seq(["The initial moisture will likely start as rain, and some thunderstorms could develop with heavy rain and small hail overnight, according to the National Weather Service in Boulder. ", "But it will change over to a wintry mix and then to snow over the plains as we move into early Saturday morning. A series of storm systems will bring colder temps & snow beginning Friday night & continuing through the weekend. Precipitation begins as rain Friday & early Saturday for lower elevations. Roads are expected to become slick at times, especially in the mountains. #cowx pic.twitter.com/fRJlcOuWDs"])

In [14]:
x

{'future future': 0.125,
 'future present': 0.375,
 'present future': 0.125,
 'present present': 0.25,
 'present past': 0.125}

In [15]:
y

[{'sequence': ['future', 'future', 'present'],
  'tenses': {'future': 2, 'present': 1},
  'words': [(3, 'will', 'future'),
   (12, 'could', 'future'),
   (22, 'according', 'present')]},
 {'sequence': ['future',
   'present',
   'future',
   'present',
   'present',
   'present',
   'past'],
  'tenses': {'future': 2, 'present': 4, 'past': 1},
  'words': [(2, 'will', 'future'),
   (18, 'move', 'present'),
   (29, 'will', 'future'),
   (39, 'continuing', 'present'),
   (45, 'begins', 'present'),
   (57, 'are', 'present'),
   (58, 'expected', 'past')]}]

In [16]:
def getArticleText(article_category, article_id):
    """Return the ``'text'`` entry of one article, or ``None`` if not found.

    Looks up ``article_category`` in the module-level ``appended_news`` dict
    and returns the ``'text'`` value of the first article in that category
    whose ``'id'`` equals ``article_id``. Returns ``None`` when the category
    is unknown or no article in it has that id.
    """
    # Direct key lookup instead of scanning every category name for a match.
    for article in appended_news.get(article_category, ()):
        if article["id"] == article_id:
            return article['text']
    return None

In [17]:
# Look up one article's text by category and id (None if there is no match).
text = getArticleText("weather_news", "weather_news-2")

### Finding Bigram Tense sequence for news

In [18]:
# For every article, compute bigram tense frequencies twice: once treating
# article['text'] as separate texts, and once with all of it joined into a
# single string.
news_tense = {}
for category, category_articles in appended_news.items():
    category_results = []
    news_tense[category] = category_results
    for article in category_articles:
        # '\r' rewrites the same console line to show progress.
        print(f"\rAnalyzing: {category} ({article['id']})     ", end='')
        per_sentence_freq, _ = get_bigram_seq(article['text'])
        whole_text_freq, _ = get_bigram_seq(" ".join(article['text']))
        category_results.append({
            "id": article["id"],
            "sentence_tense_seq": per_sentence_freq,
            "full_tense_seq": whole_text_freq,
        })
    print(f"\rAnalyzed: {category}                                     ")

Analyzed: weather_news                                     
Analyzed: technology_news                                     
Analyzed: politics                                     
Analyzed: business_news                                     
Analyzed: sports_news                                     
Analyzed: crime_news                                     


In [19]:
news_tense

{'weather_news': [{'id': 'weather_news-0',
   'sentence_tense_seq': {'present present': 1.0},
   'full_tense_seq': {'present present': 1.0}},
  {'id': 'weather_news-1',
   'sentence_tense_seq': {'present past': 0.25,
    'past past': 0.375,
    'past present': 0.21875,
    'present present': 0.0625,
    'future present': 0.03125,
    'past future': 0.03125,
    'future past': 0.03125},
   'full_tense_seq': {'present past': 0.24489795918367346,
    'past past': 0.3673469387755102,
    'past present': 0.22448979591836735,
    'present present': 0.08163265306122448,
    'present future': 0.02040816326530612,
    'future present': 0.02040816326530612,
    'past future': 0.02040816326530612,
    'future past': 0.02040816326530612}},
  {'id': 'weather_news-2',
   'sentence_tense_seq': {'present future': 0.5, 'future present': 0.5},
   'full_tense_seq': {'present present': 0.6923076923076923,
    'present past': 0.07692307692307693,
    'past present': 0.07692307692307693,
    'present future

### JSON Output

In [20]:
def json_output(path, data):
    """Serialise ``data`` as JSON into the file at ``path``.

    Parameters
    ----------
    path : str or path-like
        Destination file. Missing parent directories are created so the
        notebook also runs on a fresh checkout without an output folder.
    data : object
        Any value accepted by ``json.dump``.

    Raises
    ------
    TypeError
        If ``data`` is not JSON serialisable (raised by ``json.dump``).
    OSError
        If the directory or file cannot be created or written.
    """
    parent_dir = os.path.dirname(path)
    # dirname is '' for a bare file name; makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Write-only mode is sufficient: the file is never read back here.
    with open(path, 'w', encoding='utf-8') as of:
        json.dump(data, of)

In [21]:
# One JSON file per category, plus a combined file holding every category.
for category, tense_records in news_tense.items():
    json_output(f'output/{category}.json', tense_records)
json_output('output/all_data.json', news_tense)

### CSV Output 

In [22]:
import pandas as pd

In [23]:
# Flatten to one record per article: category, id, then one entry per bigram,
# prefixed "s_" for 'sentence_tense_seq' and "f_" for 'full_tense_seq'.
news_tense_seq_flat = []

for category, category_articles in news_tense.items():
    for article in category_articles:
        rows = {'category': category, 'id': article['id']}
        rows.update(
            (f's_{tense_seq}', share)
            for tense_seq, share in article['sentence_tense_seq'].items()
        )
        rows.update(
            (f'f_{tense_seq}', share)
            for tense_seq, share in article['full_tense_seq'].items()
        )
        news_tense_seq_flat.append(rows)

In [24]:
# Records do not all carry the same bigram keys; a bigram missing from an
# article is filled in as frequency 0.
df = pd.DataFrame(data=news_tense_seq_flat).fillna(value=0)
df.to_csv(path_or_buf='output/all_data_flat.csv')

In [25]:
df

Unnamed: 0,category,id,s_present present,f_present present,s_present past,s_past past,s_past present,s_future present,s_past future,s_future past,f_present past,f_past past,f_past present,f_present future,f_future present,f_past future,f_future past,s_present future,s_future future,f_future future
0,weather_news,weather_news-0,1.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00
1,weather_news,weather_news-1,0.062500,0.081633,0.250000,0.375000,0.218750,0.031250,0.031250,0.031250,0.244898,0.367347,0.224490,0.020408,0.020408,0.020408,0.020408,0.000000,0.000000,0.00
2,weather_news,weather_news-2,0.000000,0.692308,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.076923,0.000000,0.076923,0.076923,0.076923,0.000000,0.000000,0.500000,0.000000,0.00
3,weather_news,weather_news-3,0.594595,0.545455,0.108108,0.000000,0.081081,0.081081,0.000000,0.000000,0.090909,0.000000,0.064935,0.129870,0.142857,0.025974,0.000000,0.135135,0.000000,0.00
4,weather_news,weather_news-4,0.461538,0.300000,0.115385,0.000000,0.000000,0.346154,0.000000,0.000000,0.060000,0.000000,0.040000,0.240000,0.260000,0.020000,0.000000,0.000000,0.076923,0.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,crime_news,crime_news-2029,0.142857,0.142857,0.285714,0.285714,0.285714,0.000000,0.000000,0.000000,0.285714,0.285714,0.285714,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00
14816,crime_news,crime_news-2030,0.218750,0.208333,0.250000,0.250000,0.218750,0.031250,0.031250,0.000000,0.229167,0.312500,0.208333,0.000000,0.020833,0.020833,0.000000,0.000000,0.000000,0.00
14817,crime_news,crime_news-2031,0.057143,0.039216,0.114286,0.600000,0.142857,0.028571,0.028571,0.028571,0.156863,0.588235,0.137255,0.000000,0.019608,0.039216,0.019608,0.000000,0.000000,0.00
14818,crime_news,crime_news-2032,0.068493,0.069307,0.191781,0.534247,0.164384,0.000000,0.000000,0.027397,0.188119,0.504950,0.198020,0.009901,0.000000,0.009901,0.019802,0.013699,0.000000,0.00
