In [1]:
# Load the preprocessed corpus: the file is read in one go and then cut
# into a list with str.splitlines(), one list entry per line of the file.
with open('lemmatized.txt', mode='r') as lemmatized_file:
    corpus_text = lemmatized_file.read()
lemmatized = corpus_text.splitlines()

In [2]:
# Glue every entry of `lemmatized` together, separated by single spaces
concatenated_text = ' '.join(lemmatized)

# Whitespace tokenisation of the merged text
tokens = str.split(concatenated_text)

# N is the total number of tokens; V is the number of distinct tokens
vocabulary = set(tokens)
N, V = len(tokens), len(vocabulary)

# Report both corpus statistics
for label, value in (("Number of Tokens (N):", N), ("Vocabulary Size (V):", V)):
    print(label, value)

Number of Tokens (N): 5696406
Vocabulary Size (V): 123026


In [3]:
from nltk import ngrams
from collections import Counter

# Slide a window of three tokens over the corpus-wide token list
trigrams = [gram for gram in ngrams(tokens, 3)]

# Tally how many times each distinct trigram was seen
trigram_counts = Counter()
trigram_counts.update(trigrams)

# Keep only the 25 most frequent trigrams, most frequent first
top_25_trigrams = trigram_counts.most_common(25)

# Print a header followed by one "trigram: frequency" line per entry
print("Top 25 Trigrams:")
for entry in top_25_trigrams:
    trigram, count = entry
    print(f"{trigram}: {count}")

Top 25 Trigrams:
('one', 'of', 'the'): 2434
('on', 'share', 'of'): 2095
('on', 'the', 'stock'): 1567
('as', 'well', 'a'): 1418
('in', 'research', 'report'): 1415
('in', 'research', 'note'): 1373
('be', 'able', 'to'): 1267
('the', 'united', 'state'): 1223
('for', 'the', 'quarter'): 1221
('average', 'price', 'of'): 1193
('research', 'report', 'on'): 1177
('research', 'note', 'on'): 1138
('the', 'end', 'of'): 1135
('share', 'of', 'the'): 1133
('in', 'report', 'on'): 1124
('earnings', 'per', 'share'): 1121
('cell', 'phone', 'plan'): 1073
('phone', 'plan', 'detail'): 1070
('accord', 'to', 'the'): 1064
('buy', 'rating', 'to'): 1016
('of', 'the', 'company'): 1002
('appear', 'first', 'on'): 994
('day', 'move', 'average'): 993
('price', 'target', 'on'): 981
('be', 'one', 'of'): 970


In [4]:
def _load_lexicon(path):
    """Return the set of words listed in a one-word-per-line lexicon file.

    Every line is stripped of surrounding whitespace. Blank lines and lines
    starting with ';' are skipped, so they cannot end up in the returned set
    and be counted as sentiment words (presumably the ';' lines are the
    lexicon's header comments -- confirm against the files).

    Parameters:
        path: location of the lexicon text file.

    Returns:
        A set of the remaining lines.
    """
    # NOTE(review): the file is opened with the platform default encoding, as
    # before. If the lexicon is not in that encoding this raises
    # UnicodeDecodeError; pass an explicit encoding= once the real one is
    # confirmed.
    with open(path, 'r') as lexicon_file:
        stripped_lines = [line.strip() for line in lexicon_file.read().splitlines()]
    return {line for line in stripped_lines if line and not line.startswith(';')}


# Read positive and negative word lists
positive_words = _load_lexicon('signal-news1/opinion-lexicon-English/positive-words.txt')
negative_words = _load_lexicon('signal-news1/opinion-lexicon-English/negative-words.txt')

# Count the occurrences of positive and negative words over the whole corpus
# (`tokens` is the corpus-wide token list built in an earlier cell)
positive_word_count = sum(1 for word in tokens if word in positive_words)
negative_word_count = sum(1 for word in tokens if word in negative_words)

# Output the results
print("Positive Word Count:", positive_word_count)
print("Negative Word Count:", negative_word_count)

Positive Word Count: 176422
Negative Word Count: 143142


In [7]:
import nltk

# Count, per news story, whether positive or negative lexicon words dominate.
# Stories with equally many of each are counted in neither total.
more_positive = 0
more_negative = 0

# Iterate through each news story
for news_story in lemmatized:
    # Tokenize the news story into words.
    # Story-scoped names are used on purpose: reusing `tokens`,
    # `positive_word_count` or `negative_word_count` here would overwrite the
    # corpus-wide values computed in earlier cells, so re-running those cells
    # afterwards would silently operate on the last story only.
    story_tokens = nltk.word_tokenize(news_story)

    # Count the occurrences of positive and negative words in this story
    story_positive = sum(1 for word in story_tokens if word in positive_words)
    story_negative = sum(1 for word in story_tokens if word in negative_words)

    # Compare positive and negative word counts
    if story_positive > story_negative:
        more_positive += 1
    elif story_negative > story_positive:
        more_negative += 1

# Output the results
print("Number of news stories with more positive than negative words:", more_positive)
print("Number of news stories with more negative than positive words:", more_negative)

Number of news stories with more positive than negative words: 10435
Number of news stories with more negative than positive words: 6893
