In [12]:
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

file_path = 'meaningful_corpus.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

def preprocess_text(text):
    tokens = word_tokenize(text.lower()) 
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

tokens = preprocess_text(text)

bigram_finder = BigramCollocationFinder.from_words(tokens)

bigram_measures = BigramAssocMeasures()
collocations = bigram_finder.nbest(bigram_measures.pmi, 10)  # Top 10 collocations

print("Top 10 collocations based on PMI:")
for bigram in collocations:
    print(bigram)

Top 10 collocations based on PMI:
('advancements', 'medicine')
('age', 'distribution')
('around', 'world')
('cities', 'around')
('declining', 'birth')
('degradation', 'resource')
('density', 'varies')
('depletion', 'government')
('different', 'regions')
('due', 'various')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
def pos_tagging(tokens):
    return nltk.pos_tag(tokens)

def filter_bigrams_by_pos(bigrams, pos_tags):
    filtered_bigrams = []
    for bigram in bigrams:
        word1, word2 = bigram
        pos1 = pos_tags.get(word1, None)
        pos2 = pos_tags.get(word2, None)
        if pos1 and pos2 and pos1.startswith('N') and pos2.startswith('N'):  # Example filter: both words are nouns
            filtered_bigrams.append(bigram)
    return filtered_bigrams

pos_tags = dict(pos_tagging(tokens)) 

bigram_finder = BigramCollocationFinder.from_words(tokens)
bigram_freq = bigram_finder.ngram_fd

bigram_measures = BigramAssocMeasures()

collocations = bigram_finder.nbest(bigram_measures.pmi, 10)

sorted_bigrams = sorted(bigram_freq.items(), key=lambda x: x[1], reverse=True)

filtered_bigrams = filter_bigrams_by_pos(collocations, pos_tags)

print("Top 10 collocations based on PMI:")
for bigram in collocations:
    print(bigram)


Top 10 collocations based on PMI:
('advancements', 'medicine')
('age', 'distribution')
('around', 'world')
('cities', 'around')
('declining', 'birth')
('degradation', 'resource')
('density', 'varies')
('depletion', 'government')
('different', 'regions')
('due', 'various')


In [41]:
from scipy import stats

bigram_finder2 = BigramCollocationFinder.from_words(tokens)
bigram_freq2 = bigram_finder2.ngram_fd

bigrams_to_compare = set(bigram_freq.keys()).intersection(bigram_freq2.keys())

freqs1 = [bigram_freq[bigram] for bigram in bigrams_to_compare]
freqs2 = [bigram_freq2[bigram] for bigram in bigrams_to_compare]

t_stat, p_value = stats.ttest_ind(freqs1, freqs2, equal_var=False)

print("\nT-test results:")
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")


alpha = 0.05
cv=2.576
if p_value < cv:
    print(f"The result is statistically significant at the {cv} level.")
else:
    print(f"The result is not statistically significant at the {cv} level.")



T-test results:
T-statistic: 0.0
P-value: 1.0
The result is statistically significant at the 2.576 level.
