In [1]:
import numpy as np
import pandas as pd
import nltk
import os
import re
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vepif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data = pd.read_csv('../assets/annotated-corpus/train/0/0.tsv', sep='\t', header= None)

In [36]:
stem = data[0].to_list()

In [37]:
stem

['Venezuelans',
 'Vote',
 'Early',
 'in',
 'Referendum',
 'on',
 'Chavez',
 'Rule',
 'Reuters',
 'Reuters',
 'Venezuelans',
 'turned',
 'out',
 'early',
 'and',
 'in',
 'large',
 'numbers',
 'on',
 'Sunday',
 'to',
 'vote',
 'in',
 'a',
 'historic',
 'referendum',
 'that',
 'will',
 'either',
 'remove',
 'left',
 'wing',
 'President',
 'Hugo',
 'Chavez',
 'from',
 'office',
 'or',
 'give',
 'him',
 'a',
 'new',
 'mandate',
 'to',
 'govern',
 'for',
 'the',
 'next',
 'two',
 'years',
 'S.Koreans',
 'Clash',
 'with',
 'Police',
 'on',
 'Iraq',
 'Troop',
 'Dispatch',
 'Reuters',
 'Reuters',
 'South',
 'Korean',
 'police',
 'used',
 'water',
 'cannon',
 'in',
 'central',
 'Seoul',
 'Sunday',
 'to',
 'disperse',
 'at',
 'least',
 '7',
 ',',
 '000',
 'protesters',
 'urging',
 'the',
 'government',
 'to',
 'reverse',
 'a',
 'controversial',
 'decision',
 'to',
 'send',
 'more',
 'troops',
 'to',
 'Iraq',
 'Palestinians',
 'in',
 'Israeli',
 'Jails',
 'Start',
 'Hunger',
 'Strike',
 'Reuters',

In [38]:
def prepare_data(tokens):
    pattern = r'[^\w\s]'
    stop_words = set(stopwords.words('english'))
    
    cleaned_tokens = []
    for token in tokens:
        clean_token = re.sub(pattern, '', str(token).lower())
        if clean_token != '' and clean_token not in stop_words:
            cleaned_tokens.append(clean_token)
        
    return cleaned_tokens

In [39]:
stem = prepare_data(stem)

In [66]:
corpus_count = len(stem)

In [49]:
import itertools as itertools
from collections import Counter 
from nltk.collocations import TrigramCollocationFinder

def calculate_trigrams(corpus):
    words_cnt = Counter()
    trigrams = Counter()
    for i in range(len(corpus) - 3):
        w1,w2,w3 = corpus[i], corpus[i+1], corpus[i+2]
        words_cnt[w1] += 1
        trigrams[(w1, w2, w3)] += 1
    return words_cnt, trigrams

In [42]:
words_freq, trigrams = calculate_trigrams(stem)

In [63]:
words_freq['people']

2850

In [46]:
text = nltk.Text(stem)

In [75]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [69]:
def calculate_MI(words_freq, trigrams, corpus_count):
    mi_scores = []
    for tr in trigrams.keys():
        without_log = (corpus_count ** 2) * trigrams[tr]
        mult = (words_freq[tr[0]] * words_freq[tr[1]] * words_freq[tr[2]])
        mi_scores.append([" ".join(tr), np.log(without_log / mult)])
    return mi_scores

In [70]:
MI_scores = calculate_MI(words_freq, trigrams, corpus_count)

In [72]:
MI_scores_best = sorted(MI_scores, reverse=True, key=lambda x: x[1])

In [80]:
print(finder_thr.nbest(trigram_measures.pmi, 30))

[('2hrs', '26min', '20sec'), ('absurdity', 'societys', 'cliches'), ('adel', 'aldoori', 'cupped'), ('adi', 'lin', 'xiaochun'), ('allard', 'cosima', 'marriner'), ('anterior', 'cruciate', 'ligament'), ('archeologist', 'zemaryalai', 'tarzi'), ('aromas', 'raspberry', 'gooseberry'), ('avril', 'lavigne', 'norah'), ('bach', 'beethoven', 'jars'), ('bainum', 'outspent', 'mufi'), ('banged', 'pots', 'pans'), ('beefeater', 'gin', 'courvoiser'), ('beyonce', 'avril', 'lavigne'), ('biz', 'heather', 'locklear'), ('bluestar', 'baosteel', 'minmetals'), ('bungaran', 'antonius', 'simanjuntak'), ('burton', 'alfre', 'woodard'), ('caffeine', 'guarana', 'ginseng'), ('camel', 'caging', 'instigator'), ('carlo', 'azeglio', 'ciampi'), ('champa', 'devi', 'shukla'), ('choy', 'yuk', 'wipes'), ('cognac', 'malibu', 'rum'), ('comedienne', 'shabana', 'rehmans'), ('companypetrobras', 'barracuda', 'caratinga'), ('cosmonaut', 'gennady', 'padalka'), ('coultan', 'catharine', 'munro'), ('councilor', 'hua', 'jianmin'), ('courvo

In [81]:
MI_scores_best[:30]

[['sci fi smackdown', 27.230410924658056],
 ['phaithful phlock phinish', 27.230410924658056],
 ['martina navratilova merlene', 27.230410924658056],
 ['navratilova merlene ottey', 27.230410924658056],
 ['pratima kumari sanamacha', 27.230410924658056],
 ['kumari sanamacha chanu', 27.230410924658056],
 ['peirsol brendan hansen', 27.230410924658056],
 ['retd jehangir karamat', 27.230410924658056],
 ['khultoum eissa abdallah', 27.230410924658056],
 ['2hrs 26min 20sec', 27.230410924658056],
 ['comedienne shabana rehmans', 27.230410924658056],
 ['archeologist zemaryalai tarzi', 27.230410924658056],
 ['stepmother bev seated', 27.230410924658056],
 ['kims sequined pasty', 27.230410924658056],
 ['sangakkara avishka gunawardene', 27.230410924658056],
 ['starship enterprise warping', 27.230410924658056],
 ['jasbir kang yuba', 27.230410924658056],
 ['straddles gunma nagano', 27.230410924658056],
 ['giorgio moroder inducted', 27.230410924658056],
 ['ducked ohel moishe', 27.230410924658056],
 ['noodl

In [84]:
trigrams[('sci', 'fi' ,'smackdown')]

1

In [87]:
words_freq['sci']

1

In [88]:
words_freq['fi']

1

In [89]:
words_freq['smackdown']

1