# Embedding

In [70]:
# General libraries

import pandas as pd
import numpy as np

# Word2Vec

from gensim.models import Word2Vec 

# elmo

from simple_elmo import ElmoModel

# fasttext

import fasttext

In [2]:
data = pd.read_csv("preprocessed_data.csv")
data

Unnamed: 0.1,Unnamed: 0,News Articles,Unlike,Summary
0,0,think^real%danger%happens%data%cross%net%argue...,top%gig%award%scissor%sister^new%york%band%sci...,however%careful%may%organisation%trust%persona...
1,1,fast%moving%phone%virus%appear^security%firm%w...,black%sabbath%top%rock%album%poll^black%sabbat...,new%strain%cabir%mobile%phone%virus%use%shortr...
2,2,seaman%sail%biometric%future^luxury%cruise%lin...,farrell%due%make%u%tv%debut^actor%colin%farrel...,said%french%jordanian%nigerian%national%would%...
3,3,cable%offer%videoondemand^cable%firm%ntl%telew...,u%firm%bid%lacroix%label^u%firm%said%final%neg...,cable%firm%ntl%telewest%launched%videoondemand...
4,4,make%greener%computer^hitech%industry%starting...,star%pay%tribute%actor%davis^hollywood%star%in...,seeing%thing%technology%industry%result%billio...
...,...,...,...,...
2220,2220,circuit%city%get%takeover%offer^circuit%city%s...,saintandre%anger%absent%star^sale%shark%direct...,bill%armstrong%retail%analyst%cl%king%associat...
2221,2221,german%business%confidence%slide^german%busine...,mcconnell%drunk%remark%row^scotland%first%mini...,analyst%said%ifo%figure%germany%continuing%pro...
2222,2222,walmart%fight%back%accuser^two%big%u%name%laun...,ray%dvd%beat%box%office%taking^oscarnominated%...,meanwhile%drug%group%eli%lilly%planning%campai...
2223,2223,economy%stronger%forecast^uk%economy%probably%...,whitehall%cut%ahead%target^thousand%civil%serv...,mpc%judge%overall%growth%little%higher%third%q...


In [3]:
data.drop("Unnamed: 0", axis=1, inplace=True)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   News Articles  2225 non-null   object
 1   Unlike         2225 non-null   object
 2   Summary        2225 non-null   object
dtypes: object(3)
memory usage: 52.3+ KB


In [5]:
def revert(sentence):
    x = sentence.split("^")
    return [i.split("%") for i in x]

In [6]:
data["News Articles"] = data["News Articles"].apply(revert)
data["Summary"] = data["Summary"].apply(revert)
data["Unlike"] = data["Unlike"].apply(revert)


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   News Articles  2225 non-null   object
 1   Unlike         2225 non-null   object
 2   Summary        2225 non-null   object
dtypes: object(3)
memory usage: 52.3+ KB


In [8]:
data.head()

Unnamed: 0,News Articles,Unlike,Summary
0,"[[think], [real, danger, happens, data, cross,...","[[top, gig, award, scissor, sister], [new, yor...","[[however, careful, may, organisation, trust, ..."
1,"[[fast, moving, phone, virus, appear], [securi...","[[black, sabbath, top, rock, album, poll], [bl...","[[new, strain, cabir, mobile, phone, virus, us..."
2,"[[seaman, sail, biometric, future], [luxury, c...","[[farrell, due, make, u, tv, debut], [actor, c...","[[said, french, jordanian, nigerian, national,..."
3,"[[cable, offer, videoondemand], [cable, firm, ...","[[u, firm, bid, lacroix, label], [u, firm, sai...","[[cable, firm, ntl, telewest, launched, videoo..."
4,"[[make, greener, computer], [hitech, industry,...","[[star, pay, tribute, actor, davis], [hollywoo...","[[seeing, thing, technology, industry, result,..."


In [9]:
def pad_words(l, sentence):
    n = ["_" for i in range(l-len(sentence))]
    if sentence == []:
        return n
    return n[:len(n)//2] + sentence + n[len(n)//2:]

In [10]:
def pad(text):
    max_sentence = 259
    max_word = 259
    l = max_sentence-len(text)
    f = [pad_words(max_word,[]) for i in range(l)]
    m = [pad_words(max_word,i) for i in text]
    return f[:len(f)//2]+m+f[len(f)//2:]

In [11]:
data["P News Articles"] = data["News Articles"].apply(pad)
data["P Summary"] = data["Summary"].apply(pad)
data["P Unlike"] = data["Unlike"].apply(pad)

In [12]:
data.head()

Unnamed: 0,News Articles,Unlike,Summary,P News Articles,P Summary,P Unlike
0,"[[think], [real, danger, happens, data, cross,...","[[top, gig, award, scissor, sister], [new, yor...","[[however, careful, may, organisation, trust, ...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,..."
1,"[[fast, moving, phone, virus, appear], [securi...","[[black, sabbath, top, rock, album, poll], [bl...","[[new, strain, cabir, mobile, phone, virus, us...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,..."
2,"[[seaman, sail, biometric, future], [luxury, c...","[[farrell, due, make, u, tv, debut], [actor, c...","[[said, french, jordanian, nigerian, national,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,..."
3,"[[cable, offer, videoondemand], [cable, firm, ...","[[u, firm, bid, lacroix, label], [u, firm, sai...","[[cable, firm, ntl, telewest, launched, videoo...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,..."
4,"[[make, greener, computer], [hitech, industry,...","[[star, pay, tribute, actor, davis], [hollywoo...","[[seeing, thing, technology, industry, result,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,..."


In [13]:
tmp = pd.DataFrame(data.iloc[:15,:])

## Note:

For the purposes of saving time and compute power, only the first ten rows have been used for all but the final embedding techniques. This is the usage of the BERT Transformer for embedding,

#### Vocabulary Creation

In [14]:
def create_vocabulary(text):
    words = []
    for i in text:
        words += i
    return set(words)

In [15]:
txt = []
for i in tmp.columns:
    for j in tmp[i]:
        s = []
        for k in j:
            s += k
        txt.append(s)
vocab = create_vocabulary(txt)
vocab

{'',
 'radio',
 '1',
 'falling',
 'provides',
 'hacker',
 'catch',
 'avoided',
 'opt',
 'surprising',
 'ambitous',
 'legal',
 'method',
 'outbreak',
 'called',
 'journey',
 'biometric',
 'standard',
 'hambro',
 'touchstone',
 'gueststarred',
 'keen',
 'sensitive',
 'handhelds',
 'cheerful',
 'exploding',
 'glass',
 'annual',
 'alleged',
 'dual',
 'productivity',
 'graham',
 'diatribe',
 'focusing',
 'keane',
 'game',
 'famous',
 'strong',
 'new',
 'investment',
 'ive',
 'global',
 'house',
 'main',
 'gizmondo',
 'short',
 'crystal',
 'especially',
 'likely',
 'meeting',
 'tracey',
 'criticises',
 'craze',
 'popularise',
 'technical',
 '£5000',
 'african',
 'atmosphere',
 'testing',
 'external',
 'array',
 'loyalty',
 'ozzy',
 '80',
 'checked',
 'interest',
 'cheapest',
 'resource',
 'storage',
 'compatible',
 '2',
 'manic',
 'forward',
 'read',
 'halve',
 '674',
 'recreation',
 'backoffice',
 'sport',
 'developing',
 'millennium',
 'fortunate',
 'securewave',
 'certainly',
 'worked',
 

### Bag of Words

In [16]:
def generate_bow(sentence):    
    bag_vector = np.zeros(len(vocab))
    for word in sentence: 
        for i,w in enumerate(vocab):
            if word == w: 
                bag_vector[i] += 1

    return np.array(bag_vector)[1:]

In [17]:
tmp["Bow News Articles"] = tmp["P News Articles"].apply(generate_bow)
tmp["Bow Summary"] = tmp["P Summary"].apply(generate_bow)
tmp["Bow Unlike"] = tmp["P Unlike"].apply(generate_bow)

In [18]:
tmp.head()

Unnamed: 0,News Articles,Unlike,Summary,P News Articles,P Summary,P Unlike,Bow News Articles,Bow Summary,Bow Unlike
0,"[[think], [real, danger, happens, data, cross,...","[[top, gig, award, scissor, sister], [new, yor...","[[however, careful, may, organisation, trust, ...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[[fast, moving, phone, virus, appear], [securi...","[[black, sabbath, top, rock, album, poll], [bl...","[[new, strain, cabir, mobile, phone, virus, us...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[[seaman, sail, biometric, future], [luxury, c...","[[farrell, due, make, u, tv, debut], [actor, c...","[[said, french, jordanian, nigerian, national,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[[cable, offer, videoondemand], [cable, firm, ...","[[u, firm, bid, lacroix, label], [u, firm, sai...","[[cable, firm, ntl, telewest, launched, videoo...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[[make, greener, computer], [hitech, industry,...","[[star, pay, tribute, actor, davis], [hollywoo...","[[seeing, thing, technology, industry, result,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### TF-IDF

In [19]:
def idf(texts):
    for i in vocab:
        n = 0
        for text in texts:
            if i in text:
                    n +=1
                    break
        encodings[i] = np.log(len(texts)/n)

In [20]:
encodings = {}
idf(txt)
encodings

{'': 4.499809670330265,
 'radio': 4.499809670330265,
 '1': 4.499809670330265,
 'falling': 4.499809670330265,
 'provides': 4.499809670330265,
 'hacker': 4.499809670330265,
 'catch': 4.499809670330265,
 'avoided': 4.499809670330265,
 'opt': 4.499809670330265,
 'surprising': 4.499809670330265,
 'ambitous': 4.499809670330265,
 'legal': 4.499809670330265,
 'method': 4.499809670330265,
 'outbreak': 4.499809670330265,
 'called': 4.499809670330265,
 'journey': 4.499809670330265,
 'biometric': 4.499809670330265,
 'standard': 4.499809670330265,
 'hambro': 4.499809670330265,
 'touchstone': 4.499809670330265,
 'gueststarred': 4.499809670330265,
 'keen': 4.499809670330265,
 'sensitive': 4.499809670330265,
 'handhelds': 4.499809670330265,
 'cheerful': 4.499809670330265,
 'exploding': 4.499809670330265,
 'glass': 4.499809670330265,
 'annual': 4.499809670330265,
 'alleged': 4.499809670330265,
 'dual': 4.499809670330265,
 'productivity': 4.499809670330265,
 'graham': 4.499809670330265,
 'diatribe': 4.4

In [21]:
def tf(text,word):
    return text.count(word)/len(text)

In [22]:
def tfidf(text):
    encoded_text = []
    for i in text:
        x = []
        for j in i:
            encoded_text.append(tf(text,j)*encodings[j])
    return encoded_text

In [23]:
tmp["tfidf News Articles"] = tmp["P News Articles"].apply(tfidf)
tmp["tfidf Summary"] = tmp["P Summary"].apply(tfidf)
tmp["tfidf Unlike"] = tmp["P Unlike"].apply(tfidf)

In [24]:
tmp.head()

Unnamed: 0,News Articles,Unlike,Summary,P News Articles,P Summary,P Unlike,Bow News Articles,Bow Summary,Bow Unlike,tfidf News Articles,tfidf Summary,tfidf Unlike
0,"[[think], [real, danger, happens, data, cross,...","[[top, gig, award, scissor, sister], [new, yor...","[[however, careful, may, organisation, trust, ...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[[fast, moving, phone, virus, appear], [securi...","[[black, sabbath, top, rock, album, poll], [bl...","[[new, strain, cabir, mobile, phone, virus, us...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[[seaman, sail, biometric, future], [luxury, c...","[[farrell, due, make, u, tv, debut], [actor, c...","[[said, french, jordanian, nigerian, national,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[[cable, offer, videoondemand], [cable, firm, ...","[[u, firm, bid, lacroix, label], [u, firm, sai...","[[cable, firm, ntl, telewest, launched, videoo...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[[make, greener, computer], [hitech, industry,...","[[star, pay, tribute, actor, davis], [hollywoo...","[[seeing, thing, technology, industry, result,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### Word2Vec

Skipgram

In [50]:
skipgram_model = Word2Vec(sentences=data['P News Articles'][0], vector_size=16, window=5, sg=0, min_count=1)

In [51]:
skipgram_model.wv.get_index("_")

0

In [52]:
skipgram_model.wv[0]

array([-0.17890997, -0.04189709,  0.3907857 ,  0.4386635 , -0.30834293,
       -0.31107485,  0.84578437,  0.5611114 , -0.40334895, -0.17294306,
        0.2759021 , -0.02242463, -0.12451392,  0.14434879, -0.42222938,
       -0.24535178], dtype=float32)

In [53]:
def get_w2v_sg(text):
    x = []
    for i in text:
        x.append(skipgram_model.wv[int(i)])
    return x

In [54]:
tmp["w2v_sg News Articles"] = tmp["Bow News Articles"].apply(get_w2v_sg)
tmp["w2v_sg Summary"] = tmp["Bow Summary"].apply(get_w2v_sg)
tmp["w2v_sg Unlike"] = tmp["Bow Unlike"].apply(get_w2v_sg)

In [55]:
tmp.head()

Unnamed: 0,News Articles,Unlike,Summary,P News Articles,P Summary,P Unlike,Bow News Articles,Bow Summary,Bow Unlike,tfidf News Articles,tfidf Summary,tfidf Unlike,w2v_sg News Articles,w2v_sg Summary,w2v_sg Unlike
0,"[[think], [real, danger, happens, data, cross,...","[[top, gig, award, scissor, sister], [new, yor...","[[however, careful, may, organisation, trust, ...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866..."
1,"[[fast, moving, phone, virus, appear], [securi...","[[black, sabbath, top, rock, album, poll], [bl...","[[new, strain, cabir, mobile, phone, virus, us...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866..."
2,"[[seaman, sail, biometric, future], [luxury, c...","[[farrell, due, make, u, tv, debut], [actor, c...","[[said, french, jordanian, nigerian, national,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866..."
3,"[[cable, offer, videoondemand], [cable, firm, ...","[[u, firm, bid, lacroix, label], [u, firm, sai...","[[cable, firm, ntl, telewest, launched, videoo...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866..."
4,"[[make, greener, computer], [hitech, industry,...","[[star, pay, tribute, actor, davis], [hollywoo...","[[seeing, thing, technology, industry, result,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866..."


Continuous Bag Of Words

In [56]:
cbow_model = Word2Vec(sentences=data['P News Articles'][0], vector_size=16, window=5, sg=1, min_count=1)

In [57]:
cbow_model.wv.get_index("_")

0

In [58]:
skipgram_model.wv[0]

array([-0.17890997, -0.04189709,  0.3907857 ,  0.4386635 , -0.30834293,
       -0.31107485,  0.84578437,  0.5611114 , -0.40334895, -0.17294306,
        0.2759021 , -0.02242463, -0.12451392,  0.14434879, -0.42222938,
       -0.24535178], dtype=float32)

In [64]:
def get_w2v_cbow(text):
    x = []
    for i in text:
        x.append(cbow_model.wv[int(i)])
    return x

In [65]:
tmp["w2v_cbow News Articles"] = tmp["Bow News Articles"].apply(get_w2v_cbow)
tmp["w2v_cbow Summary"] = tmp["Bow Summary"].apply(get_w2v_cbow)
tmp["w2v_cbow Unlike"] = tmp["Bow Unlike"].apply(get_w2v_cbow)

In [66]:
tmp.head()

Unnamed: 0,News Articles,Unlike,Summary,P News Articles,P Summary,P Unlike,Bow News Articles,Bow Summary,Bow Unlike,tfidf News Articles,tfidf Summary,tfidf Unlike,w2v_sg News Articles,w2v_sg Summary,w2v_sg Unlike,w2v_cbow News Articles,w2v_cbow Summary,w2v_cbow Unlike
0,"[[think], [real, danger, happens, data, cross,...","[[top, gig, award, scissor, sister], [new, yor...","[[however, careful, may, organisation, trust, ...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703..."
1,"[[fast, moving, phone, virus, appear], [securi...","[[black, sabbath, top, rock, album, poll], [bl...","[[new, strain, cabir, mobile, phone, virus, us...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703..."
2,"[[seaman, sail, biometric, future], [luxury, c...","[[farrell, due, make, u, tv, debut], [actor, c...","[[said, french, jordanian, nigerian, national,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703..."
3,"[[cable, offer, videoondemand], [cable, firm, ...","[[u, firm, bid, lacroix, label], [u, firm, sai...","[[cable, firm, ntl, telewest, launched, videoo...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703..."
4,"[[make, greener, computer], [hitech, industry,...","[[star, pay, tribute, actor, davis], [hollywoo...","[[seeing, thing, technology, industry, result,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.17890997, -0.04189709, 0.3907857, 0.43866...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703...","[[-0.14611764, 0.065097585, 1.3860713, 2.45703..."


### ELMo

In [71]:
elmo = ElmoModel()

In [79]:
elmo.load("./")

2024-04-09 20:57:46,755 : INFO : Loading model from ./...
2024-04-09 20:57:46,757 : INFO : No vocabulary file found in the model.
2024-04-09 20:57:46,757 : INFO : No model.hdf5 file found. Using ./elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5 as a model file.
2024-04-09 20:57:46,758 : INFO : No vocabulary file provided; using special tokens only.
2024-04-09 20:57:46,759 : INFO : We will cache the vocabulary of 3 tokens.


ValueError: Invalid shape initializing CNN_proj/W_proj, got [2048, 512], expected (2048, 128)

In [78]:
%tb

SystemExit: Error: invalid number of characters in the options.json file: 261. Set n_characters to 262 for inference.

### Fasttext

In [None]:
tmp = tmp[["News Articles", "Summary", "Unlike"]]

In [None]:
x = ""
for i in tmp.columns:
    s = []
    for j in tmp[i]:
        f = []
        for k in j:
            f.append(" ".join(k))
        s.append(" ".join(f))
    x += " ".join(s)
x

In [None]:
with open("fastext_data.txt","w") as f:
    f.write(x)

In [None]:
smodel = fasttext.train_unsupervised('fastext_data.txt', model='skipgram')

In [None]:
cmodel = fasttext.train_unsupervised('fastext_data.txt', model='cbow')

In [None]:
cmodel["Ambidextrous"]