In [1]:
import pandas as pd
import numpy as np

import fasttext
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

2024-04-08 08:25:15.371775: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-08 08:25:15.444107: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-08 08:25:15.820866: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-08 08:25:15.822546: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = pd.read_csv("preprocessed_data.csv")
data.drop("Unnamed: 0", axis=1, inplace=True)

def revert(sentence):
    x = sentence.split("^")
    return [i.split("%") for i in x]

def flatten(text):
    x = []
    for sentence in text:
        x += sentence
    return x

data["News Articles"] = data["News Articles"].apply(revert)
data["Summary"] = data["Summary"].apply(revert)
data["Unlike"] = data["Unlike"].apply(revert)

data["News Articles"] = data["News Articles"].apply(flatten)
data["Summary"] = data["Summary"].apply(flatten)
data["Unlike"] = data["Unlike"].apply(flatten)

data

Unnamed: 0,News Articles,Summary,Unlike
0,"[think, real, danger, happens, data, cross, ne...","[however, careful, may, organisation, trust, p...","[eu, aiming, fuel, development, aid, european,..."
1,"[fast, moving, phone, virus, appear, security,...","[new, strain, cabir, mobile, phone, virus, use...","[uk, house, price, dip, november, uk, house, p..."
2,"[seaman, sail, biometric, future, luxury, crui...","[said, french, jordanian, nigerian, national, ...","[redknapp, poised, saint, southampton, set, un..."
3,"[cable, offer, videoondemand, cable, firm, ntl...","[cable, firm, ntl, telewest, launched, videoon...","[green, report, shun, supply, chain, nearly, 2..."
4,"[make, greener, computer, hitech, industry, st...","[seeing, thing, technology, industry, result, ...","[adrianos, chelsea, link, rejected, adrianos, ..."
...,...,...,...
2220,"[circuit, city, get, takeover, offer, circuit,...","[bill, armstrong, retail, analyst, cl, king, a...","[roddick, san, jose, final, andy, roddick, pla..."
2221,"[german, business, confidence, slide, german, ...","[analyst, said, ifo, figure, germany, continui...","[richard, judy, choose, top, book, 10, author,..."
2222,"[walmart, fight, back, accuser, two, big, u, n...","[meanwhile, drug, group, eli, lilly, planning,...","[kilroy, launch, veritas, party, exbbc, chat, ..."
2223,"[economy, stronger, forecast, uk, economy, pro...","[mpc, judge, overall, growth, little, higher, ...","[budget, bring, smiling, voter, tory, spokesma..."


In [3]:
def fasttext_training(tmp):
    model_type = "skipgram"
    x = ""
    for i in ["News Articles", "Summary","Unlike"]:
        s = []
        for j in tmp[i]:
            s.append(" ".join(j))
        x += " ".join(s)
    with open("fastext_data.txt","w") as f:
        f.write(x)
    return fasttext.train_unsupervised('fastext_data.txt', model=model_type)

In [4]:
embedding_model = fasttext_training(data)

Read 1M words
Number of words:  16839
Number of labels: 0
Progress: 100.0% words/sec/thread:   24305 lr:  0.000000 avg.loss:  2.023170 ETA:   0h 0m 0s


In [5]:
f = 0
c = ""
for i in data.columns:
    for j in data[i]:
        if f<len(j):
            f = len(j)
            c = j
f

2440

In [6]:
def pad_words(sentence):
    l = 2440
    n = ["_" for i in range(l-len(sentence))]
    if sentence == []:
        return n
    return n[:len(n)//2] + sentence + n[len(n)//2:]

In [7]:
data["P News Articles"] = data["News Articles"].apply(pad_words)
data["P Summary"] = data["Summary"].apply(pad_words)
data["P Unlike"] = data["Unlike"].apply(pad_words)

In [8]:
def embed(text):
    embedding = []
    for i in text:
        embedding.append(embedding_model[i])
    return embedding

In [9]:
data["E News Articles"] = data["P News Articles"].apply(embed)
data["E Summary"] = data["P Summary"].apply(embed)
data["E Unlike"] = data["P Unlike"].apply(embed)

In [10]:
data.drop([i for i in data.columns if i[0]!="E"], axis=1, inplace=True)
data = data[data.columns][:100]

In [11]:
x = []
y = []

for i in range(len(data["E News Articles"])):
    x.append(data["E News Articles"][i] + data["E News Articles"][i])
    x.append(data["E News Articles"][i] + data["E Summary"][i])
    x.append(data["E News Articles"][i] + data["E Unlike"][i])
    y.append(1.0)
    y.append(0.5)
    y.append(0.0)

In [12]:
del data

In [13]:
x = np.asarray(x)
x.shape

(300, 4880, 100)

In [14]:
y = np.asarray(y)

In [15]:
xtr,xt,ytr,yt = train_test_split(x,y,test_size=0.3,random_state=10)

In [16]:
model = Sequential()
model.add(Dense(2440, activation="relu", input_shape=(4880,100)))
model.add(Dense(1024, activation="relu"))
model.add(Dense(512, activation="relu"))
model.add(Dense(128, activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

In [30]:
model.compile(loss="mean_squared_error", optimizer="adam",metrics=["accuracy"])

In [31]:
es = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=10)
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss", factor=0.2, patience=5, min_lr=0.0001
)

In [32]:
xtr = np.asarray(xtr)
ytr = np.asarray(ytr)

In [33]:
model.fit(
    xtr,
    ytr,
    epochs=100,
    batch_size=4,
    validation_split=0.2,
    verbose=1,
    callbacks=[es, reduce_lr],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 11: early stopping


<keras.callbacks.History at 0x7ff0ef89b700>