In [1]:
import pandas as pd
import numpy as np

import fasttext
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Dense, Dropout, Input, LayerNormalization, MultiHeadAttention, Flatten, Add
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

2024-04-09 23:09:38.720084: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-09 23:09:38.722487: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-09 23:09:38.812458: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-09 23:09:39.165706: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = pd.read_csv("preprocessed_data.csv")
data.drop("Unnamed: 0", axis=1, inplace=True)

def revert(sentence):
    x = sentence.split("^")
    return [i.split("%") for i in x]

def flatten(text):
    x = []
    for sentence in text:
        x += sentence
    return x

data["News Articles"] = data["News Articles"].apply(revert)
data["Summary"] = data["Summary"].apply(revert)
data["Unlike"] = data["Unlike"].apply(revert)

data["News Articles"] = data["News Articles"].apply(flatten)
data["Summary"] = data["Summary"].apply(flatten)
data["Unlike"] = data["Unlike"].apply(flatten)

data

Unnamed: 0,News Articles,Unlike,Summary
0,"[think, real, danger, happens, data, cross, ne...","[top, gig, award, scissor, sister, new, york, ...","[however, careful, may, organisation, trust, p..."
1,"[fast, moving, phone, virus, appear, security,...","[black, sabbath, top, rock, album, poll, black...","[new, strain, cabir, mobile, phone, virus, use..."
2,"[seaman, sail, biometric, future, luxury, crui...","[farrell, due, make, u, tv, debut, actor, coli...","[said, french, jordanian, nigerian, national, ..."
3,"[cable, offer, videoondemand, cable, firm, ntl...","[u, firm, bid, lacroix, label, u, firm, said, ...","[cable, firm, ntl, telewest, launched, videoon..."
4,"[make, greener, computer, hitech, industry, st...","[star, pay, tribute, actor, davis, hollywood, ...","[seeing, thing, technology, industry, result, ..."
...,...,...,...
2220,"[circuit, city, get, takeover, offer, circuit,...","[saintandre, anger, absent, star, sale, shark,...","[bill, armstrong, retail, analyst, cl, king, a..."
2221,"[german, business, confidence, slide, german, ...","[mcconnell, drunk, remark, row, scotland, firs...","[analyst, said, ifo, figure, germany, continui..."
2222,"[walmart, fight, back, accuser, two, big, u, n...","[ray, dvd, beat, box, office, taking, oscarnom...","[meanwhile, drug, group, eli, lilly, planning,..."
2223,"[economy, stronger, forecast, uk, economy, pro...","[whitehall, cut, ahead, target, thousand, civi...","[mpc, judge, overall, growth, little, higher, ..."


In [3]:
def fasttext_training(tmp):
    model_type = "skipgram"
    x = ""
    for i in ["News Articles", "Summary","Unlike"]:
        s = []
        for j in tmp[i]:
            s.append(" ".join(j))
        x += " ".join(s)
    with open("fastext_data.txt","w") as f:
        f.write(x)
    return fasttext.train_unsupervised('fastext_data.txt', model=model_type)

In [4]:
embedding_model = fasttext_training(data)

Read 1M words
Number of words:  16842
Number of labels: 0
Progress: 100.0% words/sec/thread:   24102 lr:  0.000000 avg.loss:  2.001978 ETA:   0h 0m 0s


In [5]:
f = 0
c = ""
for i in data.columns:
    for j in data[i]:
        if f<len(j):
            f = len(j)
            c = j
f

2440

In [6]:
def pad_words(sentence):
    l = 2440
    n = ["_" for i in range(l-len(sentence))]
    if sentence == []:
        return n
    return n[:len(n)//2] + sentence + n[len(n)//2:]

In [7]:
data["P News Articles"] = data["News Articles"].apply(pad_words)
data["P Summary"] = data["Summary"].apply(pad_words)
data["P Unlike"] = data["Unlike"].apply(pad_words)

In [8]:
def embed(text):
    embedding = []
    for i in text:
        embedding.append(embedding_model[i])
    return embedding

In [9]:
data["E News Articles"] = data["P News Articles"].apply(embed)
data["E Summary"] = data["P Summary"].apply(embed)
data["E Unlike"] = data["P Unlike"].apply(embed)

In [10]:
data.drop([i for i in data.columns if i[0]!="E"], axis=1, inplace=True)
data = data[data.columns][:100]

In [11]:
x = []
y = []

for i in range(len(data["E News Articles"])):
    x.append(data["E News Articles"][i])
    y.append(1.0)

In [12]:
del data

In [13]:
x = np.asarray(x)
x.shape

(100, 2440, 100)

In [14]:
y = np.asarray(y)

In [15]:
xtr,xt,ytr,yt = train_test_split(x,y,test_size=0.3,random_state=10)

In [18]:

inp = Input(shape=(2440,100))

x = Dense(1220,activation="relu")(inp)
x = Dense(610,activation="relu")(x)
x = Dense(303,activation="relu")(x)
x = Dense(128,activation="tanh")(x)
x = Dense(303,activation="tanh")(x)
x = Dense(610,activation="tanh")(x)
x = Dense(1220,activation="tanh")(x)

a1 = Model(inp, x)
a2 = Model(inp, x)

inputs = Input(shape=(128, 2))
x = inputs

for _ in range(3):
    # Transformer Block with multi-head attention
    query = LayerNormalization(epsilon=1e-6)(x)
    key = LayerNormalization(epsilon=1e-6)(x)
    value = LayerNormalization(epsilon=1e-6)(x)

    attn_output = MultiHeadAttention(num_heads=4, key_dim=256, dropout=0.3)(query, value, key=key)
    x = Add()([x, attn_output])  # Residual connection
    x = LayerNormalization(epsilon=1e-6)(x)

    # Feed Forward Network inside transformer

    ffn_output = Dense(128, activation="relu")(x)
    ffn_output = Dropout(0.3)(ffn_output)
    ffn_output = Dense(64, activation="relu")(ffn_output)
    ffn_output = Dense(1)(ffn_output)
    x = Add()([x, ffn_output])  # Residual connection
    x = LayerNormalization(epsilon=1e-6)(x)

x = Flatten()(x)
x = Dropout(0.3)(x)
outputs = Dense(1)(x)

model = Model(inputs=inputs, outputs=outputs)

In [19]:
a1.compile(loss="mean_squared_error", optimizer="adam",metrics=["accuracy"])
a2.compile(loss="mean_squared_error", optimizer="adam",metrics=["accuracy"])
model.compile(loss="mean_squared_error", optimizer="adam",metrics=["accuracy"])

In [20]:
es = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=10)
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss", factor=0.2, patience=5, min_lr=0.0001
)

In [21]:
for i in range(10):
    a1.fit(
        xtr,
        xtr,
        epochs=100,
        batch_size=4,
        validation_split=0.2,
        verbose=1,
        callbacks=[es, reduce_lr],
    )

Epoch 1/100


ValueError: Dimensions must be equal, but are 100 and 1220 for '{{node compile_loss/mean_squared_error/sub}} = Sub[T=DT_FLOAT](data_1, functional_7_1/dense_23_1/Tanh)' with input shapes: [4,2440,100], [4,2440,1220].