In [1]:
# importing all necessary libraries

import pandas as pd
import numpy as np
import plotly.express as px
import spacy
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the scraped news articles from the JSON export into a DataFrame
# and preview the first rows.
data = pd.read_json('news.article(2).json')
data.head()

Unnamed: 0,articleBody,dateModified,scrapedDate,source,title
0,"Sanjay Raut, a member of the Shiv Sena (UBT) p...",{'$date': '2023-10-25T06:35:50.000Z'},{'$date': '2023-10-27T13:12:18.339Z'},https://www.thehansindia.com/,Shiv Sena MP Sanjay Raut Responds To 'Hamas' R...
1,"Kozhikode (Kerala) [India], October 27 (ANI): ...",,{'$date': '2023-10-27T13:12:45.595Z'},https://www.aninews.in/,At IUML's pro-Palestine rally in Kerala Tharoo...
2,"Mumbai, Oct 24 (PTI) Maharashtra Chief Ministe...",{'$date': '2023-10-25T02:14:27.000Z'},{'$date': '2023-10-27T13:12:18.339Z'},https://thefederal.com/,Uddhav buried Bal Thackeray's 'Hindutva' for p...
3,"Sensex, Nifty rebound over 1 pc after six sess...",,{'$date': '2023-10-27T13:12:41.618Z'},https://english.varthabharati.in/,"New Bills replacing IPC, CrPC, Evidence Act wi..."
4,"October 26, 2023 08:15 pm | Updated 08:38 pm I...",{'$date': '2023-10-26T14:45:24.000Z'},{'$date': '2023-10-27T13:12:45.595Z'},https://www.thehindu.com/,"Israel biggest terrorist nation in the world, ..."


In [3]:
data.shape

(37421, 5)

In [4]:
def clean_dates(x):
    """Convert a dict-wrapped date (e.g. ``{'$date': '2023-10-25T06:35:50.000Z'}``) to a timestamp.

    Parameters
    ----------
    x : object
        A cell value. When it is a dict (or dict subclass), the first value
        stored in it is parsed with ``pd.to_datetime``. Anything else
        (``None``, NaN, an already-parsed timestamp, ...) is passed through.

    Returns
    -------
    object
        The parsed timestamp for a non-empty dict, ``None`` for an empty
        dict, otherwise ``x`` unchanged.
    """
    if not isinstance(x, dict):
        return x
    if not x:
        # Nothing to parse; keep the value "missing" so a later dropna() removes it.
        return None
    # Only the first entry is used; the key name itself is irrelevant.
    return pd.to_datetime(next(iter(x.values())))


# Normalise both date columns with the same cleaner.
for date_col in ('dateModified', 'scrapedDate'):
    data[date_col] = data[date_col].apply(clean_dates)

In [5]:
# Remove rows with any missing value, then keep only the first row for each
# distinct 'dateModified' timestamp.
data = data.dropna().drop_duplicates(subset='dateModified')

In [6]:
#data['dateModified'] = data['dateModified'].dt.date
#data['scrapedDate'] = data['scrapedDate'].dt.date

In [7]:
# Sanity check: after de-duplicating on 'dateModified' every timestamp should occur once.
data.dateModified.value_counts()

dateModified
2023-10-25 06:35:50+00:00    1
2023-12-18 18:03:39+00:00    1
2023-12-18 19:17:29+00:00    1
2023-12-18 13:29:49+00:00    1
2023-12-17 15:46:21+00:00    1
                            ..
2023-12-09 06:40:44+00:00    1
2023-12-09 06:22:45+00:00    1
2023-12-09 05:24:53+00:00    1
2023-12-08 08:58:00+00:00    1
2024-03-26 16:01:17+00:00    1
Name: count, Length: 14985, dtype: int64

In [8]:
# Two consecutive, non-overlapping 5,000-row windows over the cleaned frame.
# iloc slices are half-open, so the second window has to start at 5000:
# starting at 5001 silently skips the row at position 5000.
data_sub_1 = data.iloc[0:5000, :]
print(data_sub_1.shape)

data_sub_2 = data.iloc[5000:10000, :]
print(data_sub_2.shape)

(5000, 5)
(5000, 5)


In [9]:
def encode_vec(x, batch_size, model=None):
    """Encode a list of texts in batches and return one result per batch.

    Parameters
    ----------
    x : list of str
        Texts to encode.
    batch_size : int
        Maximum number of texts passed to ``model.encode`` at once (>= 1).
    model : object with an ``encode(list_of_texts)`` method, optional
        Encoder to use. When omitted, the ``'paraphrase-MiniLM-L6-v2'``
        SentenceTransformer is loaded once and reused for every batch.

    Returns
    -------
    list
        The ``model.encode`` output of each batch, in input order. The final
        batch may hold fewer than ``batch_size`` texts; an empty ``x``
        yields ``[]``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(x) == 0:
        # Nothing to encode, so do not pay for loading the model.
        return []

    if model is None:
        # Load once, outside the loop: instantiating the model per batch is
        # expensive and does not change the result.
        model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    embeddings = []
    # Stepping by batch_size also covers a trailing partial batch, which
    # floor-dividing len(x) by batch_size would drop.
    for start_idx in range(0, len(x), batch_size):
        batch_text = x[start_idx:start_idx + batch_size]
        embeddings.append(model.encode(batch_text))

    return embeddings
    

# Encode the titles of the first subset, 1000 titles per batch.
embeddings_vec_1 = encode_vec(x=data_sub_1['title'].to_list(), batch_size=1000)



In [10]:
# Encode the titles of the second subset, 1000 titles per batch.
embeddings_vec_2 = encode_vec(x=data_sub_2['title'].to_list(), batch_size=1000)

In [11]:
# Stitch the per-batch arrays of each subset back together along the row axis.
final_embedding_1 = np.concatenate(embeddings_vec_1, axis=0)
final_embedding_2 = np.concatenate(embeddings_vec_2, axis=0)

print(final_embedding_1.shape)
print(final_embedding_2.shape)

(5000, 384)
(5000, 384)


In [12]:
# Stack both subsets' embeddings row-wise (subset 1 first, then subset 2).
final_embedding = np.concatenate((final_embedding_1, final_embedding_2))
final_embedding.shape

(10000, 384)

In [13]:
from sklearn.cluster import DBSCAN

# Cluster the title embeddings with DBSCAN using the cosine metric.
X = final_embedding
dbscan = DBSCAN(metric='cosine', eps=0.1, min_samples=3)
dbscan = dbscan.fit(X)

In [14]:
# Re-assemble the rows in the same order the embeddings were stacked.
final_data = pd.concat((data_sub_1, data_sub_2))
final_data.shape

(10000, 5)

In [15]:
# Attach each row's embedding (as a plain list) and its DBSCAN cluster label.
final_data = final_data.assign(
    embedding=final_embedding.tolist(),
    target=dbscan.labels_,
)

final_data.head()

Unnamed: 0,articleBody,dateModified,scrapedDate,source,title,embedding,target
0,"Sanjay Raut, a member of the Shiv Sena (UBT) p...",2023-10-25 06:35:50+00:00,2023-10-27 13:12:18.339000+00:00,https://www.thehansindia.com/,Shiv Sena MP Sanjay Raut Responds To 'Hamas' R...,"[-0.13344551622867584, 0.42215925455093384, 0....",-1
2,"Mumbai, Oct 24 (PTI) Maharashtra Chief Ministe...",2023-10-25 02:14:27+00:00,2023-10-27 13:12:18.339000+00:00,https://thefederal.com/,Uddhav buried Bal Thackeray's 'Hindutva' for p...,"[0.06483079493045807, 0.7982536554336548, -0.2...",-1
4,"October 26, 2023 08:15 pm | Updated 08:38 pm I...",2023-10-26 14:45:24+00:00,2023-10-27 13:12:45.595000+00:00,https://www.thehindu.com/,"Israel biggest terrorist nation in the world, ...","[0.12600095570087433, 0.592846155166626, -0.50...",-1
5,Eight former officers of the Indian Navy have ...,2023-10-26 11:22:00+00:00,2023-10-27 13:12:47.852000+00:00,https://english.jagran.com/,Eight Ex-Indian Navy Officers Get Death Penalt...,"[0.5108010768890381, 0.21970635652542114, -0.0...",-1
6,"October 26, 2023 07:21 pm | Updated October 27...",2023-10-26 13:51:52+00:00,2023-10-27 13:12:45.595000+00:00,https://www.thehindu.com/,Israel’s response to Hamas terrorist attack di...,"[-0.2768397629261017, 0.5264230966567993, 0.16...",-1


In [16]:
# Cluster sizes per DBSCAN label.
final_data['target'].value_counts()

target
-1      8559
 229      23
 238      23
 306      16
 17       16
        ... 
 141       3
 147       3
 148       3
 154       3
 317       3
Name: count, Length: 319, dtype: int64

In [17]:
def compute_center(embeddings, labels, noise_label=-1):
    """Return the mean embedding of every cluster, skipping the noise label.

    Parameters
    ----------
    embeddings : array-like, one row per sample
        Embedding vectors.
    labels : array-like, one entry per row of ``embeddings``
        Cluster label of each row (list, ndarray or pandas Series).
    noise_label : scalar, default -1
        Label to ignore when computing centres.

    Returns
    -------
    dict
        Maps each label other than ``noise_label`` to the mean (over axis 0)
        of the rows carrying that label. Keys are plain Python scalars.

    Raises
    ------
    ValueError
        If ``embeddings`` and ``labels`` differ in length.
    """
    # Coerce up front so `labels == label` is always an element-wise mask;
    # comparing a plain list to a scalar yields a single False instead.
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)
    if embeddings.shape[0] != labels.shape[0]:
        raise ValueError("embeddings and labels must have the same length")

    cluster_centre = {}
    for label in set(labels.tolist()):
        if label == noise_label:
            continue
        cluster_centre[label] = embeddings[labels == label].mean(axis=0)
    return cluster_centre


# One centre per cluster label found in final_data['target'].
cluster_center = compute_center(embeddings=final_embedding, labels=final_data['target'])

In [18]:
from sklearn.metrics.pairwise import cosine_similarity


def select_representative(data, embeddings, cluster_centers):
    """Pick, for each cluster, the row whose embedding is closest to the centre.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'target'`` column holding each row's cluster label.
        Row ``i`` of ``data`` must correspond to row ``i`` of ``embeddings``.
    embeddings : array-like, one row per row of ``data``
        Embedding vectors.
    cluster_centers : dict
        Maps a cluster label to its centre vector.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``cluster_centers``: the member with the highest
        cosine similarity to that centre (first one on ties).

    Raises
    ------
    ValueError
        If ``data`` and ``embeddings`` differ in length.
    """
    embeddings = np.asarray(embeddings)
    if len(data) != embeddings.shape[0]:
        raise ValueError("data and embeddings must have the same number of rows")

    # Positional labels: the frame's index is not guaranteed to be 0..n-1,
    # while the embedding matrix is always addressed by position.
    labels = data['target'].to_numpy()

    representative = []
    for label, center in cluster_centers.items():
        # Build the membership mask once and reuse it for rows and vectors.
        mask = labels == label
        cluster_data = data.loc[mask]
        cluster_embedding = embeddings[mask]

        similarities = cosine_similarity(cluster_embedding, center.reshape(1, -1)).flatten()
        most_similarities_idx = similarities.argmax()
        representative.append(cluster_data.iloc[most_similarities_idx])

    return pd.DataFrame(representative)


# One representative article per cluster centre.
selected_articles = select_representative(
    data=final_data,
    embeddings=final_embedding,
    cluster_centers=cluster_center,
)

In [19]:
# Distinct modification timestamps among the representative articles.
selected_articles['dateModified'].unique()

<DatetimeArray>
['2023-10-20 10:44:11+00:00', '2023-10-26 04:00:35+00:00',
 '2023-10-24 11:42:02+00:00', '2023-10-27 11:12:09+00:00',
 '2023-10-24 05:45:50+00:00', '2023-10-25 11:47:00+00:00',
 '2023-10-27 02:42:40+00:00', '2023-10-22 07:34:03+00:00',
 '2023-10-22 16:30:00+00:00', '2023-10-27 06:40:21+00:00',
 ...
 '2023-12-18 12:54:49+00:00', '2023-12-17 14:07:19+00:00',
 '2023-12-17 23:02:02+00:00', '2023-12-17 17:55:07+00:00',
 '2023-12-17 20:43:42+00:00', '2023-12-17 18:45:14+00:00',
 '2023-12-18 10:20:32+00:00', '2023-12-18 11:26:00+00:00',
 '2023-12-18 06:38:29+00:00', '2023-12-18 14:43:30+00:00']
Length: 318, dtype: datetime64[ns, UTC]

In [23]:
import plotly.express as px
import plotly.graph_objects as go


# Make sure the column is datetime-typed and build a "title (timestamp)"
# label for every representative article.
selected_articles['dateModified'] = pd.to_datetime(selected_articles['dateModified'])
selected_articles['event_description'] = selected_articles['title'] + " (" + selected_articles['dateModified'].astype(str) + ")"

# Keep only the events inside the half-open time window [start, end).
subset = selected_articles.loc[(selected_articles['dateModified'] >= '2023-10-20 10:44:11+0000')
                               & (selected_articles['dateModified'] < '2023-10-26 06:40:21+00:00')]

# Vertical timeline: every event is drawn as its title at x=1, positioned on
# the y axis by its modification time.
fig = go.Figure()
fig.add_trace(go.Scatter(
    y=subset['dateModified'],
    x=[1] * len(subset),
    mode="text",
    text=subset['title'],
    hovertext=subset['event_description'],  # full "title (timestamp)" label on hover
    textposition="top center"
))

# The y axis is a date axis, so tick positions must be dates as well.
# Integer positions (0..n-1) would be read as instants right after the
# 1970 epoch and never line up with the plotted events.
fig.update_layout(
    yaxis_title='Date',
    xaxis_title='Events',
    yaxis=dict(
        tickmode='array',
        tickvals=subset['dateModified'],
        ticktext=subset['dateModified'].dt.strftime('%Y-%m-%d %H:%M:%S'),
    ),
    width=800,
    height=1200,
)


fig.show()