<a href="https://colab.research.google.com/github/RedietNegash/Machine-Learning/blob/main/semantic_search_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import re
import string
import unicodedata

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Read the headline CSV (path is relative to the working directory) and
# preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer='abcnews-date-text.csv')
df_train.head(n=5)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [14]:
# Count missing values per column. As the cell's last expression the resulting
# Series is displayed directly, so no print() wrapper is needed.
df_train.isna().sum()

publish_date     0
headline_text    0
dtype: int64


In [15]:
# Fetch the NLTK stop-word lists; when the package is already present locally
# NLTK just reports it as up to date. `nltk` is imported in the import cell
# at the top of the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Hold the English stop words in a set: process_text checks membership for
# every token of every headline, and a set answers that in constant time
# whereas a list is scanned linearly on each check.
stop_words = set(stopwords.words('english'))

# Shared lemmatizer instance reused by process_text.
lemmatizer = WordNetLemmatizer()

In [17]:
def process_text(text):
    """Normalise a piece of text into a space-joined string of lemmas.

    Steps, in order:
      1. Decompose Unicode (NFKD) and drop every non-ASCII code point.
      2. Lower-case and split on whitespace.
      3. For each token, strip everything that is not a letter.
      4. Drop tokens that are empty after stripping or that are stop words.
      5. Lemmatize what remains with the module-level ``lemmatizer``.

    Args:
        text: The raw text. ``None`` and float NaN (pandas' missing-value
            marker) yield an empty string; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned tokens joined by single spaces (possibly ``""``).
    """
    # Missing cells would otherwise raise TypeError in unicodedata.normalize.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    cleaned = []
    for token in text.lower().split():
        # Removing all non-letters also removes punctuation, so no separate
        # string.punctuation pass is required.
        letters = re.sub(r'[^a-z]', '', token)
        if not letters:
            continue
        # Test the raw token as well as the stripped one: a stop-list entry
        # that contains an apostrophe can only match before the apostrophe
        # has been stripped.
        if token in stop_words or letters in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(letters))
    return " ".join(cleaned)

-> Creating a new column with the processed text

In [18]:
# Fetch the WordNet corpus used by WordNetLemmatizer. `nltk` is imported in
# the import cell at the top of the notebook.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
# Run every raw headline through process_text and keep the cleaned string in
# its own column, leaving the original headline untouched.
df_train['processed_headline_text'] = df_train['headline_text'].map(process_text)


Unnamed: 0,processed_headline_text
0,aba decides community broadcasting licence
1,act fire witness must aware defamation
2,g call infrastructure protection summit
3,air nz staff aust strike pay rise
4,air nz strike affect australian traveller


In [26]:
# Preview the first five cleaned headlines.
df_train['processed_headline_text'].head(n=5)

Unnamed: 0,processed_headline_text
0,aba decides community broadcasting licence
1,act fire witness must aware defamation
2,g call infrastructure protection summit
3,air nz staff aust strike pay rise
4,air nz strike affect australian traveller


-> Tokenize the processed text

In [24]:
# Break each cleaned headline into its whitespace-separated words.
df_train['tokens'] = df_train['processed_headline_text'].str.split()

-> Train the model

In [21]:
# Fit Word2Vec embeddings on the tokenised headlines.
word2vec_model = Word2Vec(
    sentences=df_train['tokens'],
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep words that occur only once
    workers=1,        # single training thread
)

-> Convert a document to a vector by averaging its word vectors

In [22]:

def document_vector(doc):
    """Return the mean Word2Vec embedding of a tokenised document.

    Uses the module-level ``word2vec_model``.

    Args:
        doc: Iterable of word tokens.

    Returns:
        A 1-D numpy array of length ``word2vec_model.vector_size``: the mean
        of the vectors of the tokens found in the model's vocabulary, or all
        zeros when none of the tokens are known.
    """
    known_words = [word for word in doc if word in word2vec_model.wv]
    if not known_words:
        # Use the embeddings' own dtype for the fallback so that stacking
        # document vectors later does not upcast the whole matrix.
        return np.zeros(word2vec_model.vector_size, dtype=word2vec_model.wv.vectors.dtype)
    return np.mean(word2vec_model.wv[known_words], axis=0)


# Embed every tokenised headline once up front and store the result per row.
df_train['vector'] = df_train['tokens'].map(document_vector)

In [23]:

def semantic_search(query, df_train, word2vec_model, top_k=None):
    """Rank the rows of ``df_train`` by cosine similarity to ``query``.

    Args:
        query: Free-text search string; it is cleaned with ``process_text``
            before being embedded.
        df_train: DataFrame with a ``'vector'`` column holding one embedding
            array per row.
        word2vec_model: Word2Vec model used to embed the query. The query is
            embedded with this argument rather than with any global model.
        top_k: Optional maximum number of rows to return. ``None`` (the
            default) returns every row, ranked.

    Returns:
        The rows of ``df_train`` ordered from most to least similar. An empty
        frame with the same columns is returned when ``df_train`` is empty or
        when no query token is in the model's vocabulary, because every
        similarity would then be zero and the ordering meaningless.
    """
    query_tokens = process_text(query).split()
    known_tokens = [token for token in query_tokens if token in word2vec_model.wv]
    if df_train.empty or not known_tokens:
        return df_train.iloc[0:0]

    query_vector = np.mean(word2vec_model.wv[known_tokens], axis=0)
    # Stack the per-row vectors into one (n_rows, n_dims) matrix.
    doc_matrix = np.stack(df_train['vector'].tolist())
    scores = cosine_similarity(query_vector.reshape(1, -1), doc_matrix)[0]

    # argsort is ascending, so reverse it to put the best matches first.
    ranked_indices = np.argsort(scores)[::-1]
    if top_k is not None:
        ranked_indices = ranked_indices[:top_k]
    return df_train.iloc[ranked_indices]


# Example search: rank all headlines against a sample query.
query = "latest economic developments"
results = semantic_search(query, df_train, word2vec_model)

# Print the original (unprocessed) headline of the five best matches.
print(f"Top relevant documents for query '{query}':")
for headline in results.head()['headline_text']:
    print(f"Title: {headline}")

Top relevant documents for query 'latest economic developments':
Title: infrastructure vital to economic development
Title: economic development strategy in the pipeline
Title: committee to direct campaspe economic growth
Title: economic growth sluggish
Title: the rbas economic possibilities
