<a href="https://colab.research.google.com/github/Tsharika/CrisisFACTS_challenge_2023/blob/main/CrisiFacts_challenge_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is part of my work on the CrisisFACTS challenge. It implements three hypotheses for solving the challenge; the main goal is to extract keywords from a given phrase or text.

CrisisFACTS is a challenge to support disaster-response managers during a crisis event. It's an open data challenge. Details can be found in the link below:


Crisisfacts challenge : https://crisisfacts.github.io/

# H1: Simple linguistic extraction of phrases from the query text
Extracts short (2-3 word) keyword phrases from the query.
Limitation: Some rows are left blank because no keyword phrase could be extracted for those queries, so this method is not very reliable.

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import string
import pandas as pd

# Fetch the NLTK resources this cell relies on: the "punkt" tokenizer data,
# the stopword corpus and the averaged-perceptron POS tagger.
for resource in ("punkt", "stopwords", "averaged_perceptron_tagger"):
    nltk.download(resource)

def extract_keywords(sentence):
    """Extract two-word keyword phrases from a sentence using POS tags.

    The sentence is lower-cased, tokenized, stripped of punctuation tokens
    and POS-tagged. Nouns, verbs, adjectives and adverbs that are not English
    stopwords are collected in order of appearance and emitted as
    space-joined pairs.

    Args:
        sentence: Text to process. Non-string values (e.g. NaN coming from an
            empty CSV cell) yield an empty list instead of raising.

    Returns:
        list[str]: Two-word phrases in order of appearance; empty when fewer
        than two content words are found.
    """
    if not isinstance(sentence, str):
        return []

    stop_words = set(stopwords.words("english"))
    tokens = [
        word
        for word in word_tokenize(sentence.lower())
        if word not in string.punctuation
    ]

    # pos_tag() emits Penn Treebank tags by default, where adjectives are JJ*
    # and adverbs are RB*. The previous "ADJ" / "ADV" / "s" prefixes never
    # matched any tag, so adjectives and adverbs were silently skipped.
    content_tag_prefixes = ("N", "V", "JJ", "RB")

    relevant_keywords = []
    phrase = []
    for word, pos in pos_tag(tokens):
        if not pos.startswith(content_tag_prefixes) or word in stop_words:
            continue
        phrase.append(word)
        # The buffer is flushed as soon as it holds two words, which is why
        # the former `len(phrase) == 3` alternative could never be reached.
        if len(phrase) == 2:
            relevant_keywords.append(" ".join(phrase))
            phrase = []

    return relevant_keywords

def generate_relevant_phrase(sentence):
    """Return the first keyword phrase extracted from ``sentence``.

    Delegates to ``extract_keywords`` and yields ``None`` when it produces
    no phrases at all.
    """
    candidates = extract_keywords(sentence)
    return candidates[0] if candidates else None


# H1 driver: load the query table, attach the first POS-based keyword phrase
# of every query text as a new column, and write the result to a new CSV.
input_csv_path = "crisisfacts_queries.csv"
output_csv_path = "output_with_extended_indicative_Linguistic.csv"

df = pd.read_csv(input_csv_path)

# One phrase (or None when nothing could be extracted) per row, in row order.
df["extended_indcative_terms_POS"] = [
    generate_relevant_phrase(query_text) for query_text in df["text"]
]

df.to_csv(output_csv_path, index=False)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.



# H2: BERT-based keyword extraction with KeyBERT

KeyBERT is an SBERT-based algorithm that generates keywords from a document.

First, a document embedding is created using Sentence-BERT. Next, embeddings are computed for the N-gram phrases in the document. Cosine similarity is then used to measure how closely each candidate phrase resembles the document, and the phrases with the highest similarity are chosen and extracted as keyword phrases.

In [6]:
!pip install keybert
from keybert import KeyBERT

Collecting keybert
  Downloading keybert-0.7.0.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers>=0.3.8->keybert)
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers>=0.3.8->keybert)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 

In [7]:
# Instantiate KeyBERT with the 'all-mpnet-base-v2' model; `kw_model` is the
# extractor used by the H2 keyword-extraction code below.
kw_model = KeyBERT(model='all-mpnet-base-v2')

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [8]:
!pip install keybert
import pandas as pd
from keybert import KeyBERT

# H2 reads the CSV written by the H1 cell and writes a new CSV that carries
# one additional KeyBERT column.
input_file_path = 'output_with_extended_indicative_Linguistic.csv'
output_file_path = 'output_with_extended_indicative_keybert.csv'

data = pd.read_csv(input_file_path)


# Builds the KeyBERT model again; the previous cell already created an
# identical `kw_model`, so this line only matters when that cell was skipped.
kw_model = KeyBERT(model='all-mpnet-base-v2')


def extract_keywords(row):
    """Return the top-ranked KeyBERT keyphrase for a row's ``text`` field.

    The module-level ``kw_model`` is asked for a single keyphrase of one to
    three words with English stopwords removed. An empty string is returned
    when the model yields no candidates.

    NOTE: this function reuses the name of the H1 helper
    ``extract_keywords(sentence)`` defined earlier in the notebook, so once
    this cell has run the name refers to this row-based version.
    """
    ranked = kw_model.extract_keywords(
        row['text'],
        keyphrase_ngram_range=(1, 3),
        stop_words='english',
        highlight=False,
        top_n=1,
    )
    if not ranked:
        return ""
    # Each entry is a (phrase, score) pair; only the phrase is kept.
    best_phrase, _score = ranked[0]
    return best_phrase


# Run the extractor on every row (axis=1) and store the result in a new column.
data['extended_indicative_terms_keybert'] = data.apply(extract_keywords, axis=1)

# Write the augmented table to disk without the DataFrame index.
data.to_csv(output_file_path, index=False)






# H3: WordNet-based synonyms of the provided indicative terms
Here, indicative terms (a set of keywords) are provided for each query. We extend the given indicative terms using WordNet.

In [9]:
import nltk
# Download the WordNet corpus that the H3 synonym lookup reads from.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [11]:
import pandas as pd
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

def get_synonyms(word):
    """Collect the WordNet lemma names of every synset of ``word``.

    Lemma names are returned in synset order, then lemma order, with
    underscores replaced by spaces. Duplicates are kept, and the list is
    empty when WordNet has no synset for the word.
    """
    return [
        lemma.name().replace("_", " ")
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]

def generate_phrase_synonyms(phrase):
    """Build a phrase by replacing each word of ``phrase`` with a synonym.

    Each token is looked up with ``get_synonyms``. The first candidate that
    differs from the token (case-insensitively) is used, so the result
    actually extends the original terms. Taking the very first candidate
    unconditionally tends to return the input word itself, because a word is
    usually a lemma of its own first synset. When every candidate equals the
    token, the first candidate is kept. Tokens without any candidate are
    dropped.

    Args:
        phrase: Text whose words should be replaced. Non-string values
            (e.g. NaN from an empty CSV cell) yield ``None``.

    Returns:
        str | None: Space-joined replacement words, or ``None`` when no token
        produced a candidate.
    """
    if not isinstance(phrase, str):
        return None

    replacements = []
    for word in word_tokenize(phrase):
        candidates = get_synonyms(word)
        if not candidates:
            # No WordNet entry for this token (e.g. punctuation): skip it.
            continue
        lowered = word.lower()
        # Prefer a genuine alternative; fall back to the first candidate.
        chosen = next(
            (candidate for candidate in candidates if candidate.lower() != lowered),
            candidates[0],
        )
        replacements.append(chosen)

    return ' '.join(replacements) if replacements else None


# H3 driver: read the query table, derive a synonym phrase for every entry of
# the chosen column, and save the table with the new column appended.
input_file_path = 'query_output_with_extended_indicative_wordnet.csv'
output_file_path = 'indicative_terms_output_all_four_types.csv'

# Column whose phrases are expanded ("text" was the commented-out alternative).
selected_column = "indicative_terms"

data = pd.read_csv(input_file_path)

# One synonym phrase (or None) per row, in row order.
synonym_phrases = [
    generate_phrase_synonyms(original_phrase)
    for original_phrase in data[selected_column]
]
data["Synonym_indicative_terms"] = synonym_phrases

data.to_csv(output_file_path, index=False)


Checking whether there are any null values in the data after extracting the indicative terms:

In [12]:
import pandas as pd
#df = pd.read_csv('output_with_extended_indicative.csv')
# Load the combined output and report, per column, how many values are null
# and at which row indexes they occur.
df = pd.read_csv('indicative_terms_output_all_four_types.csv')
null_mask = df.isnull()
for column in df.columns:
    column_nulls = null_mask[column]
    null_count = column_nulls.sum()
    # Keep only the rows flagged as null and read off their index labels.
    null_indexes = column_nulls[column_nulls].index.tolist()
    print(f"Column '{column}' has {null_count} null values at indexes: {null_indexes}")

Column 'query_id' has 0 null values at indexes: []
Column 'text' has 0 null values at indexes: []
Column 'indicative_terms' has 0 null values at indexes: []
Column 'trecis_category_mapping' has 0 null values at indexes: []
Column 'event_id' has 0 null values at indexes: []
Column 'event_title' has 0 null values at indexes: []
Column 'event_dataset' has 0 null values at indexes: []
Column 'event_description' has 0 null values at indexes: []
Column 'event_trecis_id' has 0 null values at indexes: []
Column 'event_type' has 0 null values at indexes: []
Column 'event_url' has 0 null values at indexes: []
Column 'extended_indcative_terms_POS' has 6 null values at indexes: [3, 4, 11, 26, 28, 48]
Column 'extended_indicative_terms_keybert' has 0 null values at indexes: []
Column 'Synonym_texts' has 0 null values at indexes: []
Column 'Synonym_indicative_terms' has 1 null values at indexes: [29]
