In [1]:
!pip install python-terrier # install pyTerrier



In [2]:
import json
import os
import requests
import ir_datasets
import random
import numpy as np
import pandas as pd
import pyterrier as pt

In [3]:
!pip install git+https://github.com/allenai/ir_datasets.git@crisisfacts

Collecting git+https://github.com/allenai/ir_datasets.git@crisisfacts
  Cloning https://github.com/allenai/ir_datasets.git (to revision crisisfacts) to /tmp/pip-req-build-qc4hrgyn
  Running command git clone --filter=blob:none --quiet https://github.com/allenai/ir_datasets.git /tmp/pip-req-build-qc4hrgyn
  Running command git checkout -b crisisfacts --track origin/crisisfacts
  Switched to a new branch 'crisisfacts'
  Branch 'crisisfacts' set up to track remote branch 'crisisfacts' from 'origin'.
  Resolved https://github.com/allenai/ir_datasets.git to commit e2359e24c9546e2a62284cd1aec6138295bb5ec5
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [4]:
import datetime
import time

In [5]:
credentials = {
    "institution": "University of Foo", # University, Company or Public Agency Name
    "contactname": "Foo Bar", # Your Name
    "email": "foo@bar.edu", # A contact email address
    "institutiontype": "Research" # Either 'Research', 'Industry', or 'Public Sector'
}

home_dir = os.path.expanduser('~')
!mkdir -p ~/.ir_datasets/auth/
with open(home_dir + '/.ir_datasets/auth/crisisfacts.json', 'w') as f:
    json.dump(credentials, f)

In [6]:
# Event numbers as a list
eventNoList = [
          "001", # Lilac Wildfire 2017
          "002", # Cranston Wildfire 2018
          "003", # Holy Wildfire 2018
          "004", # Hurricane Florence 2018
          "005", # 2018 Maryland Flood
          "006", # Saddleridge Wildfire 2019
          "007", # Hurricane Laura 2020
          "008" # Hurricane Sally 2020
]

In [7]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

True

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import random

In [10]:
def similarity(texts,input_string):
  # Preprocess the input string and the texts
  stop_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()

  def preprocess(text):
      tokens = word_tokenize(text.lower())
      tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
      return " ".join(tokens)

  input_processed = preprocess(input_string)
  texts_processed = [preprocess(text) for text in texts]

  # Compute the TF-IDF vectors for the input string and the texts
  vectorizer = TfidfVectorizer()
  vectors = vectorizer.fit_transform(texts_processed + [input_processed])

  # Compute the cosine similarity between the input vector and the text vectors
  input_vector = vectors[-1]
  text_vectors = vectors[:-1]
  similarities = np.dot(text_vectors, input_vector.T).toarray().flatten()

  # Find the index of the text with the highest similarity
  max_index = np.argmax(similarities)

  # Return the text with the highest similarity
  most_similar_text = texts[max_index]

  return most_similar_text

In [11]:
# Gets the list of days for a specified event number, e.g. '001'
def getDaysForEventNo(eventNo):
  # We will download a file containing the day list for an event
  url = "http://trecis.org/CrisisFACTs/CrisisFACTS-"+eventNo+".requests.json"
  # Download the list and parse as JSON
  dayList = requests.get(url).json()
  return dayList

In [12]:
import datetime
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the BART-based summarization model and tokenizer
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


### BART model

In [14]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the BART-based summarization model and tokenizer
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


In [15]:
def summary_bart(query,retriever):
  query_df=pd.DataFrame(retriever.search(query))
  texts=list(query_df['text'])
  scores=list(query_df['score'])
  # Generate summaries using BART, with weights based on scores
  weighted_summaries = []
  for i in range(len(texts)):
      # Tokenize the text
      inputs = tokenizer(texts[i], return_tensors="pt", truncation=True, padding=True)

      # Generate a summary with the specified length and weights
      summary = model.generate(
          inputs["input_ids"],
          attention_mask=inputs["attention_mask"],
          length_penalty=2.0,
          min_length=10,
          max_length=40,
          num_beams=4,
          early_stopping=True,
          repetition_penalty=1.0,
          no_repeat_ngram_size=2,
          decoder_start_token_id=model.config.pad_token_id,
          num_return_sequences=1,
          output_scores=True,
      )

      # Decode the summary and append to the list of summaries
      summary_text = tokenizer.decode(summary[0], skip_special_tokens=True)
      weighted_summaries.append(summary_text)

  # Print the summaries
  sum=''
  for summary in weighted_summaries:
      sum+=summary
  return sum

In [16]:
def dataset_pyterier(data_link):
  if not pt.started():
    pt.init()
  data_link = "irds:"+data_link
  print(data_link)
  pyTerrierDataset = pt.get_dataset(data_link)
  print("no")
  indexer = pt.IterDictIndexer("None", type=pt.index.IndexingType(3), meta=['docno', 'text'], meta_lengths=[40, 200])
  index = indexer.index(pyTerrierDataset.get_corpus_iter())
  retriever = pt.BatchRetrieve(index, wmodel="DFReeKLIM", metadata=["docno", "text"])
  return retriever

In [17]:
def summary(event,query):
  event_days = getDaysForEventNo(event)
  # Dates for a particular event
  days_list = []
  for d in event_days:
    days_list.append(d["dateString"])
  print(days_list)
  day=str(input())
  event_day_data = 'crisisfacts/'+event+'/'+day
  # dataset = ir_datasets.load(event_day_data)
  # itemsAsDataFrame = pd.DataFrame(dataset.docs_iter())
  # itemQueries = pd.DataFrame(dataset.queries_iter())

  retriever = dataset_pyterier(event_day_data)
  print("yes")
  a=summary_bart(query,retriever)
  print("*********************************************************************************************************************************")

  return a

In [18]:
summary("001","injuries")

['2017-12-07', '2017-12-08', '2017-12-09', '2017-12-10', '2017-12-11', '2017-12-12', '2017-12-13', '2017-12-14', '2017-12-15']
2017-12-07


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



irds:crisisfacts/001/2017-12-07
no


[INFO] [starting] building docstore
[INFO] [starting] requesting access key
[INFO] [finished] requesting access key [1.33s]
docs_iter: 7288doc [00:08, 843.96doc/s]
[INFO] [finished] docs_iter: [00:08] [7288doc] [843.84doc/s]
[INFO] [finished] building docstore [8.68s]


crisisfacts/001/2017-12-07 documents: 0it [00:00, ?it/s]

yes
*********************************************************************************************************************************


"A San Diego County Sheriff's Department deputy suffered minor injuries while directing traffic.Two civilians are being treated for burn injuries suffered in #LilacFire and are taking to a hospital, https://t.co/Q2EOJFPVvIHeadon collision between Sheriff's vehicle and private vehicle in thick smoke #LilacFire  Moderate damage, no injuries reportedUPDATE: #LilacFire now 2000 acres, 0% containment.  Â Two civilians are being treated for burn injuries suffered in https://t.co/Dp4Strong winds and low humidity fueling the Lilac Fire burning in Bonsall. To add insult to injury the Red Flag Warni https://t.co/FlpsJPuTS CAL FIRE/SAN DIEGO COUNTY FIRE (@CALFIRESANDIEGo) December 8, 2017  Four residents sustained burn injuries as they were evacuating their home,Besides the civilian injuries, one firefighter suffered smoke inhalation.It was not clear if it was a factor in the injury."

In [19]:
summary("006","Is the whole forest burnt")

['2019-10-10', '2019-10-11', '2019-10-12', '2019-10-13']
2019-10-10


[INFO] [starting] building docstore


irds:crisisfacts/006/2019-10-10
no


[INFO] [starting] requesting access key
[INFO] [finished] requesting access key [1.01s]
docs_iter: 6993doc [00:06, 1150.93doc/s]
[INFO] [finished] docs_iter: [00:06] [6993doc] [1150.60doc/s]
[INFO] [finished] building docstore [6.09s]


crisisfacts/006/2019-10-10 documents: 0it [00:00, ?it/s]

yes
*********************************************************************************************************************************


"Can we take these people out of the forest @WalkingDead_AMC?! Weve only seen this forest for like 5 years now. Pl https://t.co/i burnt myself cooking and we have zero bandages in my house, so im making do with scotch tape and toilet paperLeafys rooting for LAs urban forest + the @Dodgers tonight! https://t.co/qetbgUeaLZAn illustrious career including prominent roles in films such as Forest Gump, Contact, Apollo 13, Cast Away to an a https://t.co/neekLa8ZTM@FNMATTTT @MannyJoose The sad thing is the Nats have no chance at actually winning the whole thing lolWhy have I been fooling around my whole life. If I wanted to win I shouldve liked the Yankees.@lovelyness_8 you play a whole season to get to the playoffs lol no other reasonThis whole time... I thought they where talking about @KREAYSHAWN and how she did something at the #Dodgersgame... https://t.co/uEB@PatsLadi99 @MollyJongFast The whole gun thing is a joke to them...and don't get me started on the bible.Im about to curse my daddy 

### BART - TIMESTAMP

In [23]:
def summary_bart1(texts):
  # Generate summaries using BART, with weights based on scores
  weighted_summaries = []
  for i in range(len(texts)):
      # Tokenize the text
      inputs = tokenizer(texts[i], return_tensors="pt", truncation=True, padding=True)

      # Generate a summary with the specified length and weights
      summary = model.generate(
          inputs["input_ids"],
          attention_mask=inputs["attention_mask"],
          length_penalty=2.0,
          min_length=10,
          max_length=40,
          num_beams=4,
          early_stopping=True,
          repetition_penalty=1.0,
          no_repeat_ngram_size=2,
          decoder_start_token_id=model.config.pad_token_id,
          num_return_sequences=1,
          output_scores=True,
      )

      # Decode the summary and append to the list of summaries
      summary_text = tokenizer.decode(summary[0], skip_special_tokens=True)
      weighted_summaries.append(summary_text)

  # Print the summaries
  sum=''
  for summary in weighted_summaries:
      sum+=summary
  return sum

In [24]:
def date_timestamp(date_str,start_time_str,end_time_str):
  time_diff = "+00:00"
  start_dt = datetime.datetime.strptime(date_str + " " + start_time_str, "%Y-%m-%d %H:%M:%S")
  end_dt = datetime.datetime.strptime(date_str + " " + end_time_str, "%Y-%m-%d %H:%M:%S")

  # Convert datetime objects to Unix timestamps
  start_ts = int(start_dt.replace(tzinfo=datetime.timezone(datetime.timedelta(hours=int(time_diff[1:3])))).timestamp())
  end_ts = int(end_dt.replace(tzinfo=datetime.timezone(datetime.timedelta(hours=int(time_diff[1:3])))).timestamp())

  return start_ts,end_ts

def timestamp_date(start_ts,end_ts):
  # Convert Unix timestamps to datetime objects
  start_dt = datetime.datetime.utcfromtimestamp(start_ts)
  end_dt = datetime.datetime.utcfromtimestamp(end_ts)

  # Convert datetime objects to date and time strings
  date_str = start_dt.strftime("%Y-%m-%d")
  start_time_str = start_dt.strftime("%H:%M:%S")
  end_time_str = end_dt.strftime("%H:%M:%S")

  return start_time_str,end_time_str



def unixtotime(ts_list,date_str):
  # Convert date string to datetime object
  date_dt = datetime.datetime.strptime(date_str, "%Y-%m-%d").date()

  # Filter the list to timestamps for the given date
  ts_for_date = [ts for ts in ts_list if datetime.datetime.utcfromtimestamp(ts).date() == date_dt]
  min_ts = min(ts_for_date)
  max_ts = max(ts_for_date)
  min_dt = datetime.datetime.utcfromtimestamp(min_ts)
  max_dt = datetime.datetime.utcfromtimestamp(max_ts)

    # Convert datetime objects to time strings
  min_time_str = min_dt.strftime("%H:%M:%S")
  max_time_str = max_dt.strftime("%H:%M:%S")

  return min_time_str , max_time_str



def important_texts(que,items_df,mini_stamp,maxi_stamp):
  que_l = que.split(" ")
  txts = list(items_df["text"])
  t_stmps = list(items_df["unix_timestamp"])
  imp_txts = []
  for i in range(len(txts)):
    cnt = 0
    for q in que_l:
      if q in txts[i] and (mini_stamp<= t_stmps[i] <=maxi_stamp):
        imp_txts.append(txts[i])
        break
    #     cnt+=1
    # if cnt == len(que_l):
    #   imp_txts.append(txts[i])

  return imp_txts

In [25]:
def summary(event):
  event_days = getDaysForEventNo(event)
  # Dates for a particular event
  days_list = []
  for d in event_days:
    days_list.append(d["dateString"])
  print(days_list)
  day=str(input())
  event_day_data = 'crisisfacts/'+event+'/'+day
  dataset = ir_datasets.load(event_day_data)
  itemsAsDataFrame = pd.DataFrame(dataset.docs_iter())
  itemQueries = pd.DataFrame(dataset.queries_iter())

  # --------- for queries --------------

  ind_terms = list(itemQueries["indicative_terms"])
  query = random.choice(ind_terms)


  # ----------for timestamps----------
  time_stamps = list(itemsAsDataFrame["unix_timestamp"])

  mini_time , maxi_time = unixtotime(time_stamps,day)

  print(f"Select the time range from {mini_time} to {maxi_time}")
  time_st = str(input())
  time_en = str(input())
  mini_stamp , maxi_stamp = date_timestamp(day,time_st,time_en)
  # print(mini_stamp , maxi_stamp)

  print(query)
  req_texts = important_texts(query,itemsAsDataFrame,mini_stamp,maxi_stamp)
  # -------- BART MODEL -----------
  final_Summ = summary_bart1(req_texts)

  return final_Summ

In [26]:
summary("001")

['2017-12-07', '2017-12-08', '2017-12-09', '2017-12-10', '2017-12-11', '2017-12-12', '2017-12-13', '2017-12-14', '2017-12-15']
2017-12-07
Select the time range from 00:00:00 to 23:59:56
00:02:03
22:58:59
volunteer


"Great volunteers and friends at @lgbtcenter’s GGG Game & Lamp; Trivia… https://t.co/CkhaXrOPRXOur parent volunteers were snapping tons of photos and video from tonights Advent Candle Celebration. More video cl… https://t.co/kRsCmv4kFlThe Del Mar Fairgrounds Animal Evacuation Center is currently fully staffed with volunteers but a spokeswoman for the center said people who wish to volunteer on Friday can show up beginning at 6 aFor more information on how to volunteer visit here.VOLUNTEERING  San Diego County was looking for volunteers for their 211 call center.Anyone interested in volunteering through Red Cross can sign up here.Also, a uniform dispatch system provides seamless communication between agencies, small volunteer fire districts have been consolidated and strengthened and power lines are even more closely monitoredOfficials for the county's information line, 2-1- 1, said Thursday evening they were in need of volunteers at their call center, where people affected by the fire 

In [29]:
summary("004")

['2018-09-01', '2018-09-04', '2018-09-05', '2018-09-07', '2018-09-08', '2018-09-09', '2018-09-10', '2018-09-11', '2018-09-12', '2018-09-13', '2018-09-14', '2018-09-15', '2018-09-16', '2018-09-17', '2018-09-18']
2018-09-01


[INFO] [starting] building docstore
[INFO] [starting] requesting access key
[INFO] [finished] requesting access key [240ms]
docs_iter: 7959doc [00:01, 5052.44doc/s]
[INFO] [finished] docs_iter: [00:01] [7959doc] [5046.12doc/s]
[INFO] [finished] building docstore [1.58s]
[INFO] [starting] requesting access key
[INFO] [finished] requesting access key [193ms]


Select the time range from 00:18:48 to 22:52:50
00:20:50
20:50:50
shelters


"Per weather channel: “Authorities in Charleston say shelters cant sustain above cat 3”  Well. Fuck.Just in case anything bad happens,  does anybody know if any shelters in South Carolina allow dogs?Have them current on their shots and have records, I think. Once shelters start to open they announce which ones are pet friendly.Too soon for that, but yes there will be shelters for people with animals.I am not aware of any shelters that take animals only, based on past evacuations, but could be wrong.Edit: And there are always local shelters that would allow her to stay relatively close to home.https://em911.nhcgov.com/be-prepared-ready-nuhc/shelters-evacuation/Even if you have to resort to emergency shelters that's better than being in a flooded home or a mobile/premanufactured home.If you want, leave him a list of shelters in the area and your counties emergency management number."