In [2]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'Author'
__email__ = 'Email'

# Detecting Contradiction at the Lexical Level
## Wiki

In [3]:
# dependency
# built-in
import os, time, string, requests
# public
import torch
from tqdm import tqdm
import pandas as pd
import torch.nn.functional as F
# # private
from config import Config


%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

## I/O

# Helper

In [4]:
# remove_punctuations
def remove_punctuations(text):
    """Return ``text`` with ASCII and full-width/CJK punctuation characters removed."""
    # every character listed here is deleted from the input
    unwanted = string.punctuation + '《》【】、，。！？：；“”‘’（）…￥'
    # mapping a code point to None tells str.translate to drop that character
    deletion_table = dict.fromkeys(map(ord, unwanted))
    return text.translate(deletion_table)

# Load SemEval25 Task2 Data

In [5]:
# Location of the SemEval25 Task2 file (JSON Lines: one record per line)
zh_jsonl = 'res/data/semeval25task2/zh_T2S.jsonl'
# Parse the file into a DataFrame and preview the first rows
raw_df = pd.read_json(path_or_buf=zh_jsonl, lines=True)
raw_df.head(n=5)


Unnamed: 0,id,wikidata_id,entity_types,source,targets,source_locale,target_locale
0,Q1036741_0,Q1036741,[Food],Are soybean sprouts a common ingredient in Asi...,"[{'translation': '黄豆芽是亚洲美食的常见食材吗？', 'mention':...",en,zh
1,Q1036741_1,Q1036741,[Food],What are the health benefits of consuming soyb...,"[{'translation': '食用黄豆芽对健康有什么好处？', 'mention': ...",en,zh
2,Q1036741_2,Q1036741,[Food],Can soybean sprouts be grown at home?,"[{'translation': '黄豆芽可以在家里种植吗？', 'mention': '黄...",en,zh
3,Q104007933_0,Q104007933,[Movie],How long is the runtime of Munich: The Edge of...,"[{'translation': '慕尼黑交锋》的片长是多少？', 'mention': '...",en,zh
4,Q104007933_1,Q104007933,[Movie],What is the genre of the movie Munich: The Edg...,"[{'translation': '电影慕尼黑交锋》属于什么类型？', 'mention':...",en,zh


In [6]:
# Derive the `mention` column from the `targets` column: take the mention of
# the first entry in each row's targets list and strip its punctuation
raw_df['mention'] = raw_df['targets'].map(
    lambda targets: remove_punctuations(targets[0]['mention'])
)
raw_df.head()

Unnamed: 0,id,wikidata_id,entity_types,source,targets,source_locale,target_locale,mention
0,Q1036741_0,Q1036741,[Food],Are soybean sprouts a common ingredient in Asi...,"[{'translation': '黄豆芽是亚洲美食的常见食材吗？', 'mention':...",en,zh,黄豆芽
1,Q1036741_1,Q1036741,[Food],What are the health benefits of consuming soyb...,"[{'translation': '食用黄豆芽对健康有什么好处？', 'mention': ...",en,zh,黄豆芽
2,Q1036741_2,Q1036741,[Food],Can soybean sprouts be grown at home?,"[{'translation': '黄豆芽可以在家里种植吗？', 'mention': '黄...",en,zh,黄豆芽
3,Q104007933_0,Q104007933,[Movie],How long is the runtime of Munich: The Edge of...,"[{'translation': '慕尼黑交锋》的片长是多少？', 'mention': '...",en,zh,慕尼黑交锋
4,Q104007933_1,Q104007933,[Movie],What is the genre of the movie Munich: The Edg...,"[{'translation': '电影慕尼黑交锋》属于什么类型？', 'mention':...",en,zh,慕尼黑交锋


In [7]:
# Collect the distinct Wikidata ids present in the data and report how many there are
wiki_ids = pd.unique(raw_df['wikidata_id'])
print(f'wiki_ids: {len(wiki_ids)}')

wiki_ids: 250


In [8]:
# Gather the distinct primary entity types, i.e. the first element of each
# row's `entity_types` list
entity_types = {types[0] for types in raw_df['entity_types']}
print(f'entity_types: {entity_types}')

entity_types: {'Person', 'Food', 'TV series', 'Musical work', 'Fictional entity', 'Landmark', 'Place of worship', 'Book', 'Book series', 'Movie', 'Artwork'}


## Movie Director

In [9]:
# Distinct Wikidata ids of the rows whose first listed entity type is 'Movie'
movie_wiki_ids = raw_df.loc[
    raw_df['entity_types'].map(lambda types: types[0]).eq('Movie'),
    'wikidata_id',
].unique()
print(f'movie_wiki_ids: {len(movie_wiki_ids)}')

movie_wiki_ids: 32


In [10]:
# Restrict the raw frame to the rows whose Wikidata id belongs to a movie
movie_df = raw_df.loc[raw_df['wikidata_id'].isin(movie_wiki_ids)]
movie_df.head()

Unnamed: 0,id,wikidata_id,entity_types,source,targets,source_locale,target_locale,mention
3,Q104007933_0,Q104007933,[Movie],How long is the runtime of Munich: The Edge of...,"[{'translation': '慕尼黑交锋》的片长是多少？', 'mention': '...",en,zh,慕尼黑交锋
4,Q104007933_1,Q104007933,[Movie],What is the genre of the movie Munich: The Edg...,"[{'translation': '电影慕尼黑交锋》属于什么类型？', 'mention':...",en,zh,慕尼黑交锋
5,Q104007933_2,Q104007933,[Movie],When was Munich: The Edge of War released?,"[{'translation': '慕尼黑交锋》是什么时候上映的？', 'mention':...",en,zh,慕尼黑交锋
36,Q116313532_0,Q116313532,[Movie],Who are the main actors in the movie Till the ...,"[{'translation': '电影直至晚上终结中的主要演员是谁？', 'mention...",en,zh,直至晚上终结
37,Q116313532_1,Q116313532,[Movie],When was the movie Till the End of the Night r...,"[{'translation': '电影直至晚上终结什么时候上映？', 'mention':...",en,zh,直至晚上终结


In [11]:
# Take the first movie id as a single example and print it
wiki_id = movie_wiki_ids[0]
print(f'wiki_id: {wiki_id}')

wiki_id: Q104007933


In [12]:
def get_director(wiki_id, timeout=30, user_agent='semeval25task2-notebook/0.1 (python-requests)'):
  """Look up a movie's label and the labels of its directors on Wikidata.

  Sends one SPARQL query for the director (P57) statements of ``wiki_id`` to
  the Wikidata query endpoint, then sleeps for one second so that calls made
  in a loop are spaced out.

  Args:
    wiki_id: Wikidata entity id, e.g. ``'Q104007933'``. It is interpolated
      into the query text as-is.
    timeout: Seconds to wait for the HTTP response before giving up, so a
      stalled connection cannot hang the notebook.
    user_agent: Value sent in the ``User-Agent`` header. Replace the default
      with a string that identifies your project/contact.

  Returns:
    A ``(movie_name, directors)`` tuple: ``movie_name`` is the movie label of
    the first result row and ``directors`` is the set of director labels
    across all result rows.

  Raises:
    requests.HTTPError: If the endpoint answers with an error status.
    IndexError: If the query returns no rows (no director found for
      ``wiki_id``). The exception type matches what indexing the empty result
      list raised before; only the message is new.
  """
  query = f"""
  SELECT ?movieLabel ?directorLabel WHERE {{
    wd:{wiki_id} wdt:P57 ?director.
    BIND(wd:{wiki_id} AS ?movie)
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
  }}
  """

  url = "https://query.wikidata.org/sparql"
  headers = {"Accept": "application/json", "User-Agent": user_agent}
  response = requests.get(url, params={"query": query}, headers=headers, timeout=timeout)
  # Surface the real HTTP status (e.g. throttling) instead of a confusing
  # JSON decode error on a non-JSON error page
  response.raise_for_status()
  data = response.json()

  # Each binding is one (movie, director) row of the SPARQL result
  results = data.get("results", {}).get("bindings", [])

  # Pause between consecutive requests
  time.sleep(1)

  if not results:
    raise IndexError(f"Wikidata returned no director (P57) rows for {wiki_id}")

  movie_name = results[0]['movieLabel']['value']
  directors = {row['directorLabel']['value'] for row in results}

  return movie_name, directors

In [13]:
# Fetch (movie label, directors) for every movie id. Each get_director call
# does a network request and pauses, so show a progress bar while it runs.
results_dict = {}
for movie_id in tqdm(movie_wiki_ids, desc='directors'):
  results_dict[movie_id] = get_director(movie_id)

In [14]:
# Collapse each movie's directors into one comma-separated string.
# - sorted() fixes the order, since set iteration order can differ between runs
# - a value that is already a string is kept as-is, so re-running this cell
#   does not split a name into characters via ', '.join('name')
results_dict = {
    wikidata_id: (
        movie,
        directors if isinstance(directors, str) else ', '.join(sorted(directors)),
    )
    for wikidata_id, (movie, directors) in results_dict.items()
}

In [15]:
# Peek at the value stored under the first key
results_dict[list(results_dict)[0]]

('Munich: The Edge of War', 'Christian Schwochow')

In [16]:
# Build a frame with one row per dict entry, then move the dict keys out of
# the index into a regular `index` column
results_df = pd.DataFrame.from_dict(results_dict, orient='index')
results_df = results_df.reset_index()
results_df.head()

Unnamed: 0,index,0,1
0,Q104007933,Munich: The Edge of War,Christian Schwochow
1,Q116313532,Till the End of the Night,Christoph Hochhäusler
2,Q1214749,A Day at the Races,Sam Wood
3,Q1216050,Snow Queen,David Wu
4,Q1345583,The Bad and the Beautiful,Vincente Minnelli


In [17]:
# Keep the first three columns and give them descriptive names
results_df = results_df.iloc[:, :3].set_axis(
    ['wikidata_id', 'movie', 'director'], axis=1
)
results_df.head()

Unnamed: 0,wikidata_id,movie,director
0,Q104007933,Munich: The Edge of War,Christian Schwochow
1,Q116313532,Till the End of the Night,Christoph Hochhäusler
2,Q1214749,A Day at the Races,Sam Wood
3,Q1216050,Snow Queen,David Wu
4,Q1345583,The Bad and the Beautiful,Vincente Minnelli


In [18]:
# Attach the movie label and director to every row via the shared
# `wikidata_id` key (left join keeps all rows of movie_df).
# Any `movie`/`director` columns left from an earlier run of this cell are
# dropped first; otherwise a re-run would yield suffixed `movie_x`/`movie_y`
# columns and the later lookups of `movie_df['movie']` would fail.
movie_df = (
    movie_df
    .drop(columns=['movie', 'director'], errors='ignore')
    .merge(results_df, on='wikidata_id', how='left')
)
movie_df.head()

Unnamed: 0,id,wikidata_id,entity_types,source,targets,source_locale,target_locale,mention,movie,director
0,Q104007933_0,Q104007933,[Movie],How long is the runtime of Munich: The Edge of...,"[{'translation': '慕尼黑交锋》的片长是多少？', 'mention': '...",en,zh,慕尼黑交锋,Munich: The Edge of War,Christian Schwochow
1,Q104007933_1,Q104007933,[Movie],What is the genre of the movie Munich: The Edg...,"[{'translation': '电影慕尼黑交锋》属于什么类型？', 'mention':...",en,zh,慕尼黑交锋,Munich: The Edge of War,Christian Schwochow
2,Q104007933_2,Q104007933,[Movie],When was Munich: The Edge of War released?,"[{'translation': '慕尼黑交锋》是什么时候上映的？', 'mention':...",en,zh,慕尼黑交锋,Munich: The Edge of War,Christian Schwochow
3,Q116313532_0,Q116313532,[Movie],Who are the main actors in the movie Till the ...,"[{'translation': '电影直至晚上终结中的主要演员是谁？', 'mention...",en,zh,直至晚上终结,Till the End of the Night,Christoph Hochhäusler
4,Q116313532_1,Q116313532,[Movie],When was the movie Till the End of the Night r...,"[{'translation': '电影直至晚上终结什么时候上映？', 'mention':...",en,zh,直至晚上终结,Till the End of the Night,Christoph Hochhäusler


In [19]:
# Build the `question` column by filling each movie label into the template
template = 'Who is the director of the movie {}?'
questions = list(map(template.format, movie_df['movie']))
movie_df['question'] = questions
movie_df.head()


Unnamed: 0,id,wikidata_id,entity_types,source,targets,source_locale,target_locale,mention,movie,director,question
0,Q104007933_0,Q104007933,[Movie],How long is the runtime of Munich: The Edge of...,"[{'translation': '慕尼黑交锋》的片长是多少？', 'mention': '...",en,zh,慕尼黑交锋,Munich: The Edge of War,Christian Schwochow,Who is the director of the movie Munich: The E...
1,Q104007933_1,Q104007933,[Movie],What is the genre of the movie Munich: The Edg...,"[{'translation': '电影慕尼黑交锋》属于什么类型？', 'mention':...",en,zh,慕尼黑交锋,Munich: The Edge of War,Christian Schwochow,Who is the director of the movie Munich: The E...
2,Q104007933_2,Q104007933,[Movie],When was Munich: The Edge of War released?,"[{'translation': '慕尼黑交锋》是什么时候上映的？', 'mention':...",en,zh,慕尼黑交锋,Munich: The Edge of War,Christian Schwochow,Who is the director of the movie Munich: The E...
3,Q116313532_0,Q116313532,[Movie],Who are the main actors in the movie Till the ...,"[{'translation': '电影直至晚上终结中的主要演员是谁？', 'mention...",en,zh,直至晚上终结,Till the End of the Night,Christoph Hochhäusler,Who is the director of the movie Till the End ...
4,Q116313532_1,Q116313532,[Movie],When was the movie Till the End of the Night r...,"[{'translation': '电影直至晚上终结什么时候上映？', 'mention':...",en,zh,直至晚上终结,Till the End of the Night,Christoph Hochhäusler,Who is the director of the movie Till the End ...


In [20]:
# Write movie_df to a tab-separated file, leaving out the row index
movie_df.to_csv(
    path_or_buf='res/data/semeval25task2/movie.tsv',
    sep='\t',
    index=False,
)