# JOKER Track @ CLEF 2024: Automatic Wordplay Analysis


In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Task 1: Humour-aware information retrieval

In [None]:
%cd "/content/drive/MyDrive/BIP/JOKER/Task 1 - retrieval"

In [None]:
ls

In [None]:
# Read the Task 1 corpus, training qrels and training queries (in that order)
# straight into DataFrames.
corpus_data, qrels_train, queries_train = (
    pd.read_json(json_path)
    for json_path in (
        "/content/drive/MyDrive/BIP/JOKER/Task 1 - retrieval/joker_2024_task1_corpus.json",
        "/content/drive/MyDrive/BIP/JOKER/Task 1 - retrieval/joker_2024_task1_qrels_train.json",
        "/content/drive/MyDrive/BIP/JOKER/Task 1 - retrieval/joker_2024_task1_queries_train.json",
    )
)

In [None]:
print(qrels_train.head())

In [None]:
print(queries_train.head())

            qid      query
0   qid_train_0  testament
1   qid_train_1      steps
2  qid_train_10    faculty
3  qid_train_11      death
4   qid_train_2       vein


In [None]:
import pandas as pd
import json

In [None]:
# Parse the three Task 1 files as raw JSON objects. The names are relative, so
# they resolve against the current working directory.
task1_raw = []
for json_name in (
    'joker_2024_task1_qrels_train.json',
    'joker_2024_task1_corpus.json',
    'joker_2024_task1_queries_train.json',
):
    with open(json_name, 'r') as file:
        task1_raw.append(json.load(file))

qrels, corpus, train = task1_raw

In [None]:
# Wrap each parsed JSON object (qrels, corpus, training queries) in a DataFrame.
data_qrels, data_corpus, data_train = (
    pd.DataFrame(raw_records) for raw_records in (qrels, corpus, train)
)

In [None]:
# Attach the document to every relevance judgement (join on docid), then attach
# the query (join on qid).
qrels_with_docs = data_qrels.merge(data_corpus, on='docid')
data_merged = qrels_with_docs.merge(data_train, on='qid')

TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer()

In [None]:
data_merged

In [None]:
# One text field per (query, joke) pair: the query, a space, then the joke text.
query_and_joke = data_merged['query'] + " " + data_merged['text']
data_merged['text_all'] = query_and_joke

# Learn the TF-IDF vocabulary on the combined text and vectorise it.
tfidf_matrix = tfidf_vectorizer.fit_transform(query_and_joke)

In [None]:
# Labels are the relevance judgements; features are TF-IDF vectors of the
# combined text (this fits the vectorizer on that column again).
y_train = data_merged['qrel']
X_train = tfidf_vectorizer.fit_transform(data_merged['text_all'])

In [None]:
from sklearn.linear_model import LogisticRegression

# Default logistic regression fitted on the TF-IDF features. fit() hands back
# the estimator itself, so `trained_model` and `model` are the same object.
model = LogisticRegression()
model.fit(X_train, y_train)
trained_model = model

In [None]:
# Fits `model` on X_train / y_train again; the returned estimator is the value
# this cell displays.
model.fit(X_train, y_train)

In [None]:
# Parse the Task 1 test queries as a raw JSON object.
test_queries_path = '/content/drive/MyDrive/BIP/JOKER/Task 1 - retrieval/joker_2024_task1_queries_test.json'
with open(test_queries_path, 'r') as file:
    test_queries = json.load(file)

In [None]:
data_test_queries = pd.DataFrame(test_queries)
print(data_test_queries.head())

           qid      query
0   qid_test_0      koala
1   qid_test_1      music
2  qid_test_10   children
3  qid_test_11       milk
4  qid_test_12  moonlight


In [None]:
#Train and Test split
from sklearn.model_selection import train_test_split

# Labels: one relevance judgement per (query, joke) pair.
y = data_merged['qrel']
# Features: TF-IDF of the combined query + joke text (fits the shared
# vectorizer on that column again).
X = tfidf_vectorizer.fit_transform(data_merged['text_all'])

# Hold out 20% of the pairs for evaluation, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LogisticRegression

# Fresh logistic regression fitted on the training part of the split. fit()
# returns the estimator, so `trained_model` is just a second name for `model`.
model = LogisticRegression()
model.fit(X_train, y_train)
trained_model = model


In [None]:
# Make predictions
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report

# Summary scores; precision / recall / F1 use the 'weighted' average.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

# Print the summary numbers in a fixed order.
for metric_line in (
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
):
    print(metric_line)

# Per-class breakdown.
print(classification_report(y_test, y_pred))

Accuracy: 0.8849372384937239
Precision: 0.8907761432605401
Recall: 0.8849372384937239
F1 Score: 0.8756221070743102
              precision    recall  f1-score   support

           0       0.87      0.99      0.93       354
           1       0.94      0.60      0.73       124

    accuracy                           0.88       478
   macro avg       0.91      0.79      0.83       478
weighted avg       0.89      0.88      0.88       478



In [None]:
# Rank the whole corpus for every test query and write a single run file.
#
# Sorting and appending to `results` happen INSIDE the query loop, so the run
# file holds one ranking per query instead of only the ranking of the last
# query that was scored.
# NOTE(review): `model` and `tfidf_vectorizer` are whatever the cells above last
# bound them to (the classifier fitted on the training part of the split) —
# confirm that this is the model intended for the submission.

# Optional cap on the number of ranked jokes written per query; None keeps all.
# NOTE(review): a full ranking of the corpus is written for every query — set a
# cap here if the submission format limits results per query (TODO confirm).
max_results_per_query = None

# Jokes without text cannot be scored; filter them once instead of per query.
scorable_jokes = data_corpus[data_corpus['text'].notna()]
joke_ids = scorable_jokes['docid'].tolist()
joke_texts = scorable_jokes['text'].tolist()

results = []
for _, test_query in data_test_queries.iterrows():
    query_id = test_query['qid']
    query_text = test_query['query']

    # Nothing to rank when the corpus has no usable text.
    if not joke_texts:
        continue

    # Same "query + space + joke" layout the classifier was trained on. The
    # corpus is vectorised and scored in one call rather than joke by joke.
    paired_texts = [query_text + " " + joke_text for joke_text in joke_texts]
    relevance = model.predict_proba(tfidf_vectorizer.transform(paired_texts))[:, 1]

    scores = [
        {'docid': docid, 'score': float(probability)}
        for docid, probability in zip(joke_ids, relevance)
    ]
    # Most relevant joke first.
    scores.sort(key=lambda x: x['score'], reverse=True)

    # Ranks restart at 1 for every query.
    for rank, score_info in enumerate(scores[:max_results_per_query], start=1):
        results.append({
            'run_id': "Tomislav&Rowan_task_1_TFIDF",
            'manual': 0,
            'rank': rank,
            'score': score_info['score'],
            'docid': score_info['docid'],
            'qid': query_id
        })

with open('result_joker_task_1.json', 'w') as outfile:
    json.dump(results, outfile, indent=4)

## Task 2: Humour classification

In [None]:
pwd

In [None]:
%cd /content/drive/MyDrive/JOKER/JOKER/Task 2 - classification

In [None]:
ls

In [None]:
import pandas as pd

In [None]:
# Read the Task 2 test set, training texts and training labels (in that order)
# straight into DataFrames.
classification_test_data, classification_train_input_data, classification_train_qrels_data = (
    pd.read_json(json_path)
    for json_path in (
        "/content/drive/MyDrive/BIP/JOKER/Task 2 - classification/joker-2024-task2-classification-test.json",
        "/content/drive/MyDrive/BIP/JOKER/Task 2 - classification/joker-2024-task2-classification-train-input.json",
        "/content/drive/MyDrive/BIP/JOKER/Task 2 - classification/joker-2024-task2-classification-train-qrels.json",
    )
)

In [None]:
print(classification_test_data.head())

In [None]:
print(classification_train_input_data.head())

In [None]:
print(classification_train_qrels_data.head())

     id class
0  1162    SC
1   448    EX
2  1280    SC
3  1216    SC
4  1872    WS


In [None]:
# Attach the gold class to each TRAINING text. The left side must be the
# training input: joining the test file with the training labels would pair
# texts with classes that were assigned to other data.
df_merged_train_data = pd.merge(classification_train_input_data, classification_train_qrels_data, on='id')

In [None]:
print(df_merged_train_data.head())

   id                                               text class
0   2  My life’s purpose is to be a cautionary tale f...    SD
1   4  “Today is not my day,” I mutter to myself ever...    SD
2   5            My teacher called me average. How mean!    SD
3   6  My entire life is a big joke. So, tell why exa...    SD
4  10   How do I moisturize my face? I use my own tears!    SD


In [None]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [None]:
# Parse the Task 2 training texts and their labels as raw JSON, then wrap each
# in a DataFrame.
train_input_path = '/content/drive/MyDrive/BIP/JOKER/Task 2 - classification/joker-2024-task2-classification-train-input.json'
train_qrels_path = '/content/drive/MyDrive/BIP/JOKER/Task 2 - classification/joker-2024-task2-classification-train-qrels.json'

with open(train_input_path, 'r') as file:
    train_input = json.load(file)
with open(train_qrels_path, 'r') as file:
    train_qrels = json.load(file)

data_train_input = pd.DataFrame(train_input)
df_train_qrels = pd.DataFrame(train_qrels)

In [None]:
# Pair every training text with its label by joining on `id`, then display it.
df_train = data_train_input.merge(df_train_qrels, on='id')
df_train

In [None]:
!pip install contractions

In [None]:
# Preprocessing function
from nltk.stem import WordNetLemmatizer
import contractions
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords

lem = WordNetLemmatizer()

# Built once at definition time; rebuilding the set for every token is slow.
_STOP_WORDS = frozenset(stopwords.words("english"))
# http(s):// links and www.-prefixed hosts, up to the next whitespace.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def preprocess_text(text):
    """Normalise one text for TF-IDF vectorisation.

    Steps, in order: expand contractions ("I'm" -> "I am"), lower-case, remove
    URLs, drop every character that is not a-z or a space, split on whitespace,
    remove English stop words and lemmatise the remaining tokens.

    Args:
        text: The raw text; it is passed through ``str()`` first, so
            non-string values are accepted.

    Returns:
        The surviving lemmatised tokens joined by single spaces (an empty
        string when nothing survives).
    """
    sms = contractions.fix(str(text))
    sms = sms.lower()
    # URLs are removed before the symbol filter; otherwise their letters would
    # be glued together into a junk token.
    sms = _URL_PATTERN.sub("", sms).strip()
    # Keep only a-z and spaces (drops digits, punctuation, non-ASCII letters).
    sms = re.sub("[^a-z ]", "", sms)
    tokens = sms.split()
    return " ".join(lem.lemmatize(word) for word in tokens if word not in _STOP_WORDS)


X = df_train["text"].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
X

In [None]:
# Store the preprocessed version of every training text in a `clean_text` column.
df_train['clean_text'] = df_train['text'].apply(preprocess_text)

In [None]:
#adding for classification report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Hold-out evaluation of a TF-IDF + logistic regression baseline.
# Inputs: df_train['text'] (raw text) and df_train['class'] (labels).

# Normalise the raw text.
df_train['clean_text'] = df_train['text'].apply(preprocess_text)

# Map class names to integer ids; the names stay in label_encoder.classes_.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['class'])

# Vectorise all cleaned texts, then split features and labels together.
# Note that y_train is rebound here to the training part of the split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(df_train['clean_text'])
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf, y_train, test_size=0.2, random_state=42
)

# Fit on the training part and predict the held-out 20%.
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)
y_pred = logistic_regression_model.predict(X_test)

# Per-class report labelled with the original class names.
report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(report)

              precision    recall  f1-score   support

         AID       0.77      0.21      0.33        47
          EX       1.00      0.03      0.05        38
          IR       0.25      0.05      0.08        41
          SC       0.51      0.29      0.37        79
          SD       0.80      0.12      0.22        32
          WS       0.38      0.94      0.54       112

    accuracy                           0.42       349
   macro avg       0.62      0.27      0.27       349
weighted avg       0.55      0.42      0.34       349



In [None]:
#NAIVE BAYES
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Hold-out evaluation of a TF-IDF + Multinomial Naive Bayes baseline.
# Inputs: df_train['text'] (raw text) and df_train['class'] (labels).

# Preprocess the text
df_train['clean_text'] = df_train['text'].apply(preprocess_text)

# Encode labels (class names stay available in label_encoder.classes_)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['class'])

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(df_train['clean_text'])

# Split the data; y_train is rebound to the training part of the split
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

# Train Multinomial Naive Bayes model
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)

# Make predictions
y_pred = naive_bayes_model.predict(X_test)

# Some classes receive no predictions at all, which leaves their precision
# undefined. zero_division=0 reports those as 0 explicitly (same convention as
# the SVC report) instead of emitting an undefined-metric warning per average.
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0)
print(report)

              precision    recall  f1-score   support

         AID       0.80      0.09      0.15        47
          EX       0.00      0.00      0.00        38
          IR       0.00      0.00      0.00        41
          SC       0.39      0.14      0.21        79
          SD       0.00      0.00      0.00        32
          WS       0.34      0.96      0.50       112

    accuracy                           0.35       349
   macro avg       0.26      0.20      0.14       349
weighted avg       0.31      0.35      0.23       349



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Learn the class-name -> integer mapping from every training label, then
# encode those labels with it.
label_encoder = LabelEncoder()
label_encoder.fit(df_train['class'])
y_train = label_encoder.transform(df_train['class'])

In [None]:
# Fit a fresh TF-IDF vectorizer on every cleaned training text and keep the
# resulting feature matrix.
clean_training_texts = df_train['clean_text']
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(clean_training_texts)

In [None]:
# Logistic regression fitted on the full TF-IDF matrix; fit() returns the
# estimator, which is then shown as the cell's value.
logistic_regression_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
logistic_regression_model

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes fitted on the full TF-IDF matrix; fit() returns the
# estimator, which is then shown as the cell's value.
naive_bayes_model = MultinomialNB().fit(X_train_tfidf, y_train)
naive_bayes_model

In [None]:
# Parse the Task 2 test set as a raw JSON object.
task2_test_path = '/content/drive/MyDrive/BIP/JOKER/Task 2 - classification/joker-2024-task2-classification-test.json'
with open(task2_test_path, 'r') as file:
    test_data = json.load(file)

In [None]:
df_test = pd.DataFrame(test_data)

In [None]:
# Test frame with an added column holding the preprocessed text.
df_bayes_test = pd.DataFrame(test_data).assign(
    clean_text=lambda frame: frame['text'].apply(preprocess_text)
)

# Transform only: the already-fitted vectorizer is reused, not refitted.
X_test_tfidf = tfidf_vectorizer.transform(df_bayes_test['clean_text'])

# Encoded Naive Bayes predictions, then the matching class names.
bayes_predictions = naive_bayes_model.predict(X_test_tfidf)
bayes_predicted_classes = label_encoder.inverse_transform(bayes_predictions)

In [None]:
# Add the preprocessed text to the test frame.
cleaned_test_texts = df_test['text'].apply(preprocess_text)
df_test['clean_text'] = cleaned_test_texts

# Transform only: the already-fitted vectorizer is reused, not refitted.
X_test_tfidf = tfidf_vectorizer.transform(cleaned_test_texts)

In [None]:
# Encoded class predictions of the logistic regression model for the test
# features.
test_predictions = logistic_regression_model.predict(X_test_tfidf)

# Map the encoded predictions back to the original class names.
predicted_classes = label_encoder.inverse_transform(test_predictions)

In [None]:
# One submission record per test entry, pairing each id with the Naive Bayes
# class predicted at the same position.
results = [
    {
        "run_id": "Tomislav&Rowan_task_2_NaiveBayes",
        "manual": 0,
        "id": entry["id"],
        "class": predicted_class,
    }
    for entry, predicted_class in zip(test_data, bayes_predicted_classes)
]

# Write the run as indented JSON.
with open('result_task_2_naive_bayes.json', 'w', encoding='utf-8') as outfile:
    json.dump(results, outfile, indent=4)

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM fitted on the full TF-IDF matrix (other kernels such as
# 'poly' or 'rbf' can be selected through `kernel`); fit() returns the
# estimator, which is then shown as the cell's value.
svm_model = SVC(kernel='linear').fit(X_train_tfidf, y_train)
svm_model

In [None]:
# SVC classification report on a held-out split, followed by a refit of
# `svm_model` on all labelled data.
from sklearn.svm import SVC

# Encode labels for the full training set (kept separate from the split parts)
label_encoder = LabelEncoder()
y_all = label_encoder.fit_transform(df_train['class'])

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(df_train['clean_text'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, y_all, test_size=0.2, random_state=42)

# Evaluation-only model: fitted on the training part, scored on the held-out part
svm_eval_model = SVC(kernel='linear')  # other kernels: 'poly', 'rbf', etc.
svm_eval_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_eval_model.predict(X_test)

# Calculate and print the classification report
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0)
print(report)

# `svm_model` is the estimator later cells use to label the test set. Refit it
# on every labelled example so the submission model is not limited to the 80%
# training part of the evaluation split.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_all)

              precision    recall  f1-score   support

         AID       0.65      0.32      0.43        47
          EX       0.50      0.08      0.14        38
          IR       0.30      0.20      0.24        41
          SC       0.46      0.46      0.46        79
          SD       0.80      0.25      0.38        32
          WS       0.47      0.86      0.61       112

    accuracy                           0.48       349
   macro avg       0.53      0.36      0.37       349
weighted avg       0.50      0.48      0.43       349



In [None]:
# Test frame with an added column holding the preprocessed text.
df_svc_test = pd.DataFrame(test_data).assign(
    clean_text=lambda frame: frame['text'].apply(preprocess_text)
)

# Transform only: the already-fitted vectorizer is reused, not refitted.
X_test_tfidf = tfidf_vectorizer.transform(df_svc_test['clean_text'])

# Encoded SVM predictions, then the matching class names.
svc_predictions = svm_model.predict(X_test_tfidf)
svm_predicted_classes = label_encoder.inverse_transform(svc_predictions)

In [None]:
# One submission record per test entry, pairing each id with the SVM class
# predicted at the same position.
results = [
    {
        "run_id": "Tomislav&Rowan_task_2_SVM",
        "manual": 0,
        "id": entry["id"],
        "class": predicted_class,
    }
    for entry, predicted_class in zip(test_data, svm_predicted_classes)
]

# Write the run as indented JSON.
with open('result_task_2_SVM.json', 'w', encoding='utf-8') as outfile:
    json.dump(results, outfile, indent=4)

## Task 3: Translation of puns from English to French

In [None]:
pwd

In [None]:
%cd "/content/drive/MyDrive/BIP/JOKER/Task 3 - translation/EN-FR-train"

In [None]:
ls

In [None]:
# Read the Task 3 training input, test set and training qrels (in that order)
# straight into DataFrames.
translation_EN_FR_train_input, task3_2024_test, translation_EN_FR_train_qrels = (
    pd.read_json(json_path)
    for json_path in (
        "/content/drive/MyDrive/BIP/JOKER/Task 3 - translation/EN-FR-train/joker_translation_EN-FR_train_input.json",
        "/content/drive/MyDrive/BIP/JOKER/Task 3 - translation/EN-FR-train/joker_task3_2024_test.json",
        "/content/drive/MyDrive/BIP/JOKER/Task 3 - translation/EN-FR-train/joker_translation_EN-FR_train_qrels.json",
    )
)

In [None]:
print(translation_EN_FR_train_input.head())

In [None]:
print(task3_2024_test.head())

In [None]:
print(translation_EN_FR_train_qrels.head())

In [None]:
# Join the training input with the training qrels on the English id.
df_merged_translate = translation_EN_FR_train_input.merge(
    translation_EN_FR_train_qrels,
    on='id_en',
)

In [None]:
print(df_merged_translate.tail())

In [None]:
import pandas as pd
import json

# Parse the Task 3 training input and qrels as raw JSON (names are relative to
# the current working directory).
with open('joker_translation_EN-FR_train_input.json', 'r') as file:
    train_input = json.load(file)
with open('joker_translation_EN-FR_train_qrels.json', 'r') as file:
    train_qrels = json.load(file)

df_train_input = pd.DataFrame(train_input)
df_train_qrels = pd.DataFrame(train_qrels)

# Join the training input with the training qrels on the English id.
df_train = df_train_input.merge(df_train_qrels, on='id_en')

In [None]:
print(df_train.tail())

In [None]:
!pip install -U easynmt

In [None]:
from easynmt import EasyNMT
model = EasyNMT('opus-mt')

In [None]:
result = model.translate(df_train['text_en'][0], target_lang='fr')

In [None]:
print(result)

In [None]:
# Parse the Task 3 test set as a raw JSON object (name is relative to the
# current working directory).
task3_test_name = 'joker_task3_2024_test.json'
with open(task3_test_name, 'r') as file:
    test_data = json.load(file)

In [None]:
df_test_data = pd.DataFrame(test_data)
df_test_data

In [None]:
df_test_data

In [None]:
# Translate every test joke from English to French and build one submission
# record per joke.
#
# Iterating a DataFrame directly yields its column names, not its rows, so the
# ids and texts are pulled out as plain lists and paired positionally.
source_ids = df_test_data['id_en'].tolist()
source_texts = df_test_data['text_en'].tolist()

# Hand the whole list to the model once, so each joke is translated one time.
translations = (
    model.translate(source_texts, source_lang='en', target_lang='fr')
    if source_texts
    else []
)

# NOTE(review): this run_id names a different team than the other run files in
# this notebook — confirm which identifier the submission should carry.
results = [
    {
        'run_id': "team1_Petra_and_Regina_task_3_TranslationModel",
        'manual': 0,
        'id_en': id_en,
        'text_fr': text_fr
    }
    for id_en, text_fr in zip(source_ids, translations)
]

In [None]:
# Serialise the collected records as indented JSON and write them out.
serialized_results = json.dumps(results, indent=4)
with open('results_task_3.json', 'w') as outfile:
    outfile.write(serialized_results)

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Pre-trained MarianMT English -> French checkpoint and its tokenizer.
# `model` is rebound here to the MarianMT model.
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Smoke test on a single sentence: tokenize, generate, decode.
input_text = "Translate this text to French."
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs)

# The first generated sequence is decoded with special tokens stripped.
translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Translated text:", translated_text)


In [None]:
# Requires the test data to be loaded in the DataFrame df_test_data.
# Translates one row at a time and collects one submission record per row.
results = []

for _, row in df_test_data.iterrows():
    # Tokenize the English text, generate, and decode the first sequence.
    encoded = tokenizer(row['text_en'], return_tensors="pt", padding=True)
    translation = model.generate(**encoded)
    translated_text = tokenizer.decode(translation[0], skip_special_tokens=True)

    results.append(dict(
        run_id="Tomislav&Rowan_task_3_MarianMTModel",
        manual=0,
        id_en=row['id_en'],
        text_fr=translated_text,
    ))

# Tabular view of the collected records.
translated_df = pd.DataFrame(results)
print(translated_df)


In [None]:
# Serialise the MarianMT records as indented JSON and write them out.
marian_results_json = json.dumps(results, indent=4)
with open('results_all_task_3_MarianMTModel.json', 'w') as outfile:
    outfile.write(marian_results_json)