I will improve the comparison of labels on which the mapping will be based, by a more automated approach (the initial X-GENRE mapping is based on my manual analysis).

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import apyori

## Merging the datasets with predictions into one

In [3]:
# I will create a "master" sheet with predictions that includes GINCO, FTD and CORE sheets.

# Open the sheets
FTD_df = pd.read_csv("data-sheets-with-all-info/FTD-dataset-with-all-information.csv", sep="\t", index_col = 0)

In [4]:
FTD_df.head(1)

Unnamed: 0,ID,labels,Multiple labels,text,length,GINCO_downcast_pred,MT-GINCO_downcast_pred,CORE_main_pred,CORE_sub_pred,labels_to_X-GENRE
0,__id__1-syndicate,A1 (argumentative),,BMW's and Chinese Justice * * * * * In most pl...,975,News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded


In [6]:
FTD_df.describe(include="all")

Unnamed: 0,ID,labels,Multiple labels,text,length,GINCO_downcast_pred,MT-GINCO_downcast_pred,CORE_main_pred,CORE_sub_pred,labels_to_X-GENRE
count,1553,1553,139,1553,1553.0,1553,1553,1553,1553,1414
unique,1553,43,1,1553,,9,9,8,22,8
top,__id__1-syndicate,A1 (argumentative),y,BMW's and Chinese Justice * * * * * In most pl...,,Information/Explanation,Information/Explanation,Informational Description/Explanation,Description of a Thing,discarded
freq,1,296,139,1,,437,328,856,419,364
mean,,,,,1430.271732,,,,,
std,,,,,4789.884538,,,,,
min,,,,,31.0,,,,,
25%,,,,,235.0,,,,,
50%,,,,,531.0,,,,,
75%,,,,,1203.0,,,,,


In [7]:
# Filter out multiple texts
FTD_df = FTD_df[FTD_df["Multiple labels"] != "y"]

FTD_df.describe(include="all")

Unnamed: 0,ID,labels,Multiple labels,text,length,GINCO_downcast_pred,MT-GINCO_downcast_pred,CORE_main_pred,CORE_sub_pred,labels_to_X-GENRE
count,1414,1414,0.0,1414,1414.0,1414,1414,1414,1414,1414
unique,1414,10,0.0,1414,,9,9,8,22,8
top,__id__1-syndicate,A1 (argumentative),,BMW's and Chinese Justice * * * * * In most pl...,,Information/Explanation,Information/Explanation,Informational Description/Explanation,Description of a Thing,discarded
freq,1,296,,1,,407,311,772,382,364
mean,,,,,1445.212164,,,,,
std,,,,,4989.570842,,,,,
min,,,,,31.0,,,,,
25%,,,,,224.0,,,,,
50%,,,,,495.0,,,,,
75%,,,,,1144.25,,,,,


In [9]:
# Keep only relevant columns
FTD_df = FTD_df[['labels', 'text','GINCO_downcast_pred', 'MT-GINCO_downcast_pred', 'CORE_main_pred',
       'CORE_sub_pred', 'labels_to_X-GENRE']]

# Rename some of the columns
FTD_df.columns = ['FTD', 'text','GINCO_downcast', 'MT-GINCO_downcast', 'CORE_main',
       'CORE_sub', 'X-GENRE']

FTD_df.head(2)

Unnamed: 0,FTD,text,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE
0,A1 (argumentative),BMW's and Chinese Justice * * * * * In most pl...,News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded
1,A1 (argumentative),China and a New Balance of Power SHANGHAI – Th...,Information/Explanation,Opinion/Argumentation,Opinion,Opinion Blog,discarded


In [10]:
# Add a column with information that this is originally from FTD
FTD_df["original_label"] = "FTD"

FTD_df.head(1)

Unnamed: 0,FTD,text,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE,original_label
0,A1 (argumentative),BMW's and Chinese Justice * * * * * In most pl...,News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded,FTD


In [21]:
FTD_df.columns

Index(['FTD', 'text', 'GINCO_downcast', 'MT-GINCO_downcast', 'CORE_main',
       'CORE_sub', 'X-GENRE', 'original_label'],
      dtype='object')

In [22]:
# Reorder labels
FTD_df = FTD_df[['original_label', 'text', 'FTD', 'GINCO_downcast', 'MT-GINCO_downcast', 'CORE_main',
       'CORE_sub', 'X-GENRE']]
FTD_df.head(1)

Unnamed: 0,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE
0,FTD,BMW's and Chinese Justice * * * * * In most pl...,A1 (argumentative),News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded


In [29]:
FTD_df.columns

Index(['original_label', 'text', 'FTD', 'GINCO_downcast', 'MT-GINCO_downcast',
       'CORE_main', 'CORE_sub', 'X-GENRE'],
      dtype='object')

In [78]:
# Open the GINCO dataset
GINCO_df = pd.read_csv("data-sheets-with-all-info/GINCO-MT-GINCO-keeptext-with-all-information.csv", sep="\t", index_col = 0)
GINCO_df.head(1)

Unnamed: 0,id,url,crawled,hard,primary_level_1,primary_level_2,primary_level_3,secondary_level_1,secondary_level_2,secondary_level_3,...,FTD_pred_on_SL,FTD_pred_on_MT,split-without-rare-categories,primary_level_4,downcast_split,CORE_main_pred_on_SL,CORE_main_pred_on_MT,CORE_sub_pred_on_SL,CORE_sub_pred_on_MT,primary_level_1_to_X-GENRE
0,3949,http://www.pomurje.si/aktualno/sport/zimska-li...,2014,False,News/Reporting,News/Reporting,News/Reporting,,,,...,A8 (news),A8 (news),test,News/Reporting,test,Narrative,Narrative,Sports Report,Sports Report,News


In [79]:
GINCO_df.columns

Index(['id', 'url', 'crawled', 'hard', 'primary_level_1', 'primary_level_2',
       'primary_level_3', 'secondary_level_1', 'secondary_level_2',
       'secondary_level_3', 'tertiary_level_1', 'tertiary_level_2',
       'tertiary_level_3', 'split', 'domain', 'GINCORE', 'Slovene_text',
       'MT_text', 'text_length', 'FTD_pred_on_SL', 'FTD_pred_on_MT',
       'split-without-rare-categories', 'primary_level_4', 'downcast_split',
       'CORE_main_pred_on_SL', 'CORE_main_pred_on_MT', 'CORE_sub_pred_on_SL',
       'CORE_sub_pred_on_MT', 'primary_level_1_to_X-GENRE'],
      dtype='object')

In [80]:
# Keep only relevant columns
GINCO_df = GINCO_df[['primary_level_4', 'Slovene_text', 'MT_text', 'FTD_pred_on_SL', 'FTD_pred_on_MT',
       'CORE_main_pred_on_SL', 'CORE_main_pred_on_MT', 'CORE_sub_pred_on_SL',
       'CORE_sub_pred_on_MT', 'primary_level_1_to_X-GENRE']]

GINCO_df.head(1)

Unnamed: 0,primary_level_4,Slovene_text,MT_text,FTD_pred_on_SL,FTD_pred_on_MT,CORE_main_pred_on_SL,CORE_main_pred_on_MT,CORE_sub_pred_on_SL,CORE_sub_pred_on_MT,primary_level_1_to_X-GENRE
0,News/Reporting,"Šport <p/> Zimska liga malega nogometa sobota,...",Sport <p/> Winter Little League Football Satur...,A8 (news),A8 (news),Narrative,Narrative,Sports Report,Sports Report,News


In [81]:
GINCO_df.columns

Index(['primary_level_4', 'Slovene_text', 'MT_text', 'FTD_pred_on_SL',
       'FTD_pred_on_MT', 'CORE_main_pred_on_SL', 'CORE_main_pred_on_MT',
       'CORE_sub_pred_on_SL', 'CORE_sub_pred_on_MT',
       'primary_level_1_to_X-GENRE'],
      dtype='object')

In [82]:
# Separate SI-GINCO and MT-GINCO into two dataframes
SIGINCO_df = GINCO_df[['primary_level_4', 'Slovene_text', 'FTD_pred_on_SL', 'CORE_main_pred_on_SL', 'CORE_sub_pred_on_SL', 'primary_level_1_to_X-GENRE']]

MTGINCO_df = GINCO_df[['primary_level_4', 'MT_text', 'FTD_pred_on_MT', 'CORE_main_pred_on_MT', 'CORE_sub_pred_on_MT',
       'primary_level_1_to_X-GENRE']]

MTGINCO_df.head(1)

Unnamed: 0,primary_level_4,MT_text,FTD_pred_on_MT,CORE_main_pred_on_MT,CORE_sub_pred_on_MT,primary_level_1_to_X-GENRE
0,News/Reporting,Sport <p/> Winter Little League Football Satur...,A8 (news),Narrative,Sports Report,News


In [83]:
print(MTGINCO_df.columns)

Index(['primary_level_4', 'MT_text', 'FTD_pred_on_MT', 'CORE_main_pred_on_MT',
       'CORE_sub_pred_on_MT', 'primary_level_1_to_X-GENRE'],
      dtype='object')


In [84]:
# Rename columns
SIGINCO_df.columns = ['GINCO_downcast', 'text', 'FTD', 'CORE_main', 'CORE_sub','X-GENRE']

MTGINCO_df.columns = ['MT-GINCO_downcast', 'text', 'FTD','CORE_main', 'CORE_sub','X-GENRE']

SIGINCO_df.head(1)

Unnamed: 0,GINCO_downcast,text,FTD,CORE_main,CORE_sub,X-GENRE
0,News/Reporting,"Šport <p/> Zimska liga malega nogometa sobota,...",A8 (news),Narrative,Sports Report,News


In [85]:
# Add column about the original label
SIGINCO_df['original_label'] = "SI-GINCO"
MTGINCO_df['original_label'] = "MT-GINCO"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SIGINCO_df['original_label'] = "SI-GINCO"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MTGINCO_df['original_label'] = "MT-GINCO"


In [86]:
MTGINCO_df.describe(include="all")

Unnamed: 0,MT-GINCO_downcast,text,FTD,CORE_main,CORE_sub,X-GENRE,original_label
count,1002,1002,1002,1002,1002,1002,1002
unique,9,1002,10,8,23,10,1
top,News/Reporting,Sport <p/> Winter Little League Football Satur...,A12 (promotion),Informational Description/Explanation,Description of a Thing,News,MT-GINCO
freq,221,1,287,506,317,221,1002


In [49]:
# Open the CORE df
CORE_df = pd.read_csv("data-sheets-with-all-info/CORE-all-information.csv", sep="\t", index_col = 0)

CORE_df.head(1)

Unnamed: 0,label,text,split,main_labels,sublabels,Len,main_len,sub_len,GINCORE,full_names,main_labels_full_names,FTD_pred,GINCO_downcast_pred,MT-GINCO_downcast_pred,subcategories_to_X-GENRE
0,NA OP SR OB,The Top TEN 'Whiniest Sets of Fans' in English...,train,NA OP,SR OB,4,2,2,NA OP SR OB,SR OB,NA OP,,,,check


In [50]:
# GINCO labels were predicted only on a sample - leave in the dataframe only instances with predictions by all classifiers
CORE_df.dropna(subset=["GINCO_downcast_pred"],inplace=True)
CORE_df.describe(include="all")

Unnamed: 0,label,text,split,main_labels,sublabels,Len,main_len,sub_len,GINCORE,full_names,main_labels_full_names,FTD_pred,GINCO_downcast_pred,MT-GINCO_downcast_pred,subcategories_to_X-GENRE
count,1500,1500,1500,874,1500,1500.0,1500.0,1500.0,1500,1500,1500,1500,1500,1500,1500
unique,77,1500,3,19,43,,,,49,43,20,10,9,9,10
top,NA NE,"Welcome, one and all, to TheCheckingLine.com, ...",train,OP,NE,,,,News,News Report/Blog,Narrative,A1 (argumentative),News/Reporting,News/Reporting,News
freq,379,1,1030,270,395,,,,484,395,626,419,481,481,501
mean,,,,,,2.052,1.052,1.0,,,,,,,
std,,,,,,0.222101,0.222101,0.0,,,,,,,
min,,,,,,2.0,1.0,1.0,,,,,,,
25%,,,,,,2.0,1.0,1.0,,,,,,,
50%,,,,,,2.0,1.0,1.0,,,,,,,
75%,,,,,,2.0,1.0,1.0,,,,,,,


In [51]:
CORE_df.columns

Index(['label', 'text', 'split', 'main_labels', 'sublabels', 'Len', 'main_len',
       'sub_len', 'GINCORE', 'full_names', 'main_labels_full_names',
       'FTD_pred', 'GINCO_downcast_pred', 'MT-GINCO_downcast_pred',
       'subcategories_to_X-GENRE'],
      dtype='object')

In [52]:
# Leave only relevant columns
CORE_df = CORE_df[['text','full_names', 'main_labels_full_names','FTD_pred', 'GINCO_downcast_pred', 'MT-GINCO_downcast_pred', 'subcategories_to_X-GENRE']]
CORE_df.head(1)

Unnamed: 0,text,full_names,main_labels_full_names,FTD_pred,GINCO_downcast_pred,MT-GINCO_downcast_pred,subcategories_to_X-GENRE
14,"Welcome, one and all, to TheCheckingLine.com, ...",Sports Report,Narrative,A1 (argumentative),Opinion/Argumentation,Opinion/Argumentation,News


In [53]:
# Rename the columns
CORE_df.columns = ['text','CORE_sub', 'CORE_main','FTD', 'GINCO_downcast', 'MT-GINCO_downcast', 'X-GENRE']

CORE_df.head(1)

Unnamed: 0,text,CORE_sub,CORE_main,FTD,GINCO_downcast,MT-GINCO_downcast,X-GENRE
14,"Welcome, one and all, to TheCheckingLine.com, ...",Sports Report,Narrative,A1 (argumentative),Opinion/Argumentation,Opinion/Argumentation,News


In [54]:
# Add the column about the original schema
CORE_df['original_label'] = "CORE"
CORE_df.head(1)

Unnamed: 0,text,CORE_sub,CORE_main,FTD,GINCO_downcast,MT-GINCO_downcast,X-GENRE,original_label
14,"Welcome, one and all, to TheCheckingLine.com, ...",Sports Report,Narrative,A1 (argumentative),Opinion/Argumentation,Opinion/Argumentation,News,CORE


In [62]:
SIGINCO_df.shape, FTD_df.shape, MTGINCO_df.shape, CORE_df.shape

((1002, 8), (1414, 8), (1002, 8), (1500, 8))

In [87]:
# Merge the datasets
merge_temp = pd.concat([FTD_df, SIGINCO_df])
merge_temp = pd.concat([merge_temp, MTGINCO_df])
merged_df = pd.concat([merge_temp, CORE_df])

merged_df.describe(include="all")

Unnamed: 0,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE
count,4918,4918,4918,3916,3916,4918,4918,4918
unique,4,4918,10,9,9,20,43,10
top,CORE,BMW's and Chinese Justice * * * * * In most pl...,A12 (promotion),News/Reporting,News/Reporting,Informational Description/Explanation,Description of a Thing,News
freq,1500,1,1011,933,951,2094,1167,1079


In [90]:
# Reset the index
merged_df = merged_df.reset_index()
merged_df.tail()

Unnamed: 0,index,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE
4913,48378,CORE,Bringing You Great Ideas From The Business Wor...,A1 (argumentative),Information/Explanation,Information/Explanation,Opinion,Opinion Blog,Opinion/Argumentation
4914,48403,CORE,"In the hills outside Islamabad in Pakistan, 16...",A1 (argumentative),News/Reporting,News/Reporting,Narrative,News Report/Blog,News
4915,48405,CORE,Who Cares About the Big Bang? Senator Marco Ru...,A1 (argumentative),News/Reporting,News/Reporting,Opinion,Opinion Blog,Opinion/Argumentation
4916,48419,CORE,McCabe: There was no 'toff strategy' The MP be...,A8 (news),News/Reporting,News/Reporting,Narrative,News Report/Blog,News
4917,48423,CORE,Copeland looking to make statement Miserly for...,A8 (news),News/Reporting,News/Reporting,Narrative,News Report/Blog,News


In [91]:
# Save the merged dataset
merged_df.to_csv("data-sheets-with-all-info/Merged-datasets-with-all-predictions.csv", sep="\t")

## Comparison of predictions

To analyse how the predictions are connected, I will take each of the datasets and analyse for each of their original labels whether they are consistently predicted with a label from another schema. Then I will compare them in the other direction - by grouping the predictions of this classifier on all other datasets and seeing whether they are connected well with other labels.

In [2]:
# Open the dataset
merged_df = pd.read_csv("data-sheets-with-all-info/Merged-datasets-with-all-predictions.csv", sep="\t", index_col=0)

In [3]:
merged_df.columns

Index(['index', 'original_label', 'text', 'FTD', 'GINCO_downcast',
       'MT-GINCO_downcast', 'CORE_main', 'CORE_sub', 'X-GENRE'],
      dtype='object')

In [4]:
merged_df.head(1)

Unnamed: 0,index,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE
0,0,FTD,BMW's and Chinese Justice * * * * * In most pl...,A1 (argumentative),News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded


In [102]:
# Count for each pair of schemata in how many times they occur together
combinations = [['FTD', 'GINCO_downcast'], ['FTD', 'MT-GINCO_downcast'], ['FTD', 'CORE_main'], ['FTD', 'CORE_sub'], ['GINCO_downcast', 'MT-GINCO_downcast'], ['GINCO_downcast', 'CORE_main'],['GINCO_downcast', 'CORE_sub'] , ['MT-GINCO_downcast', 'CORE_main'], ['MT-GINCO_downcast', 'CORE_sub'], ['CORE_main', 'CORE_sub'] ]

for i in combinations:
	merged_df[f"{i[0]} + {i[1]} count"] = merged_df.groupby([f"{i[0]}", f"{i[1]}"])[f"{i[0]}"].transform('size')
	merged_df[f"{i[0]} + {i[1]}"] = merged_df[f"{i[0]}"].str.cat(merged_df[f"{i[1]}"], sep = "+")

merged_df.head(1)

Unnamed: 0,index,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE,FTD + GINCO_downcast count,...,FTD + GINCO_downcast,FTD + MT-GINCO_downcast,FTD + CORE_main,FTD + CORE_sub,GINCO_downcast + MT-GINCO_downcast,GINCO_downcast + CORE_main,GINCO_downcast + CORE_sub,MT-GINCO_downcast + CORE_main,MT-GINCO_downcast + CORE_sub,CORE_main + CORE_sub
0,0,FTD,BMW's and Chinese Justice * * * * * In most pl...,A1 (argumentative),News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded,225.0,...,A1 (argumentative)+News/Reporting,A1 (argumentative)+News/Reporting,A1 (argumentative)+Narrative,A1 (argumentative)+News Report/Blog,News/Reporting+News/Reporting,News/Reporting+Narrative,News/Reporting+News Report/Blog,News/Reporting+Narrative,News/Reporting+News Report/Blog,Narrative+News Report/Blog


In [112]:
merged_df.columns

Index(['index', 'original_label', 'text', 'FTD', 'GINCO_downcast',
       'MT-GINCO_downcast', 'CORE_main', 'CORE_sub', 'X-GENRE',
       'FTD + GINCO_downcast count', 'FTD + MT-GINCO_downcast count',
       'FTD + CORE_main count', 'FTD + CORE_sub count',
       'GINCO_downcast + MT-GINCO_downcast count',
       'GINCO_downcast + CORE_main count', 'GINCO_downcast + CORE_sub count',
       'MT-GINCO_downcast + CORE_main count',
       'MT-GINCO_downcast + CORE_sub count', 'CORE_main + CORE_sub count',
       'FTD + GINCO_downcast', 'FTD + MT-GINCO_downcast', 'FTD + CORE_main',
       'FTD + CORE_sub', 'GINCO_downcast + MT-GINCO_downcast',
       'GINCO_downcast + CORE_main', 'GINCO_downcast + CORE_sub',
       'MT-GINCO_downcast + CORE_main', 'MT-GINCO_downcast + CORE_sub',
       'CORE_main + CORE_sub'],
      dtype='object')

In [113]:
# Let's concatenate all labels together

merged_df["merged_labels-CORE_sub"] = merged_df['FTD + GINCO_downcast'].str.cat(merged_df["CORE_sub"], sep = "+")
merged_df["merged_labels-CORE_main"] = merged_df['FTD + GINCO_downcast'].str.cat(merged_df["CORE_main"], sep = "+")

merged_df.head(1)

Unnamed: 0,index,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE,FTD + GINCO_downcast count,FTD + MT-GINCO_downcast count,FTD + CORE_main count,FTD + CORE_sub count,GINCO_downcast + MT-GINCO_downcast count,GINCO_downcast + CORE_main count,GINCO_downcast + CORE_sub count,MT-GINCO_downcast + CORE_main count,MT-GINCO_downcast + CORE_sub count,CORE_main + CORE_sub count,FTD + GINCO_downcast,FTD + MT-GINCO_downcast,FTD + CORE_main,FTD + CORE_sub,GINCO_downcast + MT-GINCO_downcast,GINCO_downcast + CORE_main,GINCO_downcast + CORE_sub,MT-GINCO_downcast + CORE_main,MT-GINCO_downcast + CORE_sub,CORE_main + CORE_sub,merged_labels-CORE_sub,merged_labels-CORE_main
0,0,FTD,BMW's and Chinese Justice * * * * * In most pl...,A1 (argumentative),News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded,225.0,232.0,284,253,656.0,640.0,552.0,682.0,576.0,732,A1 (argumentative)+News/Reporting,A1 (argumentative)+News/Reporting,A1 (argumentative)+Narrative,A1 (argumentative)+News Report/Blog,News/Reporting+News/Reporting,News/Reporting+Narrative,News/Reporting+News Report/Blog,News/Reporting+Narrative,News/Reporting+News Report/Blog,Narrative+News Report/Blog,A1 (argumentative)+News/Reporting+News Report/...,A1 (argumentative)+News/Reporting+Narrative


In [115]:
# Analyze distribution of merged labels
merged_df["merged_labels-CORE_sub"].value_counts(normalize="True")[:10]

A8 (news)+News/Reporting+News Report/Blog                           0.085802
A12 (promotion)+Promotion+Description of a Thing                    0.054137
A1 (argumentative)+News/Reporting+News Report/Blog                  0.038560
A12 (promotion)+Promotion+Description with Intent to Sell           0.034985
A16 (information)+Information/Explanation+Description of a Thing    0.034985
A1 (argumentative)+Opinion/Argumentation+Opinion Blog               0.031154
A8 (news)+News/Reporting+Sports Report                              0.029367
A11 (personal)+Opinion/Argumentation+Personal Blog                  0.027068
A7 (instruction)+Instruction+How-to                                 0.026813
A12 (promotion)+Information/Explanation+Description of a Thing      0.016599
Name: merged_labels-CORE_sub, dtype: float64

In [116]:
# Analyze distribution of merged labels
merged_df["merged_labels-CORE_main"].value_counts(normalize="True")[:10]

A8 (news)+News/Reporting+Narrative                                                  0.110827
A16 (information)+Information/Explanation+Informational Description/Explanation     0.068182
A12 (promotion)+Promotion+Informational Description/Explanation                     0.064096
A1 (argumentative)+News/Reporting+Narrative                                         0.036772
A1 (argumentative)+Opinion/Argumentation+Opinion                                    0.034219
A7 (instruction)+Instruction+How-To/Instructional                                   0.031154
A12 (promotion)+Promotion+Informational Persuasion                                  0.028856
A11 (personal)+Opinion/Argumentation+Narrative                                      0.028090
A1 (argumentative)+Information/Explanation+Informational Description/Explanation    0.022472
A14 (academic)+Information/Explanation+Informational Description/Explanation        0.021961
Name: merged_labels-CORE_main, dtype: float64

In [110]:
# Show all columns
pd.set_option('display.max_columns', None)

In [111]:
merged_df.describe(include="all")

Unnamed: 0,index,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE,FTD + GINCO_downcast count,FTD + MT-GINCO_downcast count,FTD + CORE_main count,FTD + CORE_sub count,GINCO_downcast + MT-GINCO_downcast count,GINCO_downcast + CORE_main count,GINCO_downcast + CORE_sub count,MT-GINCO_downcast + CORE_main count,MT-GINCO_downcast + CORE_sub count,CORE_main + CORE_sub count,FTD + GINCO_downcast,FTD + MT-GINCO_downcast,FTD + CORE_main,FTD + CORE_sub,GINCO_downcast + MT-GINCO_downcast,GINCO_downcast + CORE_main,GINCO_downcast + CORE_sub,MT-GINCO_downcast + CORE_main,MT-GINCO_downcast + CORE_sub,CORE_main + CORE_sub
count,4918.0,4918,4918,4918,3916,3916,4918,4918,4918,3916.0,3916.0,4918.0,4918.0,2914.0,3916.0,3916.0,3916.0,3916.0,4918.0,3916,3916,4918,4918,2914,3916,3916,3916,3916,4918
unique,,4,4918,10,9,9,20,43,10,,,,,,,,,,,73,76,94,197,57,89,197,87,196,157
top,,CORE,BMW's and Chinese Justice * * * * * In most pl...,A12 (promotion),News/Reporting,News/Reporting,Informational Description/Explanation,Description of a Thing,News,,,,,,,,,,,A8 (news)+News/Reporting,A8 (news)+News/Reporting,A12 (promotion)+Informational Description/Expl...,A12 (promotion)+Description of a Thing,News/Reporting+News/Reporting,News/Reporting+Narrative,News/Reporting+News Report/Blog,News/Reporting+Narrative,News/Reporting+News Report/Blog,Informational Description/Explanation+Descript...
freq,,1500,1,1011,933,951,2094,1167,1079,,,,,,,,,,,506,564,665,541,656,640,552,682,576,1066
mean,7958.074014,,,,,,,,,211.196629,233.837079,304.245222,191.596584,344.944406,291.841675,162.086823,295.57712,168.764556,412.094347,,,,,,,,,,
std,13529.153307,,,,,,,,,166.74071,188.106191,234.419321,184.788976,234.070114,241.34874,179.200385,236.636062,187.58176,409.323229,,,,,,,,,,
min,0.0,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,
25%,417.0,,,,,,,,,58.0,56.0,106.0,37.0,95.0,75.0,28.0,99.0,29.0,101.0,,,,,,,,,,
50%,857.5,,,,,,,,,192.0,227.0,224.0,129.0,433.0,224.0,98.0,256.0,99.0,170.0,,,,,,,,,,
75%,8602.25,,,,,,,,,304.0,425.0,629.0,253.0,479.0,608.0,255.0,523.0,241.0,732.0,,,,,,,,,,


In [None]:
#Let's analyze which pairs occur together most often

In [107]:
# For each of the pairs, analyse how often they occur together
for i in combinations:
	print(f"Value counts for: {i[0]} + {i[1]}")
	print(merged_df[f"{i[0]} + {i[1]}"].value_counts(normalize=True).sort_values()to_markdown())

Value counts for: FTD + GINCO_downcast
|                                               |   FTD + GINCO_downcast |
|:----------------------------------------------|-----------------------:|
| A8 (news)+News/Reporting                      |            0.129213    |
| A12 (promotion)+Promotion                     |            0.104188    |
| A16 (information)+Information/Explanation     |            0.0776302   |
| A1 (argumentative)+Opinion/Argumentation      |            0.072523    |
| A1 (argumentative)+News/Reporting             |            0.0574566   |
| A7 (instruction)+Instruction                  |            0.0528601   |
| A11 (personal)+Opinion/Argumentation          |            0.0490296   |
| A1 (argumentative)+Information/Explanation    |            0.0337079   |
| A17 (review)+Opinion/Argumentation            |            0.0247702   |
| A12 (promotion)+Information/Explanation       |            0.023238    |
| A14 (academic)+Information/Explanation        |            

## Finding the best mapping using the association rules

In [3]:
# Open the dataset
merged_df = pd.read_csv("data-sheets-with-all-info/Merged-datasets-with-all-predictions.csv", sep="\t", index_col = 0)
merged_df.head(1)

Unnamed: 0,index,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE
0,0,FTD,BMW's and Chinese Justice * * * * * In most pl...,A1 (argumentative),News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded


In [4]:
# Use only relevant columns
merged_df = merged_df[['FTD', 'GINCO_downcast', 'CORE_main', 'CORE_sub']]
merged_df.head(1)

Unnamed: 0,FTD,GINCO_downcast,CORE_main,CORE_sub
0,A1 (argumentative),News/Reporting,Narrative,News Report/Blog


In [5]:
# Remove nan values - they occurred in the GINCO_downcast column because we added MT-GINCO downcast without this values to the dataframe, and we don't use this column here.
merged_df.dropna(subset=["GINCO_downcast"], inplace=True)

In [6]:
# Check if there are any NaNs left
merged_df[merged_df.isnull().values]

Unnamed: 0,FTD,GINCO_downcast,CORE_main,CORE_sub


In [7]:
# Reset index
merged_df = merged_df.reset_index()

In [8]:
# Add information about columns to the label names
merged_df["FTD"] = "FTD: " + merged_df["FTD"].astype(str)
merged_df["FTD"].head(5)

0    FTD: A1 (argumentative)
1    FTD: A1 (argumentative)
2    FTD: A1 (argumentative)
3    FTD: A1 (argumentative)
4    FTD: A1 (argumentative)
Name: FTD, dtype: object

In [9]:
merged_df.columns

Index(['index', 'FTD', 'GINCO_downcast', 'CORE_main', 'CORE_sub'], dtype='object')

In [10]:
merged_df["GINCO_downcast"] = "GINCO: " + merged_df["GINCO_downcast"].astype(str)
merged_df["CORE_main"] = "CORE_main: " + merged_df["CORE_main"].astype(str)
merged_df["CORE_sub"] = "CORE_sub: " + merged_df["CORE_sub"].astype(str)
merged_df.head(3)

Unnamed: 0,index,FTD,GINCO_downcast,CORE_main,CORE_sub
0,0,FTD: A1 (argumentative),GINCO: News/Reporting,CORE_main: Narrative,CORE_sub: News Report/Blog
1,1,FTD: A1 (argumentative),GINCO: Information/Explanation,CORE_main: Opinion,CORE_sub: Opinion Blog
2,2,FTD: A1 (argumentative),GINCO: Opinion/Argumentation,CORE_main: Opinion,CORE_sub: Opinion Blog


In [11]:
merged_df.shape

(3916, 5)

In [12]:
# Save the dataset

merged_df.to_csv("data-sheets-with-all-info/Merged-datasets-with-all-predictions.csv", sep="\t")

In [162]:
# Let's created the required ouput - a list of all values per instance - I will create a list of pairs from each schemata instead of lists of 4 values from the schema
apriori_list = []

for i in range(0,3915):
	apriori_list.append([str(merged_df.values[i,j]) for j in range(1,5)])


apriori_list[:10]

[['FTD: A1 (argumentative)',
  'GINCO: News/Reporting',
  'CORE_main: Narrative',
  'CORE_sub: News Report/Blog'],
 ['FTD: A1 (argumentative)',
  'GINCO: Information/Explanation',
  'CORE_main: Opinion',
  'CORE_sub: Opinion Blog'],
 ['FTD: A1 (argumentative)',
  'GINCO: Opinion/Argumentation',
  'CORE_main: Opinion',
  'CORE_sub: Opinion Blog'],
 ['FTD: A1 (argumentative)',
  'GINCO: Opinion/Argumentation',
  'CORE_main: Narrative',
  'CORE_sub: Historical Article'],
 ['FTD: A1 (argumentative)',
  'GINCO: News/Reporting',
  'CORE_main: Narrative',
  'CORE_sub: News Report/Blog'],
 ['FTD: A1 (argumentative)',
  'GINCO: List of Summaries/Excerpts',
  'CORE_main: Narrative',
  'CORE_sub: News Report/Blog'],
 ['FTD: A1 (argumentative)',
  'GINCO: News/Reporting',
  'CORE_main: Narrative',
  'CORE_sub: News Report/Blog'],
 ['FTD: A1 (argumentative)',
  'GINCO: Opinion/Argumentation',
  'CORE_main: Informational Description/Explanation',
  'CORE_sub: Description of a Thing'],
 ['FTD: A1 (ar

In [163]:
# Training the model
from apyori import apriori

results = list(apriori(apriori_list,
             min_support=0.01,
             min_confidence=0.80,
             min_lift=1.0,
             max_length=None))

In [165]:
#Transform the results into pandas dataframe
output = list(results) # returns a non-tabular output

# putting output into a pandas dataframe
def inspect(output):
    lhs         = [tuple(result[2][0][0])[0] for result in output]
    rhs         = [tuple(result[2][0][1])[0] for result in output]
    support    = [result[1] for result in output]
    confidence = [result[2][0][2] for result in output]
    lift       = [result[2][0][3] for result in output]
    entire_item = [list(result) for result in output]
    return list(zip(lhs, rhs, support, confidence, lift, entire_item))
output_df = pd.DataFrame(inspect(output), columns = ['Left_Hand_Side', 'Right_Hand_Side', 'Support', 'Confidence', 'Lift','Full_result'])

# Display results based on lift
output_df.sort_values(by="Lift", ascending=False).head()

In [146]:
output_DataFrame.iloc[102]["Full_result"]

[frozenset({'CORE_main: Narrative',
            'CORE_sub: Short Story',
            'FTD: A4 (fiction)',
            'GINCO: Other'}),
 0.013282247765006385,
 [OrderedStatistic(items_base=frozenset({'CORE_sub: Short Story', 'GINCO: Other'}), items_add=frozenset({'CORE_main: Narrative', 'FTD: A4 (fiction)'}), confidence=0.8666666666666667, lift=37.285714285714285),
  OrderedStatistic(items_base=frozenset({'CORE_sub: Short Story', 'CORE_main: Narrative', 'GINCO: Other'}), items_add=frozenset({'FTD: A4 (fiction)'}), confidence=1.0, lift=30.826771653543307),
  OrderedStatistic(items_base=frozenset({'GINCO: Other', 'CORE_main: Narrative', 'FTD: A4 (fiction)'}), items_add=frozenset({'CORE_sub: Short Story'}), confidence=0.9811320754716981, lift=38.79931389365352),
  OrderedStatistic(items_base=frozenset({'CORE_sub: Short Story', 'FTD: A4 (fiction)', 'GINCO: Other'}), items_add=frozenset({'CORE_main: Narrative'}), confidence=0.8965517241379309, lift=3.1564748201438846)]]

In [147]:
output_DataFrame.iloc[67]["Full_result"]

[frozenset({'CORE_main: Narrative',
            'CORE_sub: Short Story',
            'FTD: A4 (fiction)'}),
 0.020945083014048533,
 [OrderedStatistic(items_base=frozenset({'CORE_sub: Short Story'}), items_add=frozenset({'CORE_main: Narrative', 'FTD: A4 (fiction)'}), confidence=0.8282828282828284, lift=35.634365634365636),
  OrderedStatistic(items_base=frozenset({'CORE_sub: Short Story', 'CORE_main: Narrative'}), items_add=frozenset({'FTD: A4 (fiction)'}), confidence=0.9647058823529412, lift=29.738767948124135),
  OrderedStatistic(items_base=frozenset({'CORE_main: Narrative', 'FTD: A4 (fiction)'}), items_add=frozenset({'CORE_sub: Short Story'}), confidence=0.9010989010989011, lift=35.634365634365636),
  OrderedStatistic(items_base=frozenset({'CORE_sub: Short Story', 'FTD: A4 (fiction)'}), items_add=frozenset({'CORE_main: Narrative'}), confidence=0.9010989010989011, lift=3.1724839908293148)]]

Support: It measures the number of times a particular item or combination of items occur in a dataset out of the number of all instances.
`Support(I) = \frac{transaction containing(I)}{total.transactions}`

Confidence: It measures how the left value is likely to consume to be predicted for this instance given they the right value has been predicted.
`Confidence(I_1\rightarrow I_2) = \frac{transaction cointaing(I_1and I_2)}{transactions containing(I_1)}

Lift: A lift is a metric that determines the strength of association between the best rules. It is obtained by taking confidence and diving it with support. 
`Lift(I_1\rightarrow I_2) = \frac{Confidence(I_1\rightarrow I_2)}{Support(I_2)}`

In [182]:
# Let's also try the same, but comparing schemata only in pairs

def compare_with_apriori(dataframe, column1, column2):
	apriori_list =  list(list(zip(list(dataframe[column1]), list(dataframe[column2]))))

	results = list(apriori(apriori_list,
             min_support=0.01,
             min_confidence=0.50,
             min_lift=1.0,
             max_length=None))

	output_df = pd.DataFrame(inspect(list(results)), columns = ['Left_Hand_Side', 'Right_Hand_Side', 'Support', 'Confidence', 'Lift','Full_result'])

	output_df = output_df.sort_values(by="Lift", ascending=False)

	return output_df


In [183]:
FTD_GINCO_df = compare_with_apriori(merged_df, "FTD", "GINCO_downcast")

FTD_GINCO_df

Unnamed: 0,Left_Hand_Side,Right_Hand_Side,Support,Confidence,Lift,Full_result
7,FTD: A9 (legal),GINCO: Legal/Regulation,0.01762,0.627273,32.321053,"[(GINCO: Legal/Regulation, FTD: A9 (legal)), 0..."
5,FTD: A7 (instruction),GINCO: Instruction,0.05286,0.565574,9.629508,"[(FTD: A7 (instruction), GINCO: Instruction), ..."
4,FTD: A4 (fiction),GINCO: Other,0.016599,0.511811,7.679126,"[(GINCO: Other, FTD: A4 (fiction)), 0.01659856..."
2,FTD: A14 (academic),GINCO: Information/Explanation,0.022472,0.814815,4.323597,"[(GINCO: Information/Explanation, FTD: A14 (ac..."
1,FTD: A12 (promotion),GINCO: Promotion,0.104188,0.563536,4.14035,"[(GINCO: Promotion, FTD: A12 (promotion)), 0.1..."
3,FTD: A16 (information),GINCO: Information/Explanation,0.07763,0.72209,3.831579,"[(FTD: A16 (information), GINCO: Information/E..."
0,FTD: A11 (personal),GINCO: Opinion/Argumentation,0.04903,0.666667,3.729524,"[(GINCO: Opinion/Argumentation, FTD: A11 (pers..."
6,FTD: A8 (news),GINCO: News/Reporting,0.129213,0.840532,3.52789,"[(FTD: A8 (news), GINCO: News/Reporting), 0.12..."


In [184]:
FTD_CORE_main_df = compare_with_apriori(merged_df, "FTD", "CORE_main")

FTD_CORE_main_df

Unnamed: 0,Left_Hand_Side,Right_Hand_Side,Support,Confidence,Lift,Full_result
0,CORE_main: How-To/Instructional,FTD: A7 (instruction),0.03524,0.890323,9.525965,"[(CORE_main: How-To/Instructional, FTD: A7 (in..."
5,CORE_main: Informational Persuasion,FTD: A12 (promotion),0.036006,0.758065,4.10025,"[(FTD: A12 (promotion), CORE_main: Information..."
7,FTD: A8 (news),CORE_main: Narrative,0.124617,0.810631,2.85214,"[(FTD: A8 (news), CORE_main: Narrative), 0.124..."
6,FTD: A4 (fiction),CORE_main: Narrative,0.023238,0.716535,2.521072,"[(CORE_main: Narrative, FTD: A4 (fiction)), 0...."
4,FTD: A9 (legal),CORE_main: Informational Description/Explanation,0.027068,0.963636,2.376322,[(CORE_main: Informational Description/Explana...
2,FTD: A14 (academic),CORE_main: Informational Description/Explanation,0.02477,0.898148,2.214829,[(CORE_main: Informational Description/Explana...
3,FTD: A16 (information),CORE_main: Informational Description/Explanation,0.089632,0.833729,2.055972,"[(FTD: A16 (information), CORE_main: Informati..."
1,FTD: A12 (promotion),CORE_main: Informational Description/Explanation,0.118744,0.642265,1.583823,"[(FTD: A12 (promotion), CORE_main: Information..."


In [186]:
FTD_core_sub_df = compare_with_apriori(merged_df, "FTD", "CORE_sub")
FTD_core_sub_df

Unnamed: 0,Left_Hand_Side,Right_Hand_Side,Support,Confidence,Lift,Full_result
10,CORE_sub: Short Story,FTD: A4 (fiction),0.023238,0.919192,28.342957,"[(CORE_sub: Short Story, FTD: A4 (fiction)), 0..."
4,CORE_sub: Legal terms,FTD: A9 (legal),0.021706,0.643939,22.924242,"[(CORE_sub: Legal terms, FTD: A9 (legal)), 0.0..."
8,CORE_sub: Research Article,FTD: A14 (academic),0.017109,0.563025,20.414877,"[(CORE_sub: Research Article, FTD: A14 (academ..."
3,CORE_sub: How-to,FTD: A7 (instruction),0.03192,0.905797,9.691534,"[(CORE_sub: How-to, FTD: A7 (instruction)), 0...."
9,CORE_sub: Reviews,FTD: A17 (review),0.025281,0.798387,9.222666,"[(CORE_sub: Reviews, FTD: A17 (review)), 0.025..."
7,CORE_sub: Personal Blog,FTD: A11 (personal),0.030899,0.675978,9.191418,"[(CORE_sub: Personal Blog, FTD: A11 (personal)..."
2,CORE_sub: Encyclopedia Article,FTD: A16 (information),0.016088,0.65625,6.104216,"[(FTD: A16 (information), CORE_sub: Encycloped..."
11,CORE_sub: Sports Report,FTD: A8 (news),0.034219,0.807229,5.251011,"[(FTD: A8 (news), CORE_sub: Sports Report), 0...."
1,CORE_sub: Description with Intent to Sell,FTD: A12 (promotion),0.042135,0.793269,4.290666,"[(FTD: A12 (promotion), CORE_sub: Description ..."
6,CORE_sub: Opinion Blog,FTD: A1 (argumentative),0.044178,0.742489,3.498903,"[(CORE_sub: Opinion Blog, FTD: A1 (argumentati..."


In [187]:
GINCO_CORE_main_df = compare_with_apriori(merged_df, "GINCO_downcast", "CORE_main")
GINCO_CORE_main_df

Unnamed: 0,Left_Hand_Side,Right_Hand_Side,Support,Confidence,Lift,Full_result
5,CORE_main: Interactive Discussion,GINCO: Forum,0.027579,0.537313,14.027463,"[(CORE_main: Interactive Discussion, GINCO: Fo..."
0,CORE_main: How-To/Instructional,GINCO: Instruction,0.032176,0.812903,13.840561,"[(CORE_main: How-To/Instructional, GINCO: Inst..."
4,CORE_main: Informational Persuasion,GINCO: Promotion,0.033197,0.698925,5.135064,"[(GINCO: Promotion, CORE_main: Informational P..."
7,CORE_main: Opinion,GINCO: Opinion/Argumentation,0.057201,0.501119,2.8034,"[(GINCO: Opinion/Argumentation, CORE_main: Opi..."
2,GINCO: Legal/Regulation,CORE_main: Informational Description/Explanation,0.019152,0.986842,2.433548,"[(GINCO: Legal/Regulation, CORE_main: Informat..."
6,CORE_main: Narrative,GINCO: News/Reporting,0.163432,0.575022,2.413492,"[(CORE_main: Narrative, GINCO: News/Reporting)..."
1,GINCO: Information/Explanation,CORE_main: Informational Description/Explanation,0.15526,0.823848,2.031606,"[(GINCO: Information/Explanation, CORE_main: I..."
3,GINCO: Promotion,CORE_main: Informational Description/Explanation,0.079162,0.581614,1.434256,"[(GINCO: Promotion, CORE_main: Informational D..."


In [188]:
GINCO_CORE_sub_df = compare_with_apriori(merged_df, "GINCO_downcast", "CORE_sub")
GINCO_CORE_sub_df

Unnamed: 0,Left_Hand_Side,Right_Hand_Side,Support,Confidence,Lift,Full_result
4,CORE_sub: Legal terms,GINCO: Legal/Regulation,0.017365,0.515152,26.54386,"[(GINCO: Legal/Regulation, CORE_sub: Legal ter..."
1,CORE_sub: Discussion Forum,GINCO: Forum,0.024259,0.59375,15.500833,"[(GINCO: Forum, CORE_sub: Discussion Forum), 0..."
3,CORE_sub: How-to,GINCO: Instruction,0.027324,0.775362,13.201386,"[(CORE_sub: How-to, GINCO: Instruction), 0.027..."
9,CORE_sub: Short Story,GINCO: Other,0.015322,0.606061,9.093231,"[(CORE_sub: Short Story, GINCO: Other), 0.0153..."
0,CORE_sub: Description with Intent to Sell,GINCO: Promotion,0.039326,0.740385,5.439674,"[(GINCO: Promotion, CORE_sub: Description with..."
7,CORE_sub: Personal Blog,GINCO: Opinion/Argumentation,0.036517,0.798883,4.469178,"[(GINCO: Opinion/Argumentation, CORE_sub: Pers..."
8,CORE_sub: Research Article,GINCO: Information/Explanation,0.025026,0.823529,4.369839,"[(GINCO: Information/Explanation, CORE_sub: Re..."
2,CORE_sub: Encyclopedia Article,GINCO: Information/Explanation,0.018131,0.739583,3.924402,"[(GINCO: Information/Explanation, CORE_sub: En..."
6,CORE_sub: Opinion Blog,GINCO: Opinion/Argumentation,0.039837,0.669528,3.74553,"[(CORE_sub: Opinion Blog, GINCO: Opinion/Argum..."
5,CORE_sub: News Report/Blog,GINCO: News/Reporting,0.14096,0.761379,3.195671,"[(CORE_sub: News Report/Blog, GINCO: News/Repo..."


In [189]:
CORE_sub_CORE_main_df = compare_with_apriori(merged_df, "CORE_main", "CORE_sub")
CORE_sub_CORE_main_df

Unnamed: 0,Left_Hand_Side,Right_Hand_Side,Support,Confidence,Lift,Full_result
18,CORE_main: Spoken,CORE_sub: Interview,0.016854,0.594595,24.509815,"[(CORE_sub: Interview, CORE_main: Spoken), 0.0..."
0,CORE_main: How-To/Instructional,CORE_sub: How-to,0.029877,0.754839,21.419916,"[(CORE_main: How-To/Instructional, CORE_sub: H..."
8,CORE_main: Interactive Discussion,CORE_sub: Discussion Forum,0.033963,0.661692,16.1949,"[(CORE_main: Interactive Discussion, CORE_sub:..."
7,CORE_main: Informational Persuasion,CORE_sub: Description with Intent to Sell,0.038304,0.806452,15.183002,"[(CORE_main: Informational Persuasion, CORE_su..."
9,CORE_sub: Question/Answer Forum,CORE_main: Interactive Discussion,0.012513,0.662162,12.900632,"[(CORE_sub: Question/Answer Forum, CORE_main: ..."
14,CORE_sub: Advice,CORE_main: Opinion,0.011491,0.882353,7.729964,"[(CORE_sub: Advice, CORE_main: Opinion), 0.011..."
15,CORE_sub: Opinion Blog,CORE_main: Opinion,0.050051,0.841202,7.369454,"[(CORE_sub: Opinion Blog, CORE_main: Opinion),..."
17,CORE_sub: Reviews,CORE_main: Opinion,0.026302,0.830645,7.276972,"[(CORE_sub: Reviews, CORE_main: Opinion), 0.02..."
16,CORE_sub: Religious Blogs/Sermons,CORE_main: Opinion,0.012513,0.777778,6.813821,"[(CORE_sub: Religious Blogs/Sermons, CORE_main..."
13,CORE_sub: Sports Report,CORE_main: Narrative,0.040858,0.963855,3.391247,"[(CORE_sub: Sports Report, CORE_main: Narrativ..."


# Analysing the correlation of labels with the chi-square test

The Chi-square test is one of the statistical tests we can use to decide whether there is a correlation between the categorical variables by analysing the relationship between the observed and expected values.

Pearson’s chi-squared test is a hypothesis test that is used to determine whether there is a significant association between two categorical variables in the data. The test involves two hypotheses (H0 & H1):
- H0 : The two categorical variables have no relationship (independent)
- H1 : There is a relationship (dependent) between two categorical variables

So as a null hypothesis, we keep the positive aspect of the test and in the alternate hypothesis, we keep the negative aspect. The positive aspect of chi-square is that there should not be any correlation because correlation can result in overfitting of the machine learning algorithm. The negative is that there is a correlation between the two categorical columns.

In [13]:
merged_df.head(2)

Unnamed: 0,index,FTD,GINCO_downcast,CORE_main,CORE_sub
0,0,FTD: A1 (argumentative),GINCO: News/Reporting,CORE_main: Narrative,CORE_sub: News Report/Blog
1,1,FTD: A1 (argumentative),GINCO: Information/Explanation,CORE_main: Opinion,CORE_sub: Opinion Blog


In [14]:
# Create a constingency table for two schemata (two columns)
FTD_GINCO_comparison = pd.crosstab(merged_df["FTD"], merged_df["GINCO_downcast"])

FTD_GINCO_comparison

GINCO_downcast,GINCO: Forum,GINCO: Information/Explanation,GINCO: Instruction,GINCO: Legal/Regulation,GINCO: List of Summaries/Excerpts,GINCO: News/Reporting,GINCO: Opinion/Argumentation,GINCO: Other,GINCO: Promotion
FTD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
FTD: A1 (argumentative),58,132,1,3,57,225,284,54,17
FTD: A11 (personal),41,6,3,0,15,6,192,25,0
FTD: A12 (promotion),11,91,17,0,58,66,29,44,408
FTD: A14 (academic),1,88,0,1,2,6,7,3,0
FTD: A16 (information),4,304,1,1,26,37,23,10,15
FTD: A17 (review),18,20,0,0,53,71,97,16,64
FTD: A4 (fiction),0,20,0,0,3,2,37,65,0
FTD: A7 (instruction),17,37,207,0,26,10,17,26,26
FTD: A8 (news),0,16,0,2,54,506,14,7,3
FTD: A9 (legal),0,24,1,69,1,4,0,11,0


In [21]:
# Add sum of values in the row
FTD_GINCO_comparison['Row_total'] = FTD_GINCO_comparison.apply(np.sum, axis=1)

FTD_GINCO_comparison

GINCO_downcast,GINCO: Forum,GINCO: Information/Explanation,GINCO: Instruction,GINCO: Legal/Regulation,GINCO: List of Summaries/Excerpts,GINCO: News/Reporting,GINCO: Opinion/Argumentation,GINCO: Other,GINCO: Promotion,Row_total
FTD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
FTD: A1 (argumentative),58,132,1,3,57,225,284,54,17,831
FTD: A11 (personal),41,6,3,0,15,6,192,25,0,288
FTD: A12 (promotion),11,91,17,0,58,66,29,44,408,724
FTD: A14 (academic),1,88,0,1,2,6,7,3,0,108
FTD: A16 (information),4,304,1,1,26,37,23,10,15,421
FTD: A17 (review),18,20,0,0,53,71,97,16,64,339
FTD: A4 (fiction),0,20,0,0,3,2,37,65,0,127
FTD: A7 (instruction),17,37,207,0,26,10,17,26,26,366
FTD: A8 (news),0,16,0,2,54,506,14,7,3,602
FTD: A9 (legal),0,24,1,69,1,4,0,11,0,110


In [22]:
# Add sum of columns
FTD_GINCO_comparison.loc['Column_total']= FTD_GINCO_comparison.sum()

FTD_GINCO_comparison

GINCO_downcast,GINCO: Forum,GINCO: Information/Explanation,GINCO: Instruction,GINCO: Legal/Regulation,GINCO: List of Summaries/Excerpts,GINCO: News/Reporting,GINCO: Opinion/Argumentation,GINCO: Other,GINCO: Promotion,Row_total
FTD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
FTD: A1 (argumentative),58,132,1,3,57,225,284,54,17,831
FTD: A11 (personal),41,6,3,0,15,6,192,25,0,288
FTD: A12 (promotion),11,91,17,0,58,66,29,44,408,724
FTD: A14 (academic),1,88,0,1,2,6,7,3,0,108
FTD: A16 (information),4,304,1,1,26,37,23,10,15,421
FTD: A17 (review),18,20,0,0,53,71,97,16,64,339
FTD: A4 (fiction),0,20,0,0,3,2,37,65,0,127
FTD: A7 (instruction),17,37,207,0,26,10,17,26,26,366
FTD: A8 (news),0,16,0,2,54,506,14,7,3,602
FTD: A9 (legal),0,24,1,69,1,4,0,11,0,110


In [34]:
pairs = FTD_GINCO_comparison.iloc[].stack().index.values
pairs

array([('FTD: A1 (argumentative)', 'GINCO: Forum'),
       ('FTD: A1 (argumentative)', 'GINCO: Information/Explanation'),
       ('FTD: A1 (argumentative)', 'GINCO: Instruction'),
       ('FTD: A1 (argumentative)', 'GINCO: Legal/Regulation'),
       ('FTD: A1 (argumentative)', 'GINCO: List of Summaries/Excerpts'),
       ('FTD: A1 (argumentative)', 'GINCO: News/Reporting'),
       ('FTD: A1 (argumentative)', 'GINCO: Opinion/Argumentation'),
       ('FTD: A1 (argumentative)', 'GINCO: Other'),
       ('FTD: A1 (argumentative)', 'GINCO: Promotion'),
       ('FTD: A1 (argumentative)', 'Row_total'),
       ('FTD: A11 (personal)', 'GINCO: Forum'),
       ('FTD: A11 (personal)', 'GINCO: Information/Explanation'),
       ('FTD: A11 (personal)', 'GINCO: Instruction'),
       ('FTD: A11 (personal)', 'GINCO: Legal/Regulation'),
       ('FTD: A11 (personal)', 'GINCO: List of Summaries/Excerpts'),
       ('FTD: A11 (personal)', 'GINCO: News/Reporting'),
       ('FTD: A11 (personal)', 'GINCO: Opinio

In [None]:
# Remove "Column total"

In [26]:
# To calculate dependency of each label pair, you need to create a table, as seen in https://link.springer.com/article/10.1007/s10994-012-5312-9/tables/1

# Example for dependency of Forum and FTD Argumentative

# Calculate a - labels occured together
a = FTD_GINCO_comparison.loc["FTD: A1 (argumentative)", "GINCO: Forum"]
print(a)

# Calculate b - number of the FTD label that did not occured in combination with the GINCO label
b = FTD_GINCO_comparison.loc["FTD: A1 (argumentative)", "Row_total"]-a
print(b)

# Calculate c - number of the GINCO label that did not occured in combination with the FTD label
c = FTD_GINCO_comparison.loc["Column_total", "GINCO: Forum"]-a
print(c)

# Calculate d - number of instances without these two labels
d = FTD_GINCO_comparison.loc["Column_total", "Row_total"]-a-b-c
print(d)

58
773
92
2993


In [27]:
# Calculate dependency based on the formula in https://link.springer.com/article/10.1007/s10994-012-5312-9
x = ((((a*d)-(b*c))**2)*(a+b+c+d))/((a+b)*(c+d)*(b+d)*(a+c))
x

28.397244332005908

In [36]:
pair_list = []
value_list = []

for pair in pairs:
	# Calculate a - labels occured together
	a = FTD_GINCO_comparison.loc[pair[0], pair[1]]

	# Calculate b - number of the FTD label that did not occured in combination with the GINCO label
	b = FTD_GINCO_comparison.loc[pair[0], "Row_total"]-a

	# Calculate c - number of the GINCO label that did not occured in combination with the FTD label
	c = FTD_GINCO_comparison.loc["Column_total", pair[1]]-a

	# Calculate d - number of instances without these two labels
	d = FTD_GINCO_comparison.loc["Column_total", "Row_total"]-a-b-c

	x = ((((a*d)-(b*c))**2)*(a+b+c+d))/((a+b)*(c+d)*(b+d)*(a+c))

	pair_list.append(f"{pair[0]}, {pair[1]}")
	value_list.append(x)

# Create a df with results
FTD_GINCO_results = pd.DataFrame({"pair": pair_list, "dependency_value": value_list}).sort_values("dependency_value", ascending= False)

FTD_GINCO_results


  x = ((((a*d)-(b*c))**2)*(a+b+c+d))/((a+b)*(c+d)*(b+d)*(a+c))


Unnamed: 0,pair,dependency_value
93,"FTD: A9 (legal), GINCO: Legal/Regulation",2197.463402
72,"FTD: A7 (instruction), GINCO: Instruction",1876.031006
85,"FTD: A8 (news), GINCO: News/Reporting",1421.777009
28,"FTD: A12 (promotion), GINCO: Promotion",1380.065589
41,"FTD: A16 (information), GINCO: Information/Exp...",878.289115
...,...,...
105,"Column_total, GINCO: News/Reporting",
106,"Column_total, GINCO: Opinion/Argumentation",
107,"Column_total, GINCO: Other",
108,"Column_total, GINCO: Promotion",


In [38]:
# Show top 30
FTD_GINCO_results.head(30)

Unnamed: 0,pair,dependency_value
93,"FTD: A9 (legal), GINCO: Legal/Regulation",2197.463402
72,"FTD: A7 (instruction), GINCO: Instruction",1876.031006
85,"FTD: A8 (news), GINCO: News/Reporting",1421.777009
28,"FTD: A12 (promotion), GINCO: Promotion",1380.065589
41,"FTD: A16 (information), GINCO: Information/Exp...",878.289115
16,"FTD: A11 (personal), GINCO: Opinion/Argumentation",504.108089
67,"FTD: A4 (fiction), GINCO: Other",418.132802
31,"FTD: A14 (academic), GINCO: Information/Explan...",284.897583
6,"FTD: A1 (argumentative), GINCO: Opinion/Argume...",190.92003
81,"FTD: A8 (news), GINCO: Information/Explanation",121.883716


In [37]:
# Remove NaN
FTD_GINCO_results = FTD_GINCO_results.dropna()
FTD_GINCO_results

Unnamed: 0,pair,dependency_value
93,"FTD: A9 (legal), GINCO: Legal/Regulation",2197.463402
72,"FTD: A7 (instruction), GINCO: Instruction",1876.031006
85,"FTD: A8 (news), GINCO: News/Reporting",1421.777009
28,"FTD: A12 (promotion), GINCO: Promotion",1380.065589
41,"FTD: A16 (information), GINCO: Information/Exp...",878.289115
...,...,...
27,"FTD: A12 (promotion), GINCO: Other",0.493019
24,"FTD: A12 (promotion), GINCO: List of Summaries...",0.291166
77,"FTD: A7 (instruction), GINCO: Other",0.124999
74,"FTD: A7 (instruction), GINCO: List of Summarie...",0.106856


In [33]:
GINCO_labels = list(merged_df["GINCO_downcast"].unique())
FTD_labels = list(merged_df["FTD"].unique())


[('G', 'F'),
 ('I', 'T'),
 ('N', 'D'),
 ('C', ':'),
 ('O', ' '),
 (':', 'A'),
 (' ', '1'),
 ('N', ' '),
 ('e', '('),
 ('w', 'a'),
 ('s', 'r'),
 ('/', 'g'),
 ('R', 'u'),
 ('e', 'm'),
 ('p', 'e'),
 ('o', 'n'),
 ('r', 't'),
 ('t', 'a'),
 ('i', 't'),
 ('n', 'i'),
 ('g', 'v'),
 ('G', 'F'),
 ('I', 'T'),
 ('N', 'D'),
 ('C', ':'),
 ('O', ' '),
 (':', 'A'),
 (' ', '4'),
 ('N', ' '),
 ('e', '('),
 ('w', 'f'),
 ('s', 'i'),
 ('/', 'c'),
 ('R', 't'),
 ('e', 'i'),
 ('p', 'o'),
 ('o', 'n'),
 ('r', ')'),
 ('G', 'F'),
 ('I', 'T'),
 ('N', 'D'),
 ('C', ':'),
 ('O', ' '),
 (':', 'A'),
 (' ', '1'),
 ('N', '2'),
 ('e', ' '),
 ('w', '('),
 ('s', 'p'),
 ('/', 'r'),
 ('R', 'o'),
 ('e', 'm'),
 ('p', 'o'),
 ('o', 't'),
 ('r', 'i'),
 ('t', 'o'),
 ('i', 'n'),
 ('n', ')'),
 ('G', 'F'),
 ('I', 'T'),
 ('N', 'D'),
 ('C', ':'),
 ('O', ' '),
 (':', 'A'),
 (' ', '8'),
 ('N', ' '),
 ('e', '('),
 ('w', 'n'),
 ('s', 'e'),
 ('/', 'w'),
 ('R', 's'),
 ('e', ')'),
 ('G', 'F'),
 ('I', 'T'),
 ('N', 'D'),
 ('C', ':'),
 ('O', ' '),

In [None]:
# Do this for all label pairs
for pair in 
