I will improve the comparison of labels on which the mapping will be based, by a more automated approach (the initial X-GENRE mapping is based on my manual analysis).

In [119]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import apyori

## Merging the datasets with predictions into one

In [3]:
# I will create a "master" sheet with predictions that includes GINCO, FTD and CORE sheets.

# Open the sheets
FTD_df = pd.read_csv("data-sheets-with-all-info/FTD-dataset-with-all-information.csv", sep="\t", index_col = 0)

In [4]:
FTD_df.head(1)

Unnamed: 0,ID,labels,Multiple labels,text,length,GINCO_downcast_pred,MT-GINCO_downcast_pred,CORE_main_pred,CORE_sub_pred,labels_to_X-GENRE
0,__id__1-syndicate,A1 (argumentative),,BMW's and Chinese Justice * * * * * In most pl...,975,News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded


In [6]:
FTD_df.describe(include="all")

Unnamed: 0,ID,labels,Multiple labels,text,length,GINCO_downcast_pred,MT-GINCO_downcast_pred,CORE_main_pred,CORE_sub_pred,labels_to_X-GENRE
count,1553,1553,139,1553,1553.0,1553,1553,1553,1553,1414
unique,1553,43,1,1553,,9,9,8,22,8
top,__id__1-syndicate,A1 (argumentative),y,BMW's and Chinese Justice * * * * * In most pl...,,Information/Explanation,Information/Explanation,Informational Description/Explanation,Description of a Thing,discarded
freq,1,296,139,1,,437,328,856,419,364
mean,,,,,1430.271732,,,,,
std,,,,,4789.884538,,,,,
min,,,,,31.0,,,,,
25%,,,,,235.0,,,,,
50%,,,,,531.0,,,,,
75%,,,,,1203.0,,,,,


In [7]:
# Filter out multiple texts
FTD_df = FTD_df[FTD_df["Multiple labels"] != "y"]

FTD_df.describe(include="all")

Unnamed: 0,ID,labels,Multiple labels,text,length,GINCO_downcast_pred,MT-GINCO_downcast_pred,CORE_main_pred,CORE_sub_pred,labels_to_X-GENRE
count,1414,1414,0.0,1414,1414.0,1414,1414,1414,1414,1414
unique,1414,10,0.0,1414,,9,9,8,22,8
top,__id__1-syndicate,A1 (argumentative),,BMW's and Chinese Justice * * * * * In most pl...,,Information/Explanation,Information/Explanation,Informational Description/Explanation,Description of a Thing,discarded
freq,1,296,,1,,407,311,772,382,364
mean,,,,,1445.212164,,,,,
std,,,,,4989.570842,,,,,
min,,,,,31.0,,,,,
25%,,,,,224.0,,,,,
50%,,,,,495.0,,,,,
75%,,,,,1144.25,,,,,


In [9]:
# Keep only relevant columns
FTD_df = FTD_df[['labels', 'text','GINCO_downcast_pred', 'MT-GINCO_downcast_pred', 'CORE_main_pred',
       'CORE_sub_pred', 'labels_to_X-GENRE']]

# Rename some of the columns
FTD_df.columns = ['FTD', 'text','GINCO_downcast', 'MT-GINCO_downcast', 'CORE_main',
       'CORE_sub', 'X-GENRE']

FTD_df.head(2)

Unnamed: 0,FTD,text,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE
0,A1 (argumentative),BMW's and Chinese Justice * * * * * In most pl...,News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded
1,A1 (argumentative),China and a New Balance of Power SHANGHAI – Th...,Information/Explanation,Opinion/Argumentation,Opinion,Opinion Blog,discarded


In [10]:
# Add a column with information that this is originally from FTD
FTD_df["original_label"] = "FTD"

FTD_df.head(1)

Unnamed: 0,FTD,text,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE,original_label
0,A1 (argumentative),BMW's and Chinese Justice * * * * * In most pl...,News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded,FTD


In [21]:
FTD_df.columns

Index(['FTD', 'text', 'GINCO_downcast', 'MT-GINCO_downcast', 'CORE_main',
       'CORE_sub', 'X-GENRE', 'original_label'],
      dtype='object')

In [22]:
# Reorder labels
FTD_df = FTD_df[['original_label', 'text', 'FTD', 'GINCO_downcast', 'MT-GINCO_downcast', 'CORE_main',
       'CORE_sub', 'X-GENRE']]
FTD_df.head(1)

Unnamed: 0,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE
0,FTD,BMW's and Chinese Justice * * * * * In most pl...,A1 (argumentative),News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded


In [29]:
FTD_df.columns

Index(['original_label', 'text', 'FTD', 'GINCO_downcast', 'MT-GINCO_downcast',
       'CORE_main', 'CORE_sub', 'X-GENRE'],
      dtype='object')

In [78]:
# Open the GINCO dataset
GINCO_df = pd.read_csv("data-sheets-with-all-info/GINCO-MT-GINCO-keeptext-with-all-information.csv", sep="\t", index_col = 0)
GINCO_df.head(1)

Unnamed: 0,id,url,crawled,hard,primary_level_1,primary_level_2,primary_level_3,secondary_level_1,secondary_level_2,secondary_level_3,...,FTD_pred_on_SL,FTD_pred_on_MT,split-without-rare-categories,primary_level_4,downcast_split,CORE_main_pred_on_SL,CORE_main_pred_on_MT,CORE_sub_pred_on_SL,CORE_sub_pred_on_MT,primary_level_1_to_X-GENRE
0,3949,http://www.pomurje.si/aktualno/sport/zimska-li...,2014,False,News/Reporting,News/Reporting,News/Reporting,,,,...,A8 (news),A8 (news),test,News/Reporting,test,Narrative,Narrative,Sports Report,Sports Report,News


In [79]:
GINCO_df.columns

Index(['id', 'url', 'crawled', 'hard', 'primary_level_1', 'primary_level_2',
       'primary_level_3', 'secondary_level_1', 'secondary_level_2',
       'secondary_level_3', 'tertiary_level_1', 'tertiary_level_2',
       'tertiary_level_3', 'split', 'domain', 'GINCORE', 'Slovene_text',
       'MT_text', 'text_length', 'FTD_pred_on_SL', 'FTD_pred_on_MT',
       'split-without-rare-categories', 'primary_level_4', 'downcast_split',
       'CORE_main_pred_on_SL', 'CORE_main_pred_on_MT', 'CORE_sub_pred_on_SL',
       'CORE_sub_pred_on_MT', 'primary_level_1_to_X-GENRE'],
      dtype='object')

In [80]:
# Keep only relevant columns
GINCO_df = GINCO_df[['primary_level_4', 'Slovene_text', 'MT_text', 'FTD_pred_on_SL', 'FTD_pred_on_MT',
       'CORE_main_pred_on_SL', 'CORE_main_pred_on_MT', 'CORE_sub_pred_on_SL',
       'CORE_sub_pred_on_MT', 'primary_level_1_to_X-GENRE']]

GINCO_df.head(1)

Unnamed: 0,primary_level_4,Slovene_text,MT_text,FTD_pred_on_SL,FTD_pred_on_MT,CORE_main_pred_on_SL,CORE_main_pred_on_MT,CORE_sub_pred_on_SL,CORE_sub_pred_on_MT,primary_level_1_to_X-GENRE
0,News/Reporting,"Šport <p/> Zimska liga malega nogometa sobota,...",Sport <p/> Winter Little League Football Satur...,A8 (news),A8 (news),Narrative,Narrative,Sports Report,Sports Report,News


In [81]:
GINCO_df.columns

Index(['primary_level_4', 'Slovene_text', 'MT_text', 'FTD_pred_on_SL',
       'FTD_pred_on_MT', 'CORE_main_pred_on_SL', 'CORE_main_pred_on_MT',
       'CORE_sub_pred_on_SL', 'CORE_sub_pred_on_MT',
       'primary_level_1_to_X-GENRE'],
      dtype='object')

In [82]:
# Separate SI-GINCO and MT-GINCO into two dataframes
SIGINCO_df = GINCO_df[['primary_level_4', 'Slovene_text', 'FTD_pred_on_SL', 'CORE_main_pred_on_SL', 'CORE_sub_pred_on_SL', 'primary_level_1_to_X-GENRE']]

MTGINCO_df = GINCO_df[['primary_level_4', 'MT_text', 'FTD_pred_on_MT', 'CORE_main_pred_on_MT', 'CORE_sub_pred_on_MT',
       'primary_level_1_to_X-GENRE']]

MTGINCO_df.head(1)

Unnamed: 0,primary_level_4,MT_text,FTD_pred_on_MT,CORE_main_pred_on_MT,CORE_sub_pred_on_MT,primary_level_1_to_X-GENRE
0,News/Reporting,Sport <p/> Winter Little League Football Satur...,A8 (news),Narrative,Sports Report,News


In [83]:
print(MTGINCO_df.columns)

Index(['primary_level_4', 'MT_text', 'FTD_pred_on_MT', 'CORE_main_pred_on_MT',
       'CORE_sub_pred_on_MT', 'primary_level_1_to_X-GENRE'],
      dtype='object')


In [84]:
# Rename columns
SIGINCO_df.columns = ['GINCO_downcast', 'text', 'FTD', 'CORE_main', 'CORE_sub','X-GENRE']

MTGINCO_df.columns = ['MT-GINCO_downcast', 'text', 'FTD','CORE_main', 'CORE_sub','X-GENRE']

SIGINCO_df.head(1)

Unnamed: 0,GINCO_downcast,text,FTD,CORE_main,CORE_sub,X-GENRE
0,News/Reporting,"Šport <p/> Zimska liga malega nogometa sobota,...",A8 (news),Narrative,Sports Report,News


In [85]:
# Add column about the original label
SIGINCO_df['original_label'] = "SI-GINCO"
MTGINCO_df['original_label'] = "MT-GINCO"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SIGINCO_df['original_label'] = "SI-GINCO"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MTGINCO_df['original_label'] = "MT-GINCO"


In [86]:
MTGINCO_df.describe(include="all")

Unnamed: 0,MT-GINCO_downcast,text,FTD,CORE_main,CORE_sub,X-GENRE,original_label
count,1002,1002,1002,1002,1002,1002,1002
unique,9,1002,10,8,23,10,1
top,News/Reporting,Sport <p/> Winter Little League Football Satur...,A12 (promotion),Informational Description/Explanation,Description of a Thing,News,MT-GINCO
freq,221,1,287,506,317,221,1002


In [49]:
# Open the CORE df
CORE_df = pd.read_csv("data-sheets-with-all-info/CORE-all-information.csv", sep="\t", index_col = 0)

CORE_df.head(1)

Unnamed: 0,label,text,split,main_labels,sublabels,Len,main_len,sub_len,GINCORE,full_names,main_labels_full_names,FTD_pred,GINCO_downcast_pred,MT-GINCO_downcast_pred,subcategories_to_X-GENRE
0,NA OP SR OB,The Top TEN 'Whiniest Sets of Fans' in English...,train,NA OP,SR OB,4,2,2,NA OP SR OB,SR OB,NA OP,,,,check


In [50]:
# GINCO labels were predicted only on a sample - leave in the dataframe only instances with predictions by all classifiers
CORE_df.dropna(subset=["GINCO_downcast_pred"],inplace=True)
CORE_df.describe(include="all")

Unnamed: 0,label,text,split,main_labels,sublabels,Len,main_len,sub_len,GINCORE,full_names,main_labels_full_names,FTD_pred,GINCO_downcast_pred,MT-GINCO_downcast_pred,subcategories_to_X-GENRE
count,1500,1500,1500,874,1500,1500.0,1500.0,1500.0,1500,1500,1500,1500,1500,1500,1500
unique,77,1500,3,19,43,,,,49,43,20,10,9,9,10
top,NA NE,"Welcome, one and all, to TheCheckingLine.com, ...",train,OP,NE,,,,News,News Report/Blog,Narrative,A1 (argumentative),News/Reporting,News/Reporting,News
freq,379,1,1030,270,395,,,,484,395,626,419,481,481,501
mean,,,,,,2.052,1.052,1.0,,,,,,,
std,,,,,,0.222101,0.222101,0.0,,,,,,,
min,,,,,,2.0,1.0,1.0,,,,,,,
25%,,,,,,2.0,1.0,1.0,,,,,,,
50%,,,,,,2.0,1.0,1.0,,,,,,,
75%,,,,,,2.0,1.0,1.0,,,,,,,


In [51]:
CORE_df.columns

Index(['label', 'text', 'split', 'main_labels', 'sublabels', 'Len', 'main_len',
       'sub_len', 'GINCORE', 'full_names', 'main_labels_full_names',
       'FTD_pred', 'GINCO_downcast_pred', 'MT-GINCO_downcast_pred',
       'subcategories_to_X-GENRE'],
      dtype='object')

In [52]:
# Leave only relevant columns
CORE_df = CORE_df[['text','full_names', 'main_labels_full_names','FTD_pred', 'GINCO_downcast_pred', 'MT-GINCO_downcast_pred', 'subcategories_to_X-GENRE']]
CORE_df.head(1)

Unnamed: 0,text,full_names,main_labels_full_names,FTD_pred,GINCO_downcast_pred,MT-GINCO_downcast_pred,subcategories_to_X-GENRE
14,"Welcome, one and all, to TheCheckingLine.com, ...",Sports Report,Narrative,A1 (argumentative),Opinion/Argumentation,Opinion/Argumentation,News


In [53]:
# Rename the columns
CORE_df.columns = ['text','CORE_sub', 'CORE_main','FTD', 'GINCO_downcast', 'MT-GINCO_downcast', 'X-GENRE']

CORE_df.head(1)

Unnamed: 0,text,CORE_sub,CORE_main,FTD,GINCO_downcast,MT-GINCO_downcast,X-GENRE
14,"Welcome, one and all, to TheCheckingLine.com, ...",Sports Report,Narrative,A1 (argumentative),Opinion/Argumentation,Opinion/Argumentation,News


In [54]:
# Add the column about the original schema
CORE_df['original_label'] = "CORE"
CORE_df.head(1)

Unnamed: 0,text,CORE_sub,CORE_main,FTD,GINCO_downcast,MT-GINCO_downcast,X-GENRE,original_label
14,"Welcome, one and all, to TheCheckingLine.com, ...",Sports Report,Narrative,A1 (argumentative),Opinion/Argumentation,Opinion/Argumentation,News,CORE


In [62]:
SIGINCO_df.shape, FTD_df.shape, MTGINCO_df.shape, CORE_df.shape

((1002, 8), (1414, 8), (1002, 8), (1500, 8))

In [87]:
# Merge the datasets
merge_temp = pd.concat([FTD_df, SIGINCO_df])
merge_temp = pd.concat([merge_temp, MTGINCO_df])
merged_df = pd.concat([merge_temp, CORE_df])

merged_df.describe(include="all")

Unnamed: 0,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE
count,4918,4918,4918,3916,3916,4918,4918,4918
unique,4,4918,10,9,9,20,43,10
top,CORE,BMW's and Chinese Justice * * * * * In most pl...,A12 (promotion),News/Reporting,News/Reporting,Informational Description/Explanation,Description of a Thing,News
freq,1500,1,1011,933,951,2094,1167,1079


In [90]:
# Reset the index
merged_df = merged_df.reset_index()
merged_df.tail()

Unnamed: 0,index,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE
4913,48378,CORE,Bringing You Great Ideas From The Business Wor...,A1 (argumentative),Information/Explanation,Information/Explanation,Opinion,Opinion Blog,Opinion/Argumentation
4914,48403,CORE,"In the hills outside Islamabad in Pakistan, 16...",A1 (argumentative),News/Reporting,News/Reporting,Narrative,News Report/Blog,News
4915,48405,CORE,Who Cares About the Big Bang? Senator Marco Ru...,A1 (argumentative),News/Reporting,News/Reporting,Opinion,Opinion Blog,Opinion/Argumentation
4916,48419,CORE,McCabe: There was no 'toff strategy' The MP be...,A8 (news),News/Reporting,News/Reporting,Narrative,News Report/Blog,News
4917,48423,CORE,Copeland looking to make statement Miserly for...,A8 (news),News/Reporting,News/Reporting,Narrative,News Report/Blog,News


In [91]:
# Save the merged dataset
merged_df.to_csv("data-sheets-with-all-info/Merged-datasets-with-all-predictions.csv", sep="\t")

## Comparison of predictions

To analyse how the predictions are connected, I will take each of the datasets and analyse for each of their original labels whether they are consistently predicted with a label from another schema. Then I will compare them in the other direction - by grouping the predictions of this classifier on all other datasets and seeing whether they are connected well with other labels.

In [99]:
merged_df.columns

Index(['index', 'original_label', 'text', 'FTD', 'GINCO_downcast',
       'MT-GINCO_downcast', 'CORE_main', 'CORE_sub', 'X-GENRE'],
      dtype='object')

In [102]:
# Count for each pair of schemata in how many times they occur together
combinations = [['FTD', 'GINCO_downcast'], ['FTD', 'MT-GINCO_downcast'], ['FTD', 'CORE_main'], ['FTD', 'CORE_sub'], ['GINCO_downcast', 'MT-GINCO_downcast'], ['GINCO_downcast', 'CORE_main'],['GINCO_downcast', 'CORE_sub'] , ['MT-GINCO_downcast', 'CORE_main'], ['MT-GINCO_downcast', 'CORE_sub'], ['CORE_main', 'CORE_sub'] ]

for i in combinations:
	merged_df[f"{i[0]} + {i[1]} count"] = merged_df.groupby([f"{i[0]}", f"{i[1]}"])[f"{i[0]}"].transform('size')
	merged_df[f"{i[0]} + {i[1]}"] = merged_df[f"{i[0]}"].str.cat(merged_df[f"{i[1]}"], sep = "+")

merged_df.head(1)

Unnamed: 0,index,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE,FTD + GINCO_downcast count,...,FTD + GINCO_downcast,FTD + MT-GINCO_downcast,FTD + CORE_main,FTD + CORE_sub,GINCO_downcast + MT-GINCO_downcast,GINCO_downcast + CORE_main,GINCO_downcast + CORE_sub,MT-GINCO_downcast + CORE_main,MT-GINCO_downcast + CORE_sub,CORE_main + CORE_sub
0,0,FTD,BMW's and Chinese Justice * * * * * In most pl...,A1 (argumentative),News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded,225.0,...,A1 (argumentative)+News/Reporting,A1 (argumentative)+News/Reporting,A1 (argumentative)+Narrative,A1 (argumentative)+News Report/Blog,News/Reporting+News/Reporting,News/Reporting+Narrative,News/Reporting+News Report/Blog,News/Reporting+Narrative,News/Reporting+News Report/Blog,Narrative+News Report/Blog


In [112]:
merged_df.columns

Index(['index', 'original_label', 'text', 'FTD', 'GINCO_downcast',
       'MT-GINCO_downcast', 'CORE_main', 'CORE_sub', 'X-GENRE',
       'FTD + GINCO_downcast count', 'FTD + MT-GINCO_downcast count',
       'FTD + CORE_main count', 'FTD + CORE_sub count',
       'GINCO_downcast + MT-GINCO_downcast count',
       'GINCO_downcast + CORE_main count', 'GINCO_downcast + CORE_sub count',
       'MT-GINCO_downcast + CORE_main count',
       'MT-GINCO_downcast + CORE_sub count', 'CORE_main + CORE_sub count',
       'FTD + GINCO_downcast', 'FTD + MT-GINCO_downcast', 'FTD + CORE_main',
       'FTD + CORE_sub', 'GINCO_downcast + MT-GINCO_downcast',
       'GINCO_downcast + CORE_main', 'GINCO_downcast + CORE_sub',
       'MT-GINCO_downcast + CORE_main', 'MT-GINCO_downcast + CORE_sub',
       'CORE_main + CORE_sub'],
      dtype='object')

In [113]:
# Let's concatenate all labels together

merged_df["merged_labels-CORE_sub"] = merged_df['FTD + GINCO_downcast'].str.cat(merged_df["CORE_sub"], sep = "+")
merged_df["merged_labels-CORE_main"] = merged_df['FTD + GINCO_downcast'].str.cat(merged_df["CORE_main"], sep = "+")

merged_df.head(1)

Unnamed: 0,index,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE,FTD + GINCO_downcast count,FTD + MT-GINCO_downcast count,FTD + CORE_main count,FTD + CORE_sub count,GINCO_downcast + MT-GINCO_downcast count,GINCO_downcast + CORE_main count,GINCO_downcast + CORE_sub count,MT-GINCO_downcast + CORE_main count,MT-GINCO_downcast + CORE_sub count,CORE_main + CORE_sub count,FTD + GINCO_downcast,FTD + MT-GINCO_downcast,FTD + CORE_main,FTD + CORE_sub,GINCO_downcast + MT-GINCO_downcast,GINCO_downcast + CORE_main,GINCO_downcast + CORE_sub,MT-GINCO_downcast + CORE_main,MT-GINCO_downcast + CORE_sub,CORE_main + CORE_sub,merged_labels-CORE_sub,merged_labels-CORE_main
0,0,FTD,BMW's and Chinese Justice * * * * * In most pl...,A1 (argumentative),News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded,225.0,232.0,284,253,656.0,640.0,552.0,682.0,576.0,732,A1 (argumentative)+News/Reporting,A1 (argumentative)+News/Reporting,A1 (argumentative)+Narrative,A1 (argumentative)+News Report/Blog,News/Reporting+News/Reporting,News/Reporting+Narrative,News/Reporting+News Report/Blog,News/Reporting+Narrative,News/Reporting+News Report/Blog,Narrative+News Report/Blog,A1 (argumentative)+News/Reporting+News Report/...,A1 (argumentative)+News/Reporting+Narrative


In [115]:
# Analyze distribution of merged labels
merged_df["merged_labels-CORE_sub"].value_counts(normalize="True")[:10]

A8 (news)+News/Reporting+News Report/Blog                           0.085802
A12 (promotion)+Promotion+Description of a Thing                    0.054137
A1 (argumentative)+News/Reporting+News Report/Blog                  0.038560
A12 (promotion)+Promotion+Description with Intent to Sell           0.034985
A16 (information)+Information/Explanation+Description of a Thing    0.034985
A1 (argumentative)+Opinion/Argumentation+Opinion Blog               0.031154
A8 (news)+News/Reporting+Sports Report                              0.029367
A11 (personal)+Opinion/Argumentation+Personal Blog                  0.027068
A7 (instruction)+Instruction+How-to                                 0.026813
A12 (promotion)+Information/Explanation+Description of a Thing      0.016599
Name: merged_labels-CORE_sub, dtype: float64

In [116]:
# Analyze distribution of merged labels
merged_df["merged_labels-CORE_main"].value_counts(normalize="True")[:10]

A8 (news)+News/Reporting+Narrative                                                  0.110827
A16 (information)+Information/Explanation+Informational Description/Explanation     0.068182
A12 (promotion)+Promotion+Informational Description/Explanation                     0.064096
A1 (argumentative)+News/Reporting+Narrative                                         0.036772
A1 (argumentative)+Opinion/Argumentation+Opinion                                    0.034219
A7 (instruction)+Instruction+How-To/Instructional                                   0.031154
A12 (promotion)+Promotion+Informational Persuasion                                  0.028856
A11 (personal)+Opinion/Argumentation+Narrative                                      0.028090
A1 (argumentative)+Information/Explanation+Informational Description/Explanation    0.022472
A14 (academic)+Information/Explanation+Informational Description/Explanation        0.021961
Name: merged_labels-CORE_main, dtype: float64

In [110]:
# Show all columns
pd.set_option('display.max_columns', None)

In [111]:
merged_df.describe(include="all")

Unnamed: 0,index,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE,FTD + GINCO_downcast count,FTD + MT-GINCO_downcast count,FTD + CORE_main count,FTD + CORE_sub count,GINCO_downcast + MT-GINCO_downcast count,GINCO_downcast + CORE_main count,GINCO_downcast + CORE_sub count,MT-GINCO_downcast + CORE_main count,MT-GINCO_downcast + CORE_sub count,CORE_main + CORE_sub count,FTD + GINCO_downcast,FTD + MT-GINCO_downcast,FTD + CORE_main,FTD + CORE_sub,GINCO_downcast + MT-GINCO_downcast,GINCO_downcast + CORE_main,GINCO_downcast + CORE_sub,MT-GINCO_downcast + CORE_main,MT-GINCO_downcast + CORE_sub,CORE_main + CORE_sub
count,4918.0,4918,4918,4918,3916,3916,4918,4918,4918,3916.0,3916.0,4918.0,4918.0,2914.0,3916.0,3916.0,3916.0,3916.0,4918.0,3916,3916,4918,4918,2914,3916,3916,3916,3916,4918
unique,,4,4918,10,9,9,20,43,10,,,,,,,,,,,73,76,94,197,57,89,197,87,196,157
top,,CORE,BMW's and Chinese Justice * * * * * In most pl...,A12 (promotion),News/Reporting,News/Reporting,Informational Description/Explanation,Description of a Thing,News,,,,,,,,,,,A8 (news)+News/Reporting,A8 (news)+News/Reporting,A12 (promotion)+Informational Description/Expl...,A12 (promotion)+Description of a Thing,News/Reporting+News/Reporting,News/Reporting+Narrative,News/Reporting+News Report/Blog,News/Reporting+Narrative,News/Reporting+News Report/Blog,Informational Description/Explanation+Descript...
freq,,1500,1,1011,933,951,2094,1167,1079,,,,,,,,,,,506,564,665,541,656,640,552,682,576,1066
mean,7958.074014,,,,,,,,,211.196629,233.837079,304.245222,191.596584,344.944406,291.841675,162.086823,295.57712,168.764556,412.094347,,,,,,,,,,
std,13529.153307,,,,,,,,,166.74071,188.106191,234.419321,184.788976,234.070114,241.34874,179.200385,236.636062,187.58176,409.323229,,,,,,,,,,
min,0.0,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,
25%,417.0,,,,,,,,,58.0,56.0,106.0,37.0,95.0,75.0,28.0,99.0,29.0,101.0,,,,,,,,,,
50%,857.5,,,,,,,,,192.0,227.0,224.0,129.0,433.0,224.0,98.0,256.0,99.0,170.0,,,,,,,,,,
75%,8602.25,,,,,,,,,304.0,425.0,629.0,253.0,479.0,608.0,255.0,523.0,241.0,732.0,,,,,,,,,,


In [None]:
#Let's analyze which pairs occur together most often

In [107]:
# For each of the pairs, analyse how often they occur together
for i in combinations:
	print(f"Value counts for: {i[0]} + {i[1]}")
	print(merged_df[f"{i[0]} + {i[1]}"].value_counts(normalize=True).sort_values()to_markdown())

Value counts for: FTD + GINCO_downcast
|                                               |   FTD + GINCO_downcast |
|:----------------------------------------------|-----------------------:|
| A8 (news)+News/Reporting                      |            0.129213    |
| A12 (promotion)+Promotion                     |            0.104188    |
| A16 (information)+Information/Explanation     |            0.0776302   |
| A1 (argumentative)+Opinion/Argumentation      |            0.072523    |
| A1 (argumentative)+News/Reporting             |            0.0574566   |
| A7 (instruction)+Instruction                  |            0.0528601   |
| A11 (personal)+Opinion/Argumentation          |            0.0490296   |
| A1 (argumentative)+Information/Explanation    |            0.0337079   |
| A17 (review)+Opinion/Argumentation            |            0.0247702   |
| A12 (promotion)+Information/Explanation       |            0.023238    |
| A14 (academic)+Information/Explanation        |            

## Finding the best mapping using the association rules

In [120]:
# Open the dataset
merged_df = pd.read_csv("data-sheets-with-all-info/Merged-datasets-with-all-predictions.csv", sep="\t", index_col = 0)
merged_df.head(1)

Unnamed: 0,index,original_label,text,FTD,GINCO_downcast,MT-GINCO_downcast,CORE_main,CORE_sub,X-GENRE
0,0,FTD,BMW's and Chinese Justice * * * * * In most pl...,A1 (argumentative),News/Reporting,News/Reporting,Narrative,News Report/Blog,discarded


In [117]:
# installing the apyori package
!pip install apyori

Collecting apyori
  Downloading apyori-1.1.2.tar.gz (8.6 kB)
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25ldone
[?25h  Created wheel for apyori: filename=apyori-1.1.2-py3-none-any.whl size=5974 sha256=764b48ef36ba362b6204dc32115166642a066ec42c7f40ef2396123b9aac5f9c
  Stored in directory: /home/tajak/.cache/pip/wheels/32/2a/54/10c595515f385f3726642b10c60bf788029e8f3a1323e3913a
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [None]:
FTD	GINCO_downcast	CORE_main	CORE_sub

In [122]:
merged_df.columns

Index(['index', 'original_label', 'text', 'FTD', 'GINCO_downcast',
       'MT-GINCO_downcast', 'CORE_main', 'CORE_sub', 'X-GENRE'],
      dtype='object')

In [123]:
# Use only relevant columns
merged_df = merged_df[['FTD', 'GINCO_downcast', 'CORE_main', 'CORE_sub']]
merged_df.head(1)

Unnamed: 0,FTD,GINCO_downcast,CORE_main,CORE_sub
0,A1 (argumentative),News/Reporting,Narrative,News Report/Blog


In [124]:
merged_df.shape

(4918, 4)

In [126]:
# Let's created the required ouput - a list of all values per instance
apriori_list = []

for i in range(0,4917):
	apriori_list.append([str(merged_df.values[i,j]) for j in range(0,4)])

apriori_list[:10]

[['A1 (argumentative)', 'News/Reporting', 'Narrative', 'News Report/Blog'],
 ['A1 (argumentative)', 'Information/Explanation', 'Opinion', 'Opinion Blog'],
 ['A1 (argumentative)', 'Opinion/Argumentation', 'Opinion', 'Opinion Blog'],
 ['A1 (argumentative)',
  'Opinion/Argumentation',
  'Narrative',
  'Historical Article'],
 ['A1 (argumentative)', 'News/Reporting', 'Narrative', 'News Report/Blog'],
 ['A1 (argumentative)',
  'List of Summaries/Excerpts',
  'Narrative',
  'News Report/Blog'],
 ['A1 (argumentative)', 'News/Reporting', 'Narrative', 'News Report/Blog'],
 ['A1 (argumentative)',
  'Opinion/Argumentation',
  'Informational Description/Explanation',
  'Description of a Thing'],
 ['A1 (argumentative)', 'News/Reporting', 'Narrative', 'News Report/Blog'],
 ['A1 (argumentative)', 'News/Reporting', 'Narrative', 'News Report/Blog']]

In [155]:
apriori_list = [list(x) for x in list(zip(list(merged_df["CORE_sub"]), list(merged_df["GINCO_downcast"])))]
apriori_list[:2]

[['News Report/Blog', 'News/Reporting'],
 ['Opinion Blog', 'Information/Explanation']]

In [156]:
# Training the model
from apyori import apriori

results = list(apriori(apriori_list,
             min_support=0.01,
             min_confidence=0.80,
             min_lift=1.0,
             max_length=None))

TypeError: '<' not supported between instances of 'float' and 'str'

In [154]:
results

[RelationRecord(items=frozenset({'Narrative'}), support=1.0, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Narrative'}), confidence=1.0, lift=1.0)]),
 RelationRecord(items=frozenset({'News Report/Blog'}), support=1.0, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'News Report/Blog'}), confidence=1.0, lift=1.0)]),
 RelationRecord(items=frozenset({'News/Reporting'}), support=1.0, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'News/Reporting'}), confidence=1.0, lift=1.0)]),
 RelationRecord(items=frozenset({'News Report/Blog', 'Narrative'}), support=1.0, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'News Report/Blog', 'Narrative'}), confidence=1.0, lift=1.0), OrderedStatistic(items_base=frozenset({'Narrative'}), items_add=frozenset({'News Report/Blog'}), confidence=1.0, lift=1.0), OrderedStatistic(items_base=frozenset({'News Report/Blog'}), items

In [129]:
#Transform the results into pandas dataframe
output = list(rule) # returns a non-tabular output

# putting output into a pandas dataframe
def inspect(output):
    lhs         = [tuple(result[2][0][0])[0] for result in output]
    rhs         = [tuple(result[2][0][1])[0] for result in output]
    support    = [result[1] for result in output]
    confidence = [result[2][0][2] for result in output]
    lift       = [result[2][0][3] for result in output]
    return list(zip(lhs, rhs, support, confidence, lift))
output_DataFrame = pd.DataFrame(inspect(output), columns = ['Left_Hand_Side', 'Right_Hand_Side', 'Support', 'Confidence', 'Lift'])

In [130]:
output_DataFrame

Unnamed: 0,Left_Hand_Side,Right_Hand_Side,Support,Confidence,Lift


In [None]:
# Display results based on lift
output_DataFrame.nlargest(n = 10, columns = 'Lift')