# Preprocessing datasets

Preprocess the datasets and split them into train/test sets.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

#### Dataset file paths

In [2]:
# File locations for the IBM claim-stance data: the CSV that is read, followed
# by the train and test CSVs that are written at the end of the notebook.
ibm_data, ibm_train, ibm_test = (
    '../data/ibm/claim_stance_dataset_v1.csv',
    '../data/ibm/ibm_train.csv',
    '../data/ibm/ibm_test.csv',
)

## 1. IBM dataset

### Explore dataset

In [3]:
# Load the IBM claim-stance CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=ibm_data)

In [4]:
# Preview the first five rows (five is also head()'s default).
df.head(n=5)

Unnamed: 0,topicId,split,topicText,topicTarget,topicSentiment,claims.claimId,claims.stance,claims.claimCorrectedText,claims.claimOriginalText,claims.article.rawFile,...,claims.article.rawSpan.end,claims.article.cleanFile,claims.article.cleanSpan.start,claims.article.cleanSpan.end,claims.Compatible,claims.claimTarget.text,claims.claimTarget.span.start,claims.claimTarget.span.end,claims.claimSentiment,claims.targetsRelation
0,1,test,This house believes that the sale of violent v...,the sale of violent video games to minors,-1,2973,PRO,Exposure to violent video games causes at leas...,exposure to violent video games causes at leas...,articles/t1/raw_1.txt,...,640,articles/t1/clean_1.txt,418,568,yes,Exposure to violent video games,0.0,31.0,-1.0,1.0
1,1,test,This house believes that the sale of violent v...,the sale of violent video games to minors,-1,2974,CON,video game violence is not related to serious ...,video game violence is not related to serious ...,articles/t1/raw_1.txt,...,1697,articles/t1/clean_1.txt,829,907,yes,video game violence,0.0,19.0,1.0,1.0
2,1,test,This house believes that the sale of violent v...,the sale of violent video games to minors,-1,2975,CON,some violent video games may actually have a p...,some violent video games may actually have a p...,articles/t1/raw_1.txt,...,2089,articles/t1/clean_1.txt,1004,1082,yes,some violent video games,0.0,24.0,1.0,1.0
3,1,test,This house believes that the sale of violent v...,the sale of violent video games to minors,-1,2977,PRO,exposure to violent video games causes both sh...,exposure to violent video games causes both sh...,articles/t1/raw_1.txt,...,3695,articles/t1/clean_1.txt,1442,1577,yes,exposure to violent video games,0.0,31.0,-1.0,1.0
4,1,test,This house believes that the sale of violent v...,the sale of violent video games to minors,-1,2978,PRO,Violent video games increase the violent tende...,they increase the violent tendencies among youth,articles/t1/raw_1.txt,...,8167,articles/t1/clean_1.txt,3900,3948,yes,Violent video games,0.0,19.0,-1.0,1.0


In [5]:
# Display the column labels of the loaded frame.
df.columns

Index(['topicId', 'split', 'topicText', 'topicTarget', 'topicSentiment',
       'claims.claimId', 'claims.stance', 'claims.claimCorrectedText',
       'claims.claimOriginalText', 'claims.article.rawFile',
       'claims.article.rawSpan.start', 'claims.article.rawSpan.end',
       'claims.article.cleanFile', 'claims.article.cleanSpan.start',
       'claims.article.cleanSpan.end', 'claims.Compatible',
       'claims.claimTarget.text', 'claims.claimTarget.span.start',
       'claims.claimTarget.span.end', 'claims.claimSentiment',
       'claims.targetsRelation'],
      dtype='object')

In [6]:
# Print the number of distinct values in each topic-level column, one per line.
for topic_column in ('topicId', 'topicText', 'topicTarget'):
    print(df[topic_column].nunique())

55
55
55


In [7]:
# Print the number of distinct values for the claim id and for the two
# variants of the claim text, one per line.
for claim_column in ('claims.claimId',
                     'claims.claimOriginalText',
                     'claims.claimCorrectedText'):
    print(df[claim_column].nunique())

2394
2379
2385


In [8]:
# Show the rows whose corrected claim text differs from the original text,
# keeping only those two columns so they can be compared side by side.
text_columns = ['claims.claimCorrectedText', 'claims.claimOriginalText']
text_differs = df['claims.claimCorrectedText'].ne(df['claims.claimOriginalText'])
df.loc[text_differs, text_columns]

Unnamed: 0,claims.claimCorrectedText,claims.claimOriginalText
0,Exposure to violent video games causes at leas...,exposure to violent video games causes at leas...
4,Violent video games increase the violent tende...,they increase the violent tendencies among youth
5,No conclusive link was found between video gam...,have shown no conclusive link between video ga...
10,No long-term relationship was found between pl...,no long-term relationship between playing viol...
12,No evidence has found that violent games are p...,no evidence violent games are psychologically ...
...,...,...
2372,Democracy is contrary to human nature,democracy was contrary to human nature
2378,The democratic system provides a way to replac...,The democratic system also provides a way to r...
2381,Democratic nations have less genocide and poli...,they have less genocide and politicide
2388,Democracy is reckless and arbitrary,democracy was reckless and arbitrary


### Split and save IBM dataset

In [9]:
# Narrow the frame to four columns: topic target, stance label,
# corrected claim text and claim sentiment.
selected_columns = [
    'topicTarget',
    'claims.stance',
    'claims.claimCorrectedText',
    'claims.claimSentiment',
]
df1 = df[selected_columns]

In [10]:
# Preview the first five rows of the narrowed frame (five is head()'s default).
df1.head(n=5)

Unnamed: 0,topicTarget,claims.stance,claims.claimCorrectedText,claims.claimSentiment
0,the sale of violent video games to minors,PRO,Exposure to violent video games causes at leas...,-1.0
1,the sale of violent video games to minors,CON,video game violence is not related to serious ...,1.0
2,the sale of violent video games to minors,CON,some violent video games may actually have a p...,1.0
3,the sale of violent video games to minors,PRO,exposure to violent video games causes both sh...,-1.0
4,the sale of violent video games to minors,PRO,Violent video games increase the violent tende...,-1.0


In [32]:
# Count the non-null corrected claim texts per topic; this is the first step
# towards removing topics that have few samples.
tmp = (
    df1.groupby(by=['topicTarget'])[['claims.claimCorrectedText']]
    .count()
    .reset_index()
)

In [37]:
# Select the topics that have fewer than 40 claims
# ("tbr" presumably stands for "to be removed").
has_few_claims = tmp['claims.claimCorrectedText'] < 40
topic_tbr = tmp.loc[has_few_claims, 'topicTarget']

In [40]:
# Drop every row whose topic is listed in topic_tbr.
in_removed_topic = df1['topicTarget'].isin(topic_tbr)
df1 = df1[~in_removed_topic]

In [41]:
# Hold out 33% of the rows as the test set, stratifying jointly on topic and
# stance so both sets have approximately the same topic/stance proportions.
# random_state is fixed so that re-running the notebook reproduces the same
# split, and therefore the same saved train/test CSV files.
train, test = train_test_split(
    df1,
    test_size=0.33,
    random_state=42,
    stratify=df1[['topicTarget', 'claims.stance']],
)

In [42]:
# Print a summary of the training split: row count, columns, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1197 entries, 966 to 70
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   topicTarget                1197 non-null   object 
 1   claims.stance              1197 non-null   object 
 2   claims.claimCorrectedText  1197 non-null   object 
 3   claims.claimSentiment      1142 non-null   float64
dtypes: float64(1), object(3)
memory usage: 46.8+ KB


In [43]:
# Print a summary of the test split: row count, columns, non-null counts and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 591 entries, 1176 to 271
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   topicTarget                591 non-null    object 
 1   claims.stance              591 non-null    object 
 2   claims.claimCorrectedText  591 non-null    object 
 3   claims.claimSentiment      564 non-null    float64
dtypes: float64(1), object(3)
memory usage: 23.1+ KB


In [44]:
# Rename the IBM-specific column names to short, generic ones. The mapping is
# defined once and applied to both splits so the two cannot drift apart, and
# the result is reassigned instead of mutating the frames with inplace=True.
column_names = {
    'topicTarget': 'topic',
    'claims.stance': 'stance',
    'claims.claimCorrectedText': 'argument',
    'claims.claimSentiment': 'argument_sentiment',
}
train = train.rename(columns=column_names)
test = test.rename(columns=column_names)

In [45]:
# Write each split to its CSV path, omitting the DataFrame index column.
for split_frame, split_path in ((train, ibm_train), (test, ibm_test)):
    split_frame.to_csv(split_path, index=False)