In [11]:
import pandas as pd
import re
import numpy as np

In [12]:
file_path = '02_comments_with_immediate_conversations-annotated.csv'

data = pd.read_csv(file_path, delimiter=';')

print(data.head())

                  c_id                                             c_text  \
0  1391717608802631681  @100dusty @_Celopelo Wer so ein Profilbild wie...   
1  1387437960757452803  @isabellavonpar1 @Genossin_Anetta @Karl_Lauter...   
2  1389188826799673345  @_m_o_ni @2GRIMREAPER3 @schripinsky @nurcan_nu...   
3  1385241285645291521  @DennisKBerlin Aber Bild hat doch gerade deswe...   
4  1406592688921665538  @betxl5 @Xrxtn @Alice_Weidel @UweJunge Zuviele...   

               date       conv_id  \
0  10.05.2021 11:31           NaN   
1  28.04.2021 16:06  1.387325e+18   
2  03.05.2021 12:03           NaN   
3  22.04.2021 14:37           NaN   
4  20.06.2021 12:40  1.406508e+18   

                                           reaction1  \
0                                                NaN   
1  @isabellavonpar1 @Genossin_Anetta @Karl_Lauter...   
2                                                NaN   
3                                                NaN   
4  @Xrxtn @Ostwestfale60 @Alice_We

In [13]:
def clean_text(text):
    if pd.isna(text):
        return text
        
    text = text.replace('&amp;', '&')
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('\n', ' ')
    
    split_text = text.split(' ')
    split_text = [word for word in split_text if not word.startswith('http') and not word.startswith('@')]
    
    text = ' '.join(split_text)
    
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    return text

In [14]:
data['Generalisation0'] = np.full(len(data), 0)
data['Generalisation1'] = np.full(len(data), 0)
data['Generalisation2'] = np.full(len(data), 0)
data['Generalisation3'] = np.full(len(data), 0)

In [15]:
for index, row in data.iterrows():
    data.at[index, 'c_text'] = clean_text(row['c_text'])
    data.at[index, 'reaction1'] = clean_text(row['reaction1'])
    data.at[index, 'reaction2'] = clean_text(row['reaction2'])
    
    gerneralisation_value = row['Generalisation']
    match gerneralisation_value:
        case 0: data.at[index, 'Generalisation0'] = 1
        case 1: data.at[index, 'Generalisation1'] = 1
        case 2: data.at[index, 'Generalisation2'] = 1
        case 3: data.at[index, 'Generalisation3'] = 1

data = data.drop('Generalisation', axis=1)

In [16]:
data.head()

Unnamed: 0,c_id,c_text,date,conv_id,reaction1,reaction2,Ambiguous,Objective,Subjective,Disputed,Generalisation0,Generalisation1,Generalisation2,Generalisation3
0,1391717608802631681,Wer so ein Profilbild wie sie hochlädt kann nu...,10.05.2021 11:31,,,,0.0,0.0,1.0,0.0,0,0,0,1
1,1387437960757452803,-1 Ja schau schön artig auf die Bunden Bildche...,28.04.2021 16:06,1.387325e+18,-2 Die britische ist ja so schrecklich die ist...,"Sie meinen, wie eine lesbische Frau oder ein P...",1.0,0.0,1.0,1.0,0,0,1,0
2,1389188826799673345,Willst du jetzt etwa behaupten das Querdenker ...,03.05.2021 12:03,,,,1.0,0.0,1.0,0.0,0,0,0,1
3,1385241285645291521,Aber Bild hat doch gerade deswegen und diesbez...,22.04.2021 14:37,,,,0.0,1.0,1.0,0.0,0,1,0,0
4,1406592688921665538,"Zuviele Fremdwörter, manches davon nicht mal i...",20.06.2021 12:40,1.406508e+18,Fühlst du dich von Menschen mit Gehirnen diskr...,Diskriminierung gegen Heteros?! Hab ich das ri...,0.0,0.0,1.0,0.0,1,0,0,0


In [17]:
def is_annotated(individual_row):
    if individual_row['c_text'] != '' and not pd.isna(individual_row['Generalisation']) and not pd.isna(individual_row['Ambiguous']) and not pd.isna(individual_row['Objective']) and not pd.isna(individual_row['Subjective']) and not pd.isna(individual_row['Disputed']):
        return True
    else:
        return False

In [18]:
dataWithReactions = []
for index, row in data.iterrows(): 
    if is_annotated(row) and (not pd.isna(row['reaction1']) or not pd.isna(row['reaction2'])):
        dataWithReactions.append(row)
        
dataWithReactions = pd.DataFrame(dataWithReactions, columns=data.columns)       
print(str(len(dataWithReactions)) + ' comments with an reaction')

KeyError: 'Generalisation'

In [None]:
dataAll = []
dataWithoutReactions = []
for index, row in data.iterrows(): 
    if is_annotated(row) and (pd.isna(row['reaction1']) and pd.isna(row['reaction2'])):
        dataWithoutReactions.append(row)
        
dataWithoutReactions = pd.DataFrame(dataWithoutReactions, columns=data.columns)               
print(str(len(dataWithoutReactions)) + ' comments without an reaction')

dataAll = pd.concat([dataWithoutReactions, dataWithReactions])
print(str(len(dataAll)) + ' comments without an reaction and with reaction combined')

dataWithoutReactions = dataAll.copy()
dataWithoutReactions = dataWithoutReactions.drop('reaction1', axis=1)
dataWithoutReactions = dataWithoutReactions.drop('reaction2', axis=1)
dataWithoutReactions = dataWithoutReactions.drop('Disputed', axis=1)

print(str(len(dataWithoutReactions)) + ' comments without an reaction and with reaction (but removed) combined')

In [None]:
countTotal = 0

for index, row in data.iterrows(): 
    if is_annotated(row):
        countTotal += 1

print(str(countTotal) + ' comments rated in total')

In [None]:
data.to_csv('03_comments_with_immediate_conversations-annotated-clean.csv', sep=';', index=False, header=True, encoding='utf-8-sig') 
dataAll.to_csv('04_comments_annotated-values.csv', sep=';', index=False, header=True, encoding='utf-8-sig') 
dataWithReactions.to_csv('04_comments_annotated-reactions.csv', sep=';', index=False, header=True, encoding='utf-8-sig') 
dataWithoutReactions.to_csv('04_comments_annotated-no_reactions.csv', sep=';', index=False, header=True, encoding='utf-8-sig') 