In [109]:
import pandas as pd
import re

In [110]:
file_path = '02_comments_with_immediate_conversations-annotated.csv'

data = pd.read_csv(file_path, delimiter=';')

print(data.head())

                  c_id                                             c_text  \
0  1391717608802631681  @100dusty @_Celopelo Wer so ein Profilbild wie...   
1  1387437960757452803  @isabellavonpar1 @Genossin_Anetta @Karl_Lauter...   
2  1389188826799673345  @_m_o_ni @2GRIMREAPER3 @schripinsky @nurcan_nu...   
3  1385241285645291521  @DennisKBerlin Aber Bild hat doch gerade deswe...   
4  1406592688921665538  @betxl5 @Xrxtn @Alice_Weidel @UweJunge Zuviele...   

               date       conv_id  \
0  10.05.2021 11:31           NaN   
1  28.04.2021 16:06  1.387325e+18   
2  03.05.2021 12:03           NaN   
3  22.04.2021 14:37           NaN   
4  20.06.2021 12:40  1.406508e+18   

                                           reaction1  \
0                                                NaN   
1  @isabellavonpar1 @Genossin_Anetta @Karl_Lauter...   
2                                                NaN   
3                                                NaN   
4  @Xrxtn @Ostwestfale60 @Alice_We

In [111]:
def clean_text(text):
    if pd.isna(text):
        return text
        
    text = text.replace('&amp;', '&')
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('\n', ' ')
    
    split_text = text.split(' ')
    split_text = [word for word in split_text if not word.startswith('http') and not word.startswith('@')]
    
    text = ' '.join(split_text)
    
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    return text

In [112]:
for index, row in data.iterrows():
    data.at[index, 'c_text'] = clean_text(row['c_text'])
    data.at[index, 'reaction1'] = clean_text(row['reaction1'])
    data.at[index, 'reaction2'] = clean_text(row['reaction2'])
    
print(data.head())

                  c_id                                             c_text  \
0  1391717608802631681  Wer so ein Profilbild wie sie hochlädt kann nu...   
1  1387437960757452803  -1 Ja schau schön artig auf die Bunden Bildche...   
2  1389188826799673345  Willst du jetzt etwa behaupten das Querdenker ...   
3  1385241285645291521  Aber Bild hat doch gerade deswegen und diesbez...   
4  1406592688921665538  Zuviele Fremdwörter, manches davon nicht mal i...   

               date       conv_id  \
0  10.05.2021 11:31           NaN   
1  28.04.2021 16:06  1.387325e+18   
2  03.05.2021 12:03           NaN   
3  22.04.2021 14:37           NaN   
4  20.06.2021 12:40  1.406508e+18   

                                           reaction1  \
0                                                NaN   
1  -2 Die britische ist ja so schrecklich die ist...   
2                                                NaN   
3                                                NaN   
4  Fühlst du dich von Menschen mit

In [113]:
def is_annotated(individual_row):
    if individual_row['c_text'] != '' and not pd.isna(individual_row['Generalisation']) and not pd.isna(individual_row['Ambiguous']) and not pd.isna(individual_row['Objective']) and not pd.isna(individual_row['Subjective']) and not pd.isna(individual_row['Disputed']):
        return True
    else:
        return False

In [114]:
countWithReaction = 0
for index, row in data.iterrows(): 
    if is_annotated(row) and (not pd.isna(row['reaction1']) and not pd.isna(row['reaction2'])):
        countWithReaction += 1
        
print(str(countWithReaction) + ' comments with an reaction')

972 comments with an reaction


In [115]:
countWithoutReaction = 0
for index, row in data.iterrows(): 
    if is_annotated(row) and (pd.isna(row['reaction1']) and pd.isna(row['reaction2'])):
        countWithoutReaction += 1
        
print(str(countWithoutReaction) + ' comments without an reaction')

781 comments without an reaction


In [116]:
countTotal = 0

for index, row in data.iterrows(): 
    if is_annotated(row):
        countTotal += 1

print(str(countTotal) + ' comments rated in total')

1835 comments rated in total


In [117]:
data.to_csv('03_comments_with_immediate_conversations-annotated-clean.csv', sep=';', index=False, header=True, encoding='utf-8-sig') 