In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
files = os.listdir(os.curdir)  #files and directories

In [3]:
files

['article788173482.task3.labels',
 'article701225819.txt',
 'article758477392.task3.labels',
 'article698092698.task2.labels',
 'article757713354.task3.labels',
 'article754131438.txt',
 'article787955075.task3.labels',
 'article7618745059.txt',
 'article728680557.task3.labels',
 'article782086447.txt',
 'article795689901.task2.labels',
 'article710376094.txt',
 'article754179642.txt',
 'article757243988.txt',
 'article765385479.task2.labels',
 'article761969038.task3.labels',
 'article729303442.txt',
 'article111111132.task2.labels',
 'article789370909.task2.labels',
 'article761564397.task2.labels',
 'article731063195.task3.labels',
 'article761969038.txt',
 'article769962236.task3.labels',
 'article787529309.task2.labels',
 'article999001188.txt',
 'article729581752.txt',
 'article761564397.txt',
 'article775448623.task3.labels',
 'article735815173.task2.labels',
 'article756114837.txt',
 'article696735702.task2.labels',
 'article729651527.task3.labels',
 'article736231219.txt',
 'a

In [4]:
txt_ids = [file.replace('.txt','') for file in files if '.txt' in file]

In [5]:
len(txt_ids)

293

In [6]:
#function to read in the article txt file, propaganda sentence-level labels, multi class propaganda labels

def read_in_txt(article_id):
    with open(f"{article_id}.txt", 'r') as f2:
        txt = f2.read()
    with open(f"{article_id}.task2.labels", 'r') as f2:
        label_2 = f2.read()
    with open(f"{article_id}.task3.labels", 'r') as f2:
        label_3 = f2.read()
        
    return txt, label_2, label_3


In [7]:
#function to create list of char_index for the beginning of each sentence in the txt file.
#we need this in order to later be able to match on fine-grain propaganda classes onto range of char_indices

def list_of_sent_starts(txt):
    list_of_sent_starts = [0]
    for i in range(len(txt)):
        if txt[i] == '\n':
            list_of_sent_starts.append(i)
    else:
        pass
    return list_of_sent_starts

In [8]:
#function to create list of char_index for the end of each sentence in the txt file.

def list_of_sent_ends(txt):
    list_of_sent_ends = []
    for i in range(len(txt)):
        if txt[i] == '\n':
            list_of_sent_ends.append(i)
    else:
        pass

    list_of_sent_ends.append(list_of_sent_ends[-1])
    return list_of_sent_ends

In [9]:
#function to split sentence label into list, where each element includes the article_id, the sentence#,
#and the label (propaganda or non-propaganda) as a string

def parsing_label2(label_2):
    sents_labeled_2 =  label_2.split('\n')
    return sents_labeled_2


In [10]:
#parse and create dataframe for multi-class fine-grained propaganda labels (label_3)

def parsing_label3(label_3):
    label_3 = label_3.split('\n')[0:-1]
    label_3 = [row.split('\t') for row in label_3]
    
    return label_3
    
def create_label_3_df(label_3):    

    label_3_df = pd.DataFrame()
    label_3_df['article_id']= [row[0] for row in label_3]
    label_3_df['propaganda-type']= [row[1] for row in label_3]
    label_3_df['start_char']= [row[2] for row in label_3]
    label_3_df['end_char']= [row[3] for row in label_3]
    label_3_df['start_char'] = label_3_df.start_char.astype(int)
    label_3_df['end_char'] = label_3_df.end_char.astype(int)
    
    return label_3_df

In [11]:
def article_id(sents_labeled_2):
    return [sent.split('\t')[0] for sent in sents_labeled_2[0:-1]]

def sentence_start_char(list_of_sent_starts):
    return [num for num in list_of_sent_starts[0:-1]]

def sentence_end_char(list_of_sent_ends):
    return [num for num in list_of_sent_ends[0:-1]]

def propaganda_label(sents_labeled_2):
    return [sent.split('\t')[2] for sent in sents_labeled_2[0:-1]]



In [12]:
#function that takes in article txt and parsed label2s and creates a dataframe where each sentence of 
# the article is a different observation

def create_article_tbl(sents_labeled_2 , list_of_sent_starts, list_of_sent_ends):
    
    article_df = pd.DataFrame()

    article_df['article_id'] = article_id(sents_labeled_2)
    article_df['sentence_char_start'] = sentence_start_char(list_of_sent_starts)
    article_df['sentence_char_end'] = sentence_end_char(list_of_sent_ends)
    article_df['propaganda'] = propaganda_label(sents_labeled_2)
    article_df['propaganda_type'] = pd.Series()
    article_df['text'] = pd.Series()
    
    
    return article_df

In [13]:
#update propaganda multi_class type in dataframe

def update_propaganda_type(article_df, label_3_df):
    for i in range(len(article_df)):
        for j in range(len(label_3_df)):
            if article_df['sentence_char_start'][i] <= label_3_df['start_char'][j]:
                if label_3_df['end_char'][j]<= article_df['sentence_char_end'][i]:
                    article_df['propaganda_type'][i] = label_3_df['propaganda-type'][j]
    return article_df


In [14]:
#update text col with sentences

def update_txt_col(article_df, txt):
    for i in range(len(article_df)):
        article_df['text'][i] = txt[article_df['sentence_char_start'][i]:article_df['sentence_char_end'][i]]
    return article_df

In [15]:
# drop non-sentence rows and reset index
def drop_non_sent_rows(article_df):
    #drop rows
    article_df = article_df[article_df['sentence_char_end']-article_df['sentence_char_start'] > 1]
    #reset index
    article_df = article_df.reset_index(drop=True)
    return article_df


In [16]:
#create sentence number col

def sent_num_col(article_df):
    article_df['sent_#'] = article_df.index + 1
    return article_df

In [17]:
#apply above functions to turn 3 txt files identified by their article ids into one dataframe

def txt_to_df(article_id):

    txt, label_2, label_3 = read_in_txt(article_id)
    sent_starts = list_of_sent_starts(txt)
    sent_ends = list_of_sent_ends(txt)
    sents_labeled_2 = parsing_label2(label_2)
    sents_labeled_3 = parsing_label3(label_3)
    label_3_df = create_label_3_df(sents_labeled_3)
    
    #create and update table
    article_df = create_article_tbl(sents_labeled_2 , sent_starts, sent_ends)
    article_df = update_propaganda_type(article_df, label_3_df)
    article_df = update_txt_col(article_df, txt)
    article_df = drop_non_sent_rows(article_df)
    article_df = sent_num_col(article_df)
    
    return article_df

In [18]:
test_table_final = txt_to_df('article788173482')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [19]:
test_table_final

Unnamed: 0,article_id,sentence_char_start,sentence_char_end,propaganda,propaganda_type,text,sent_#
0,788173482,0,32,propaganda,"Name_Calling,Labeling",Exposing the Deep State Plotters,1
1,788173482,33,326,non-propaganda,,\nPresident Trump’s sweeping order this week d...,2
2,788173482,326,596,propaganda,Loaded_Language,"\nIn an interview with Hill.TV yesterday, the ...",3
3,788173482,596,666,propaganda,Flag-Waving,\n“What we’ve done is a great service to the c...,4
4,788173482,666,845,propaganda,"Exaggeration,Minimisation","\n“I hope to be able to call this, along with ...",5
5,788173482,845,1013,non-propaganda,,\nTrump criticized how the FBI handled the Rus...,6
6,788173482,1013,1263,propaganda,"Exaggeration,Minimisation",\n“They know this is one of the great scandals...,7
7,788173482,1263,1369,propaganda,Loaded_Language,\nThey used Carter Page as a foil in order to ...,8
8,788173482,1369,1417,propaganda,"Exaggeration,Minimisation","\n“It’s a hoax, beyond a witch hunt,” Trump said.",9
9,788173482,1417,1561,propaganda,"Name_Calling,Labeling",\nThe documents affected involve a FISA warran...,10


In [38]:
test_table_final.text[20]

'\nSpecifically, the statement that Obama wanted “to know everything we’re doing” came in a private Sept. 2, 2016, text message from FBI lawyer Lisa Page to FBI agent Peter Strzok, with whom she was having an extramarital affair at the time.'

In [39]:
# create dataframes for each article id and concatenate into one mega-df
list_of_dfs = []
for id_ in txt_ids:
    df = txt_to_df(id_)
    list_of_dfs.append(df)

mega_df = pd.concat(list_of_dfs)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [41]:
mega_df

Unnamed: 0,article_id,sentence_char_start,sentence_char_end,propaganda,propaganda_type,text,sent_#
0,701225819,0,59,non-propaganda,,South Florida Muslim Leader Sofian Zakkout’s D...,1
1,701225819,60,207,propaganda,"Name_Calling,Labeling","\nDavid Duke, the white supremacist icon and f...",2
2,701225819,207,382,propaganda,Loaded_Language,"\nHowever, one individual who represents the M...",3
3,701225819,382,525,non-propaganda,,"\nLast month, once again, Zakkout chose to sho...",4
4,701225819,525,595,non-propaganda,,\nThe postings can be rivaled only by Zakkout’...,5
5,701225819,595,698,non-propaganda,,\nSofian Abdelaziz Zakkout is the President of...,6
6,701225819,698,785,non-propaganda,,\nHe has ties to two dozen mosques stretching ...,7
7,701225819,785,868,non-propaganda,,"\nOn September 21, 2017, Zakkout took to Faceb...",8
8,701225819,868,1178,propaganda,"Name_Calling,Labeling",\nThree of the videos were put out by Duke’s o...,9
9,701225819,1178,1300,propaganda,Appeal_to_fear-prejudice,"\nAt this same rally, a white nationalist plow...",10


In [42]:
mega_df_final = mega_df.reset_index(drop=True)
mega_df_final

Unnamed: 0,article_id,sentence_char_start,sentence_char_end,propaganda,propaganda_type,text,sent_#
0,701225819,0,59,non-propaganda,,South Florida Muslim Leader Sofian Zakkout’s D...,1
1,701225819,60,207,propaganda,"Name_Calling,Labeling","\nDavid Duke, the white supremacist icon and f...",2
2,701225819,207,382,propaganda,Loaded_Language,"\nHowever, one individual who represents the M...",3
3,701225819,382,525,non-propaganda,,"\nLast month, once again, Zakkout chose to sho...",4
4,701225819,525,595,non-propaganda,,\nThe postings can be rivaled only by Zakkout’...,5
5,701225819,595,698,non-propaganda,,\nSofian Abdelaziz Zakkout is the President of...,6
6,701225819,698,785,non-propaganda,,\nHe has ties to two dozen mosques stretching ...,7
7,701225819,785,868,non-propaganda,,"\nOn September 21, 2017, Zakkout took to Faceb...",8
8,701225819,868,1178,propaganda,"Name_Calling,Labeling",\nThree of the videos were put out by Duke’s o...,9
9,701225819,1178,1300,propaganda,Appeal_to_fear-prejudice,"\nAt this same rally, a white nationalist plow...",10


In [43]:
mega_df_final['text'] = [ text.replace('\n','')for text in mega_df_final['text'] ]

In [44]:
mega_df_final

Unnamed: 0,article_id,sentence_char_start,sentence_char_end,propaganda,propaganda_type,text,sent_#
0,701225819,0,59,non-propaganda,,South Florida Muslim Leader Sofian Zakkout’s D...,1
1,701225819,60,207,propaganda,"Name_Calling,Labeling","David Duke, the white supremacist icon and for...",2
2,701225819,207,382,propaganda,Loaded_Language,"However, one individual who represents the Mus...",3
3,701225819,382,525,non-propaganda,,"Last month, once again, Zakkout chose to showc...",4
4,701225819,525,595,non-propaganda,,The postings can be rivaled only by Zakkout’s ...,5
5,701225819,595,698,non-propaganda,,Sofian Abdelaziz Zakkout is the President of t...,6
6,701225819,698,785,non-propaganda,,He has ties to two dozen mosques stretching fr...,7
7,701225819,785,868,non-propaganda,,"On September 21, 2017, Zakkout took to Faceboo...",8
8,701225819,868,1178,propaganda,"Name_Calling,Labeling",Three of the videos were put out by Duke’s off...,9
9,701225819,1178,1300,propaganda,Appeal_to_fear-prejudice,"At this same rally, a white nationalist plowed...",10


In [50]:
mega_df_final.to_csv('/sentence_classifications.csv')