# Data Wrangling

## 1. Load the data and keep only the extractive summaries

In [None]:
def load_json(file_path):
    '''Read a JSON Lines file into a pandas DataFrame.

    Every non-blank line of the file is parsed as one JSON document and
    becomes one row of the returned frame. The number of parsed lines is
    printed as a progress message.

    Parameters
    ----------
    file_path : str or path-like
        Path of the UTF-8 encoded JSON Lines file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    '''
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() removes surrounding whitespace only. A chars argument such as
            # '\n|\r' is a set of characters, so it would also eat trailing '|'.
            line = line.strip()
            if not line:
                # blank lines (e.g. a trailing newline at EOF) are not JSON documents
                continue
            data.append(json.loads(line))
    print('Number of text lines processed', len(data))
    #convert to pandas dataframe format and return.
    return pd.DataFrame(data)

In [None]:
# Read the training split from its JSON Lines file into a DataFrame
# (load_json prints the number of lines it processed).
train_df = load_json('./Data/train-stats.jsonl')

# Read the test split from its JSON Lines file in the same way.
test_df=load_json('./Data/test-stats.jsonl')

# Show the first two training rows as the cell output.
train_df.head(2)

Number of text lines processed 995041
Number of text lines processed 108862


Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin
0,http://www.nytimes.com/2006/06/04/sports/socce...,http://web.archive.org/web/20060618204254id_/h...,Surge in Racist Mood Raises Concerns on Eve of...,20060618204254,"HAMBURG, Germany, June 3  As he left the socc...",A surge in discriminatory behavior toward blac...,137.470588,1.0,7.823529,high,high,mixed
1,http://www.nytimes.com/2005/12/24/politics/24s...,http://web.archive.org/web/20060620043011id_/h...,"Spy Agency Mined Vast Data Trove, Officials Re...",20060620043011,"WASHINGTON, Dec. 23 - The National Security Ag...","The volume of information harvested, without \...",33.636364,0.909091,4.727273,medium,medium,mixed


In [None]:
# Keep only the rows whose density_bin is 'extractive' in each split.
train_df = train_df.loc[train_df['density_bin'] == 'extractive']
test_df = test_df.loc[test_df['density_bin'] == 'extractive']

**Save both filtered datasets to pickle files so they can be reloaded after a session crash**

In [None]:
# Write each filtered frame to its own pickle file (train first, then test).
for pickle_path, frame in (('Data/train_df.pickle', train_df),
                           ('Data/test_df.pickle', test_df)):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(frame, handle)

## 1a. **Load train and test data after session reconnect**

In [None]:
# Restore the filtered training frame from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created yourself.
train_df = pd.read_pickle('Data/train_df.pickle')
# Show the first two rows to confirm the load.
train_df.head(2)

Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin
2,http://www.nytimes.com/2006/04/23/business/you...,http://web.archive.org/web/20060909062911id_/h...,Investors vs. Pfizer: Guess Who Has the Guns?,20060909062911,IF outsized executive pay has indeed become a ...,The battle between Pfizer Inc.'s investors and...,33.88,1.0,11.72,medium,high,extractive
3,http://www.nydailynews.com/archives/gossip/199...,http://web.archive.org/web/20080313232743id_/h...,REX FLEXED PECS FOR SKIN PICS,20080313232743,BY A.J. BENZA & MICHAEL LEWITTES\n\nIf Simon R...,"If Simon Rex looks a little familiar, it may n...",11.894118,0.988235,38.988235,low,high,extractive


In [None]:
# Restore the filtered test frame from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created yourself.
test_df = pd.read_pickle('Data/test_df.pickle')
# Preview the test frame that was just loaded.
test_df.head(2)

Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin
2,http://www.nytimes.com/2006/04/23/business/you...,http://web.archive.org/web/20060909062911id_/h...,Investors vs. Pfizer: Guess Who Has the Guns?,20060909062911,IF outsized executive pay has indeed become a ...,The battle between Pfizer Inc.'s investors and...,33.88,1.0,11.72,medium,high,extractive
3,http://www.nydailynews.com/archives/gossip/199...,http://web.archive.org/web/20080313232743id_/h...,REX FLEXED PECS FOR SKIN PICS,20080313232743,BY A.J. BENZA & MICHAEL LEWITTES\n\nIf Simon R...,"If Simon Rex looks a little familiar, it may n...",11.894118,0.988235,38.988235,low,high,extractive


In [None]:
# Report how many samples each split contains after filtering.
print('Total number of samples in train data', len(train_df))
print('Total number of samples in test data', len(test_df))

Total number of samples in tain data 332131
Total number of samples in test data 36165


## 2. Sentence tokenization and embedding using spaCy and Sentence-Transformers

In [None]:
embedder = SentenceTransformer('distilbert-base-nli-mean-tokens') # sent embedder
def clean_embed(text,
                nlp = spacy.load("en_core_web_lg"),
                embedder = embedder,
                min_len=2):
    '''Split ``text`` into sentences, drop the short ones and embed the rest.

    Parameters
    ----------
    text : str
        Raw text to segment (an article body or a summary).
    nlp : callable, optional
        spaCy pipeline used for sentence segmentation. The default is evaluated
        once, when this function is defined, so the model is loaded a single
        time rather than on every call.
    embedder : optional
        Object whose ``encode(list_of_str, convert_to_numpy=True)`` returns one
        vector per sentence. Defaults to the module-level ``embedder``.
    min_len : int, optional
        A sentence is kept only when it has MORE than ``min_len`` spaCy tokens
        (``len`` of a sentence span counts tokens, not characters).

    Returns
    -------
    sents_clean : list of str
        Text of every sentence that was kept, in document order.
    sents_embedding : numpy.ndarray
        Embeddings returned by ``embedder.encode`` for ``sents_clean``.
    '''
    doc = nlp(text)

    # Keep sentences above the token threshold whose text is not empty.
    sents_clean = [span.text for span in doc.sents
                   if len(span) > min_len and len(span.text) != 0]

    # Ask the encoder for NumPy output directly. Wrapping a tensor in np.array()
    # raises when the model runs on a CUDA device, because the tensor has to be
    # moved to the CPU first; convert_to_numpy=True does that move internally.
    sents_embedding = np.asarray(embedder.encode(sents_clean, convert_to_numpy=True))

    return sents_clean, sents_embedding

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# Work on a small slice of each split so the embedding step finishes quickly.
train_df = train_df.head(5000).reset_index(drop=True)
test_df = test_df.head(500).reset_index(drop=True)

# Article TEXT uses clean_embed's default min_len; SUMMARY keeps every sentence (min_len = 0).
# Columns are filled in the order: text (train, test), then summary (train, test).
embed_jobs = (
    ('text', 'text_clean', 'text_embedding', clean_embed),
    ('summary', 'summary_clean', 'summary_embedding',
     lambda summ: clean_embed(summ, min_len = 0)),
)
for source_col, clean_col, embedding_col, embed_fn in embed_jobs:
    for frame in (train_df, test_df):
        # every element is a (clean sentence list, sentence embedding) pair
        pairs = frame[source_col].apply(embed_fn)
        frame[clean_col] = pairs.apply(lambda pair: pair[0])
        frame[embedding_col] = pairs.apply(lambda pair: pair[1])

train_df.head(2)

Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin,text_clean,text_embedding,summary_clean,summary_embedding
0,http://www.nytimes.com/2006/04/23/business/you...,http://web.archive.org/web/20060909062911id_/h...,Investors vs. Pfizer: Guess Who Has the Guns?,20060909062911,IF outsized executive pay has indeed become a ...,The battle between Pfizer Inc.'s investors and...,33.88,1.0,11.72,medium,high,extractive,[IF outsized executive pay has indeed become a...,"[[0.29463938, -0.5955452, -0.07718178, -1.1811...",[The battle between Pfizer Inc.'s investors an...,"[[-0.416416, -0.69821906, -0.30584174, -1.0871..."
1,http://www.nydailynews.com/archives/gossip/199...,http://web.archive.org/web/20080313232743id_/h...,REX FLEXED PECS FOR SKIN PICS,20080313232743,BY A.J. BENZA & MICHAEL LEWITTES\n\nIf Simon R...,"If Simon Rex looks a little familiar, it may n...",11.894118,0.988235,38.988235,low,high,extractive,"[BY A.J. BENZA & MICHAEL LEWITTES\n\n, If Simo...","[[-0.70827645, 0.46155792, 0.2582969, -1.15755...","[If Simon Rex looks a little familiar, it may ...","[[-0.050814025, -0.23244114, 0.03911164, -0.86..."


In [None]:
# Report the size of each split after truncation.
print('Total number of samples in train data after truncation', len(train_df))
print('Total number of samples in test data after truncation', len(test_df))

Total number of samples in tain data after truncation 5000
Total number of samples in test data after truncation 500


In [None]:
# Write the cleaned and embedded frames to pickle files (train first, then test).
for pickle_path, frame in (('Data/train_clean_embed.pickle', train_df),
                           ('Data/test_clean_embed.pickle', test_df)):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(frame, handle)

In [None]:
# Reload the cleaned and embedded train/test frames from their pickle files, e.g. after a
# session disconnect or to start the notebook from this point.
# NOTE: these lines are active (not commented out), so running this cell always replaces
# train_df and test_df with the pickled versions.
train_df = pd.read_pickle('Data/train_clean_embed.pickle')
test_df = pd.read_pickle('Data/test_clean_embed.pickle')

## 3. Calculate Target Labels

In [None]:
def indx_max_cosine(summary_sent_embed, doc_emedd):
    '''For each row of ``doc_emedd``, return the index of its most similar row in ``summary_sent_embed``.

    The cosine-similarity matrix has one row per row of ``summary_sent_embed``
    and one column per row of ``doc_emedd``; the argmax is taken down each column.

    NOTE: the parameter names are the reverse of how ``label_sent`` calls this
    function: it passes the document's sentence embeddings first and the
    summary's sentence embeddings second, so the result holds one document
    sentence index per summary sentence.
    '''
    return cosine_similarity(summary_sent_embed, doc_emedd).argmax(axis=0)

def label_sent(text_embd, summary_embd):
    '''Build binary sentence labels for every document.

    Documents are paired with summaries by position, so the inputs may carry
    any index (a pandas Series does not need a 0..n-1 index).

    Parameters
    ----------
    text_embd : sequence of 2-D arrays
        One array per document, one row per document sentence.
    summary_embd : sequence of 2-D arrays
        One array per document, one row per summary sentence.

    Returns
    -------
    idx_list : list of numpy.ndarray
        Per document, the sorted indices returned by ``indx_max_cosine`` (one
        entry per summary sentence, so an index can appear more than once).
    labels : list of numpy.ndarray
        Per document, a float array with one entry per document sentence:
        1 at the selected indices and 0 elsewhere.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of documents.
    '''
    if len(text_embd) != len(summary_embd):
        raise ValueError('text_embd and summary_embd must contain the same number of documents')

    idx_list = []
    labels = []
    # Iterating yields the values in order, which avoids label-based series[j] lookups.
    for doc_embd, summ_embd in zip(text_embd, summary_embd):
        doc_labels = np.zeros(np.asarray(doc_embd).shape[0]) #start with all-zero labels
        selected = np.sort(indx_max_cosine(doc_embd, summ_embd)) #calc idx for most similar
        doc_labels[selected] = 1
        idx_list.append(selected)
        labels.append(doc_labels)
    return idx_list, labels


In [None]:
# Attach the binary sentence labels and the selected sentence indices to both splits
# (training set first, then test set).
for frame in (train_df, test_df):
    idx_list, labels = label_sent(frame.text_embedding, frame.summary_embedding)
    frame['labels'] = labels
    frame['labels_idx'] = idx_list

train_df.head(2) #print 1st two documents from training set

Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin,text_clean,text_embedding,summary_clean,summary_embedding,labels,labels_idx
0,http://www.nytimes.com/2006/04/23/business/you...,http://web.archive.org/web/20060909062911id_/h...,Investors vs. Pfizer: Guess Who Has the Guns?,20060909062911,IF outsized executive pay has indeed become a ...,The battle between Pfizer Inc.'s investors and...,33.88,1.0,11.72,medium,high,extractive,[IF outsized executive pay has indeed become a...,"[[0.29463938, -0.5955452, -0.07718178, -1.1811...",[The battle between Pfizer Inc.'s investors an...,"[[-0.416416, -0.69821924, -0.3058418, -1.08710...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[11]
1,http://www.nydailynews.com/archives/gossip/199...,http://web.archive.org/web/20080313232743id_/h...,REX FLEXED PECS FOR SKIN PICS,20080313232743,BY A.J. BENZA & MICHAEL LEWITTES\n\nIf Simon R...,"If Simon Rex looks a little familiar, it may n...",11.894118,0.988235,38.988235,low,high,extractive,"[BY A.J. BENZA & MICHAEL LEWITTES\n\n, If Simo...","[[-0.7082764, 0.46155843, 0.25829712, -1.15755...","[If Simon Rex looks a little familiar, it may ...","[[-0.05081406, -0.23244114, 0.039111733, -0.86...","[0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1, 2, 3]"


## 4. Subject domain labels for each article

### Compute embedding features and domain features using sentence number and document length

In [None]:
# def comp_dom_feature(df_embed, df_label):
    
#     return df

def comp_features(df):
    '''Flatten per-document sentence embeddings into one feature row per sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``text_embedding`` column (one 2-D array per document, one row
        per sentence) and a ``labels`` column (one 1-D array per document, one
        entry per sentence). Documents are numbered by their position in ``df``.

    Returns
    -------
    X_df : pandas.DataFrame
        One row per sentence with the columns ``Sent_BERT_D_<k>`` (sentence
        embedding), ``Doc_BERT_D_<k>`` (mean embedding of the sentence's
        document), ``Sent_Number`` (0-based position of the sentence inside its
        document) and ``Doc_Length`` (number of sentences in the document).
        ``<k>`` runs over the embedding width found in the data.
    Xy_doc_label : numpy.ndarray
        Column vector with the document number of every row of ``X_df``.
    y : numpy.ndarray
        Column vector with the label of every row of ``X_df``.

    Raises
    ------
    ValueError
        If no document contributes any sentence.
    '''
    feature_blocks = []
    label_blocks = []
    doc_id_blocks = []

    for doc_id, (sent_embed, sent_labels) in enumerate(zip(df.text_embedding, df.labels)):
        sent_embed = np.asarray(sent_embed)
        n_sent = sent_embed.shape[0]
        if n_sent == 0:
            # a document without sentences contributes no rows
            continue

        doc_mean = sent_embed.mean(axis=0).reshape(1, -1)
        feature_blocks.append(np.hstack((
            sent_embed,                           # sentence embedding
            np.repeat(doc_mean, n_sent, axis=0),  # document mean, repeated per sentence
            np.arange(n_sent).reshape(-1, 1),     # sentence number inside the document
            np.full((n_sent, 1), n_sent),         # document length in sentences
        )))
        label_blocks.append(np.asarray(sent_labels).reshape(-1, 1))
        doc_id_blocks.append(np.full((n_sent, 1), doc_id, dtype=np.int64))

    if not feature_blocks:
        raise ValueError('comp_features needs at least one document with at least one sentence')

    # Stack once at the end: growing the arrays inside the loop would copy all
    # previous rows on every iteration.
    X = np.vstack(feature_blocks)
    y = np.vstack(label_blocks)
    Xy_doc_label = np.vstack(doc_id_blocks)

    #wrap X in dataframe with labels; the embedding width is taken from the data
    embed_dim = (X.shape[1] - 2) // 2
    labels_text_embedding = ['Sent_BERT_D_' + str(j) for j in range(embed_dim)]
    labels_doc_mean = ['Doc_BERT_D_' + str(j) for j in range(embed_dim)]
    other_labels = ['Sent_Number', 'Doc_Length']
    col_names = labels_text_embedding + labels_doc_mean + other_labels

    X_df = pd.DataFrame(X, columns = col_names)

    return X_df, Xy_doc_label, y

In [None]:
def dom_feature(X_df, Xy_doc_label, embedder,
                domains=('entertainment', 'politics', 'business', 'crime')):
    '''Assign each document the subject domain closest to its mean embedding.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Per-sentence feature frame; its ``Doc_BERT_D_*`` columns hold the
        document-mean embedding repeated on every sentence row.
    Xy_doc_label : array-like
        Document number of every row of ``X_df``.
    embedder : object
        Must provide ``encode(str, convert_to_tensor=False)`` returning one
        vector as wide as the ``Doc_BERT_D_*`` columns.
    domains : sequence of str, optional
        Candidate subject words. Defaults to the four subjects used in this
        notebook.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (document, mean embedding) pair, indexed by
        ``doc_label``, with the ``Doc_BERT_D_*`` columns plus a ``domain``
        column naming the candidate with the highest cosine similarity.
    '''
    #Make single df with only the document-mean embeddings and doc label.
    #Columns are picked by prefix so any embedding width works.
    doc_mean_cols = [col for col in X_df.columns if str(col).startswith('Doc_BERT_D_')]
    df_embed = X_df.loc[:, doc_mean_cols]
    df_doc_label = pd.DataFrame(Xy_doc_label, columns=['doc_label'])
    Dom_df = pd.concat([df_doc_label, df_embed], axis=1)
    #every sentence row of a document repeats the same values, so this keeps one row per document
    Dom_df = Dom_df.drop_duplicates().set_index('doc_label', drop=True)

    #find domain word embeddings, one encode call per subject word
    domains = list(domains)
    domain_embed = [embedder.encode(dom, convert_to_tensor=False) for dom in domains]
    #wrap in dataframe
    df_dom_embed = pd.DataFrame(domain_embed, index = domains,
                                columns = Dom_df.columns)
    #calculate cosine similarity between article and each subject
    cos_matrix = cosine_similarity(Dom_df, df_dom_embed)

    #find max cos sim per document and map the column number back to its subject word
    doc_domain = np.asarray(domains)[np.argmax(cos_matrix, axis=1)]
    #Add to primary dataframe
    Dom_df['domain'] = doc_domain
    return Dom_df

In [None]:
# Compute the per-sentence feature frame, document numbers and targets for the training set,
# then derive the per-document domain frame from them.
xTrain, Train_doc_label, yTrain = comp_features(train_df)
domTrain_df = dom_feature(xTrain,Train_doc_label, embedder)

# Same two steps for the test set.
xTest, Test_doc_label, yTest = comp_features(test_df)
domTest_df = dom_feature(xTest,Test_doc_label, embedder)

### Save the outcome of the data wrangling: Train Data and Test Data ready to be used for training and testing the ML model

In [None]:
# Flatten the training document numbers and targets to 1-D arrays, bundle everything
# belonging to the training split in one dict and pickle it.
Train_doc_label, yTrain = Train_doc_label.reshape(-1), yTrain.reshape(-1)
dataTrain = {
    'train_df': train_df,
    'Train_doc_label': Train_doc_label,
    'xTrain': xTrain,
    'yTrain': yTrain,
    'domTrain': domTrain_df,
}
with open('Data/train_feature.pickle', 'wb') as handle:
    pickle.dump(dataTrain, handle)

# Same for the test split.
Test_doc_label, yTest = Test_doc_label.reshape(-1), yTest.reshape(-1)
dataTest = {
    'test_df': test_df,
    'Test_doc_label': Test_doc_label,
    'xTest': xTest,
    'yTest': yTest,
    'domTest': domTest_df,
}
with open('Data/test_feature.pickle', 'wb') as handle:
    pickle.dump(dataTest, handle)

**Train Features and Test Features saved in previous section are the final features extracted for training the ML model.**