In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import random
import os
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [None]:
# Folder holding the input documents. Every entry is later read with pd.read_csv
# and its file name is used as the 'Topic'. The location can be overridden with
# the FAIR_SUMMARIZATION_DOCS environment variable; the original path stays the
# default so existing setups keep working.
directory_path = os.environ.get(
    'FAIR_SUMMARIZATION_DOCS',
    '/home/sina/Codes/fair-summarization/input_docs/docs',
)
# Keep regular files only (a sub-directory would make pd.read_csv fail) and sort
# them, because os.listdir returns entries in arbitrary order.
directory_files = sorted(
    entry for entry in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, entry))
)

## TextRank Cluster-H


In [None]:
# One row per input file. The 'summary' column holds no values yet; it is
# overwritten by each summarization loop further down.
final_summary_df = pd.DataFrame(columns=['Topic','summary']).assign(
    Topic=list(directory_files)
)

In [None]:
def combine_and_jumble_lists(list1, list2, rng=None):
  """Concatenate two lists and return the result in random order.

  Args:
    list1: First sequence of items; it is not modified.
    list2: Second sequence of items; it is not modified.
    rng: Optional ``random.Random`` instance used for the shuffle. Passing a
      seeded instance makes the order reproducible. When omitted, the
      module-level ``random`` generator is used, exactly as before.

  Returns:
    A new list holding every element of ``list1`` and ``list2``, shuffled.
  """
  combined_list = list(list1) + list(list2)
  shuffler = random if rng is None else rng
  shuffler.shuffle(combined_list)
  return combined_list

In [None]:
# Output sub-folder for each run, keyed by the label that is left out.
dir_dict = {'White': 'H-A', 'Hisp': 'A-W', 'AA': 'W-H'}
# Number of sentences kept at every TextRank pass.
num_sentences = 6


def _textrank_top_sentences(sentences, top_n):
  """Return the ``top_n`` highest-ranked entries of ``sentences``.

  The sentences are turned into count vectors (English stop words removed),
  their pairwise cosine similarity is used as a weighted graph, and PageRank
  scores on that graph decide the ranking. Kept local to this cell so the
  cell can be run without relying on helpers from other cells.

  Args:
    sentences: List of strings to rank.
    top_n: Maximum number of sentences to return.

  Returns:
    Up to ``top_n`` sentences, highest score first.
  """
  sentence_vectors = CountVectorizer(stop_words="english").fit_transform(sentences)
  similarity_matrix = cosine_similarity(sentence_vectors)
  scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))
  ranked = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)
  return [sentences[idx] for idx in ranked[:top_n]]


for remove_label in ['White', 'Hisp', 'AA']:
  summary_list = []
  for file_name in directory_files:
    docs = pd.read_csv(os.path.join(directory_path, file_name))
    docs = docs[docs['label'] != remove_label]

    # One list of texts per remaining label.
    grouped_list = list(docs.groupby('label')['text'].apply(list).values)
    if len(grouped_list) < 2:
      raise ValueError(
          f"{file_name}: expected at least two label groups after removing "
          f"{remove_label!r}, found {len(grouped_list)}"
      )
    list1, list2 = grouped_list[0], grouped_list[1]

    # Summarize each group on its own, then rank the shuffled union again.
    summary_1 = _textrank_top_sentences(list1, num_sentences)
    summary_2 = _textrank_top_sentences(list2, num_sentences)
    jumbled_list = combine_and_jumble_lists(summary_1, summary_2)
    summary_final = _textrank_top_sentences(jumbled_list, num_sentences)

    summary_list.append(summary_final)
  final_summary_df['summary'] = summary_list

  # Look summaries up by file name once instead of filtering the frame per row.
  summary_by_topic = dict(zip(directory_files, summary_list))
  output_dir = f'output_TextRank_H/{dir_dict[remove_label]}'
  # to_csv does not create missing folders.
  os.makedirs(output_dir, exist_ok=True)

  for file_name in directory_files:
    file_df = pd.read_csv(directory_path + '/' + file_name)
    chosen = set(summary_by_topic[file_name])
    # NOTE(review): file_df is the unfiltered file, so a row carrying the
    # removed label is also written when its text equals a chosen sentence.
    indices = [pos for pos, text in enumerate(file_df['text']) if text in chosen]
    file_df.iloc[indices].to_csv(f'{output_dir}/{file_name}', index = False)

## TextRank Cluster A

In [None]:
# Start again from a frame with one row per input file and no summaries.
topic_names = list(directory_files)
final_summary_df = pd.DataFrame(columns=['Topic','summary']).assign(Topic=topic_names)

In [None]:
# Output sub-folder for each run, keyed by the label that is left out.
dir_dict = {'White': 'H-A', 'Hisp': 'A-W', 'AA': 'W-H'}
# Number of sentences kept at every TextRank pass.
num_sentences = 6
num_clusters = 2
# Fixed seed so the KMeans split, and therefore the summaries, are repeatable.
kmeans_seed = 0


def _textrank_top_sentences(sentences, top_n):
  """Return the ``top_n`` highest-ranked entries of ``sentences``.

  The sentences are turned into count vectors (English stop words removed),
  their pairwise cosine similarity is used as a weighted graph, and PageRank
  scores on that graph decide the ranking. Kept local to this cell so the
  cell can be run without relying on helpers from other cells.

  Args:
    sentences: List of strings to rank.
    top_n: Maximum number of sentences to return.

  Returns:
    Up to ``top_n`` sentences, highest score first.
  """
  sentence_vectors = CountVectorizer(stop_words="english").fit_transform(sentences)
  similarity_matrix = cosine_similarity(sentence_vectors)
  scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))
  ranked = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)
  return [sentences[idx] for idx in ranked[:top_n]]


for remove_label in ['White', 'Hisp', 'AA']:
  summary_list = []
  for file_name in directory_files:
    docs = pd.read_csv(os.path.join(directory_path, file_name))
    docs = docs[docs['label'] != remove_label]

    text_data = list(docs['text'])

    # Split the texts into two clusters on their TF-IDF vectors; the 'label'
    # column plays no part in the split.
    X = TfidfVectorizer().fit_transform(text_data)
    kmeans = KMeans(n_clusters=num_clusters, random_state=kmeans_seed)
    kmeans.fit(X)
    predicted_labels = kmeans.labels_

    cluster_0 = [text for text, cluster in zip(text_data, predicted_labels) if cluster == 0]
    cluster_1 = [text for text, cluster in zip(text_data, predicted_labels) if cluster == 1]

    # Summarize each cluster on its own, then rank the concatenation again.
    summary_1 = _textrank_top_sentences(cluster_0, num_sentences)
    summary_2 = _textrank_top_sentences(cluster_1, num_sentences)
    summary_final = summary_1 + summary_2
    final = _textrank_top_sentences(summary_final, num_sentences)

    summary_list.append(final)
  final_summary_df['summary'] = summary_list

  # Look summaries up by file name once instead of filtering the frame per row.
  summary_by_topic = dict(zip(directory_files, summary_list))
  output_dir = f'output_TextRank_A/{dir_dict[remove_label]}'
  # to_csv does not create missing folders.
  os.makedirs(output_dir, exist_ok=True)

  for file_name in directory_files:
    file_df = pd.read_csv(directory_path + '/' + file_name)
    chosen = set(summary_by_topic[file_name])
    # NOTE(review): file_df is the unfiltered file, so a row carrying the
    # removed label is also written when its text equals a chosen sentence.
    indices = [pos for pos, text in enumerate(file_df['text']) if text in chosen]
    file_df.iloc[indices].to_csv(f'{output_dir}/{file_name}', index = False)

## TextRank Vanilla

In [None]:
# Output sub-folder for each run, keyed by the label that is left out.
dir_dict = {'White': 'H-A', 'Hisp': 'A-W', 'AA': 'W-H'}
# Number of sentences kept in each summary.
num_sentences = 6

for remove_label in ['White', 'Hisp', 'AA']:
    summary_list = []
    for file_name in directory_files:
        docs = pd.read_csv(os.path.join(directory_path, file_name))
        docs = docs[docs['label'] != remove_label]

        # Single TextRank pass over every remaining text: count vectors ->
        # cosine similarity graph -> PageRank scores.
        sentences = list(docs['text'])
        sentence_vectors = CountVectorizer(stop_words="english").fit_transform(sentences)
        similarity_matrix = cosine_similarity(sentence_vectors)
        scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))

        ranked = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)
        summary_final = [sentences[idx] for idx in ranked[:num_sentences]]

        summary_list.append(summary_final)
    final_summary_df['summary'] = summary_list

    # Look summaries up by file name once instead of filtering the frame per row.
    summary_by_topic = dict(zip(directory_files, summary_list))
    output_dir = f'output_TextRank_V/{dir_dict[remove_label]}'
    # to_csv does not create missing folders.
    os.makedirs(output_dir, exist_ok=True)

    for file_name in directory_files:
        file_df = pd.read_csv(directory_path + '/' + file_name)
        chosen = set(summary_by_topic[file_name])
        # NOTE(review): file_df is the unfiltered file, so a row carrying the
        # removed label is also written when its text equals a chosen sentence.
        indices = [pos for pos, text in enumerate(file_df['text']) if text in chosen]
        file_df.iloc[indices].to_csv(f'{output_dir}/{file_name}', index = False)

## BERT Extractive Cluster H

In [None]:
from summarizer import Summarizer

In [None]:
from difflib import SequenceMatcher
import pandas as pd
import os
import re

def contains_only_symbols(input_string):
    """Report whether a string carries nothing but symbols.

    Returns True for the empty string and for strings made up entirely of
    non-word characters and/or underscores; False otherwise.
    """
    if input_string == '':
        return True
    # Anchored at both ends: every character must be a non-word char or '_'.
    return re.match(r'^[\W_]+$', input_string) is not None

def longestSubstring(str1,str2):
    """Return the longest contiguous block shared by ``str1`` and ``str2``.

    The block is located with difflib.SequenceMatcher.find_longest_match over
    the full length of both inputs. An empty string is returned when the
    inputs have nothing in common.
    """
    matcher = SequenceMatcher(None, str1, str2)
    best = matcher.find_longest_match(0, len(str1), 0, len(str2))
    if best.size == 0:
        return ''
    start, stop = best.a, best.a + best.size
    return str1[start:stop]

def find_longest_common_subsequence(str1, str_list, max_loops=20):
    """Greedily map pieces of ``str1`` back onto the strings in ``str_list``.

    On every pass the longest contiguous substring shared between what is
    left of ``str1`` and any entry of ``str_list`` is located. If it contains
    something other than symbols it is recorded; in every case all of its
    occurrences are removed from ``str1`` before the next pass. Passes stop
    when ``str1`` holds only symbols, when nothing matches any more, or when
    the pass limit is hit.

    Despite the name, matches are contiguous substrings, not subsequences.

    Args:
        str1: Text to decompose.
        str_list: Sequence of candidate strings (list or array).
        max_loops: The search stops once the pass counter exceeds this value,
            so at most ``max_loops + 1`` passes run. Defaults to 20, the
            limit that used to be hard-coded.

    Returns:
        ``(indices, subs)``: parallel lists where ``subs[k]`` is a matched
        substring and ``indices[k]`` is the position in ``str_list`` of the
        entry it was matched against. A position can appear more than once.
    """
    indices = []
    subs = []
    # Nothing to match against: max() below would raise on an empty sequence.
    if len(str_list) == 0:
        return indices, subs
    loop = 0
    while not contains_only_symbols(str1):
        if loop > max_loops:
            break
        lcs = [longestSubstring(str1, s) for s in str_list]
        idx = lcs.index(max(lcs, key=len))
        best = lcs[idx]
        if best == '':
            # No candidate shares a single character with the remainder, and
            # str1 would not change, so later passes could not find anything.
            break
        if not contains_only_symbols(best):
            indices.append(idx)
            subs.append(best)
        str1 = str1.replace(best, '')
        loop += 1
    return indices, subs

In [None]:
# Output sub-folder for each run, keyed by the label that is left out.
dir_dict = {'White': 'H-A', 'Hisp': 'A-W', 'AA': 'W-H'}
# Number of sentences requested from the summarizer and kept in the output.
num_sentences = 6
# Built once and reused: constructing a Summarizer loads a pretrained model,
# which is far more expensive than calling it.
model = Summarizer()

for remove_label in ['White', 'Hisp', 'AA']:
  summary_list = []
  for file_name in directory_files:
    docs = pd.read_csv(os.path.join(directory_path, file_name))
    docs = docs[docs['label'] != remove_label]

    # One list of texts per remaining label. groupby returns the groups in
    # sorted label order, so which label is "first" depends on remove_label.
    grouped_list = list(docs.groupby('label')['text'].apply(list).values)
    if len(grouped_list) < 2:
      raise ValueError(
          f"{file_name}: expected at least two label groups after removing "
          f"{remove_label!r}, found {len(grouped_list)}"
      )
    first_group, second_group = grouped_list[0], grouped_list[1]

    # Summarize each group on its own, then summarize the two summaries
    # joined in random order.
    summary_1 = [model('.\n '.join(first_group), num_sentences=num_sentences)]
    summary_2 = [model('.\n '.join(second_group), num_sentences=num_sentences)]
    jumbled_list = combine_and_jumble_lists(summary_1,summary_2)

    result = model('.\n '.join(jumbled_list), num_sentences=num_sentences)
    summary_list.append(result)
  final_summary_df['summary'] = summary_list

  # Look summaries up by file name instead of filtering the frame each time.
  summary_by_topic = dict(zip(directory_files, summary_list))
  output_dir = f'output_BERT_H/{dir_dict[remove_label]}'
  # to_csv does not create missing folders.
  os.makedirs(output_dir, exist_ok=True)

  for file_name in directory_files:
    file_df = pd.read_csv(directory_path + '/' + file_name)
    texts = file_df['text'].values
    # Map the summary text back onto rows of the file.
    indices, subs = find_longest_common_subsequence(summary_by_topic[file_name], texts)

    # Fraction of each matched row that the matched substring covers.
    persentage = [len(sub) / len(texts[row]) for row, sub in zip(indices, subs)]
    top_indices = sorted(range(len(persentage)), key=lambda k: persentage[k], reverse=True)[:num_sentences]
    # NOTE(review): the same row can be matched more than once, in which case
    # it is written more than once - confirm whether that is intended.
    selected_values = [indices[k] for k in top_indices]
    file_df.iloc[selected_values].to_csv(f'{output_dir}/{file_name}', index = False)

## BERT Extractive Cluster A

In [None]:
# Output sub-folder for each run, keyed by the label that is left out.
dir_dict = {'White': 'H-A', 'Hisp': 'A-W', 'AA': 'W-H'}
# Number of sentences requested from the summarizer and kept in the output.
num_sentences = 6
num_clusters = 2
# Fixed seed so the KMeans split, and therefore the summaries, are repeatable.
kmeans_seed = 0
# Built once and reused: constructing a Summarizer loads a pretrained model,
# which is far more expensive than calling it.
model = Summarizer()

for remove_label in ['White', 'Hisp', 'AA']:
  summary_list = []
  for file_name in directory_files:
    docs = pd.read_csv(os.path.join(directory_path, file_name))
    docs = docs[docs['label'] != remove_label]

    text_data = list(docs['text'])

    # Split the texts into two clusters on their TF-IDF vectors; the 'label'
    # column plays no part in the split.
    X = TfidfVectorizer().fit_transform(text_data)
    kmeans = KMeans(n_clusters=num_clusters, random_state=kmeans_seed)
    kmeans.fit(X)
    predicted_labels = kmeans.labels_

    cluster_0 = [text for text, cluster in zip(text_data, predicted_labels) if cluster == 0]
    cluster_1 = [text for text, cluster in zip(text_data, predicted_labels) if cluster == 1]

    # Summarize each cluster on its own, then summarize the two summaries.
    summary_1 = [model('. '.join(cluster_0), num_sentences=num_sentences)]
    summary_2 = [model('. '.join(cluster_1), num_sentences=num_sentences)]
    summary_final = summary_1 + summary_2

    result = model('. '.join(summary_final), num_sentences=num_sentences)
    summary_list.append(result)
  final_summary_df['summary'] = summary_list

  # Look summaries up by file name instead of filtering the frame each time.
  summary_by_topic = dict(zip(directory_files, summary_list))
  output_dir = f'output_BERT_A/{dir_dict[remove_label]}'
  # to_csv does not create missing folders.
  os.makedirs(output_dir, exist_ok=True)

  for file_name in directory_files:
      file_df = pd.read_csv(directory_path + '/' + file_name)
      texts = file_df['text'].values
      # Map the summary text back onto rows of the file.
      indices, subs = find_longest_common_subsequence(summary_by_topic[file_name], texts)

      # Fraction of each matched row that the matched substring covers.
      persentage = [len(sub) / len(texts[row]) for row, sub in zip(indices, subs)]
      top_indices = sorted(range(len(persentage)), key=lambda k: persentage[k], reverse=True)[:num_sentences]
      # NOTE(review): the same row can be matched more than once, in which
      # case it is written more than once - confirm whether that is intended.
      selected_values = [indices[k] for k in top_indices]
      file_df.iloc[selected_values].to_csv(f'{output_dir}/{file_name}', index = False)

## BERT Extractive Vanilla

In [None]:
# Output sub-folder for each run, keyed by the label that is left out.
dir_dict = {'White': 'H-A', 'Hisp': 'A-W', 'AA': 'W-H'}
# Number of sentences requested from the summarizer and kept in the output.
num_sentences = 6
# Built once and reused: constructing a Summarizer loads a pretrained model,
# which is far more expensive than calling it.
model = Summarizer()

for remove_label in ['White', 'Hisp', 'AA']:
    summary_list = []
    for file_name in directory_files:
        docs = pd.read_csv(os.path.join(directory_path, file_name))
        docs = docs[docs['label'] != remove_label]

        # Single summarization pass over every remaining text.
        sentences = '.\n '.join(list(docs['text']))
        result = model(sentences, num_sentences=num_sentences)
        summary_list.append(result)
    final_summary_df['summary'] = summary_list

    # Look summaries up by file name instead of filtering the frame each time.
    summary_by_topic = dict(zip(directory_files, summary_list))
    output_dir = f'output_BERT_V/{dir_dict[remove_label]}'
    # to_csv does not create missing folders.
    os.makedirs(output_dir, exist_ok=True)

    for file_name in directory_files:
        file_df = pd.read_csv(directory_path + '/' + file_name)
        texts = file_df['text'].values
        # Map the summary text back onto rows of the file.
        indices, subs = find_longest_common_subsequence(summary_by_topic[file_name], texts)

        # Fraction of each matched row that the matched substring covers.
        persentage = [len(sub) / len(texts[row]) for row, sub in zip(indices, subs)]
        top_indices = sorted(range(len(persentage)), key=lambda k: persentage[k], reverse=True)[:num_sentences]
        # NOTE(review): the same row can be matched more than once, in which
        # case it is written more than once - confirm whether that is intended.
        selected_values = [indices[k] for k in top_indices]
        file_df.iloc[selected_values].to_csv(f'{output_dir}/{file_name}', index = False)