In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors

# Hand-maintained English stopword list, grouped loosely by word class
stopwords_set = {
    # Pronouns and possessives
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    # Interrogatives and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # Auxiliary verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    # Articles, conjunctions and prepositions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    # Adverbs, quantifiers and negations
    "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    # Modal words and the fragments left behind when contractions are split on apostrophes
    "s", "t", "can", "will", "just", "don", "should", "now",
    "d", "ll", "m", "o", "re", "ve", "y",
    "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn",
    "ma", "mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn",
}

# Record the TensorFlow version this notebook was executed with
print(f'TensorFlow Version: {tf.__version__}')

TensorFlow Version: 2.14.0


In [2]:
# Load the training split from 'train.csv'; rows the parser cannot handle are skipped
df = pd.read_csv('train.csv', engine="python", on_bad_lines='skip')

# Remove the 'id' column from the DataFrame
df = df.drop(columns=['id'])

In [3]:
# Preview the first six article/summary pairs exactly as they were loaded
for i in range(6):
    print("Review #", i + 1)
    print('Original Text : ', df.article[i], sep='\n')
    print('\n\nSummary Text : ', df.highlights[i], sep='\n')
    print('===========================================================================================================\n\n')

Review # 1
Original Text : 
By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a con

In [4]:

# Mapping from lowercase English contractions to their expanded forms, taken from
# http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# clean_text() looks up whole whitespace-separated tokens in this dict, so a contraction
# with punctuation attached (e.g. "it's,") is not matched.
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [5]:
def clean_text(text, remove_stopwords=True):
    '''Normalize a piece of text before vocabulary building and sentence scoring.

    Steps, in order:
      1. lowercase the text;
      2. replace whitespace-separated tokens found in the module-level
         ``contractions`` dict with their expanded form (this also collapses
         every run of whitespace, including newlines, into a single space);
      3. remove URLs, then replace backslashes and apostrophes with spaces;
      4. optionally drop tokens that appear in ``stopwords_set``.

    Punctuation other than backslashes and apostrophes is left untouched.

    Args:
        text: The string to clean.
        remove_stopwords: When True (default), remove every token that is a
            member of ``stopwords_set``.

    Returns:
        The cleaned string.
    '''

    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms (whole-token matches only)
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Strip each URL up to the next whitespace only. The text is a single line at
    # this point, so a greedy `.*` here would delete everything after the first URL.
    text = re.sub(r'https?://\S*', '', text)
    text = re.sub(r'\\', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        text = " ".join(w for w in text.split() if w not in stopwords_set)

    return text


In [6]:
# Clean the reference summaries; stopwords are kept in the highlights
clean_highlights = [clean_text(summary, remove_stopwords=False) for summary in df.highlights]
print("Summaries are complete.")

# Clean the articles with the default settings (stopwords removed)
clean_article = [clean_text(body) for body in df.article]
print("Texts are complete.")

Summaries are complete.
Texts are complete.


In [7]:
# Show the first five cleaned summary/article pairs, each followed by a blank line
for i in range(5):
    print("Clean Review #", i + 1)
    print(clean_highlights[i], clean_article[i], "", sep="\n")

Clean Review # 1
bishop john folda, of north dakota, is taking time off after being diagnosed . he contracted the infection through contaminated food in italy . church members in fargo, grand forks and jamestown could have been exposed .
. associated press . published: . 14:11 est, 25 october 2013 . | . updated: . 15:36 est, 25 october 2013 . bishop fargo catholic diocese north dakota exposed potentially hundreds church members fargo, grand forks jamestown hepatitis virus late september early october. state health department issued advisory exposure anyone attended five churches took communion. bishop john folda (pictured) fargo catholic diocese north dakota exposed potentially hundreds church members fargo, grand forks jamestown hepatitis . state immunization program manager molly howell says risk low, officials feel important alert people possible exposure. diocese announced monday bishop john folda taking time diagnosed hepatitis a. diocese says contracted infection contaminated foo

In [8]:
def count_words(count_dict, text):
    '''Tally whitespace-separated words from an iterable of strings.

    ``count_dict`` is updated in place (word -> number of occurrences);
    nothing is returned.
    '''
    for sentence in text:
        for token in sentence.split():
            count_dict[token] = count_dict.get(token, 0) + 1
     

In [9]:
# Count every word across the cleaned summaries and articles; the number of
# distinct keys is the size of the vocabulary
word_counts = {}
for corpus in (clean_highlights, clean_article):
    count_words(word_counts, corpus)
print("Size of Vocabulary:", len(word_counts))


Size of Vocabulary: 1619597


In [10]:
# Load the pre-trained word vectors into a {word: float32 vector} lookup
embeddings_index = {}
with open('numberbatch-en.txt', encoding='utf-8') as f:
    for line in f:
        values = line.rstrip().split(' ')
        # Skip anything that is not "<word> <v1> <v2> ...": blank lines and the
        # "<rows> <dims>" header that text-format embedding files may start with.
        # Without this, a header would be stored as a word with a 1-element vector.
        if len(values) <= 2:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

print('Word embeddings:', len(embeddings_index))

Word embeddings: 417195


In [11]:
# Find the number of words that are missing from CN, and are used more than our threshold.
# NOTE(review): this uses `count > threshold`, while the vocabulary cell keeps words with
# `count >= threshold` — confirm which comparison is intended.
threshold = 20

missing_words = sum(
    1
    for word, count in word_counts.items()
    if count > threshold and word not in embeddings_index
)

# Scale to a percentage *before* rounding: round(x, 4) * 100 reintroduces float noise
# (e.g. 6.550000000000001). Also avoid dividing by zero on an empty vocabulary.
missing_ratio = round(100 * missing_words / len(word_counts), 2) if word_counts else 0.0

print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

Number of words missing from CN: 106112
Percent of words that are missing from vocabulary: 6.550000000000001%


In [12]:
# Limit the vocab that we will use to words that appear ≥ threshold or are in GloVe

#dictionary to convert words to integers
vocab_to_int = {} 

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1

# Special tokens that will be added to our vocab
codes = ["","","",""]   

# Add codes to vocab
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))

Total number of unique words: 1619597
Number of words we will use: 227489
Percent of words we will use: 14.05%


In [13]:
# Need to use 300 for embedding dimensions to match CN's vectors.
embedding_dim = 300
nb_words = len(vocab_to_int)

# Create matrix with default values of zero
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for i, word in enumerate(vocab_to_int):
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding
print("nb_words:", nb_words)
print("Size of word_embedding_matrix:", word_embedding_matrix.shape)

nb_words: 227489
Size of word_embedding_matrix: (227489, 300)


In [14]:
def convert_to_ints(text, word_count, unk_count, eos=False):
    '''Convert words in text to an integer.
       If word is not in vocab_to_int, use UNK's integer.
       Total the number of words and UNKs.
       Add EOS token to the end of texts'''
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int[""])
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int[""])
        ints.append(sentence_ints)
    return ints, word_count, unk_count
     

In [15]:
# Apply convert_to_ints to clean_highlights and clean_article
word_count = 0
unk_count = 0

# The counters are threaded through both calls, so the totals below cover the
# summaries and the articles together
int_summaries, word_count, unk_count = convert_to_ints(clean_highlights, word_count, unk_count)
int_texts, word_count, unk_count = convert_to_ints(clean_article, word_count, unk_count, eos=True)

# Scale to a percentage before rounding (a trailing *100 reintroduces float noise)
# and avoid dividing by zero when there are no words at all
unk_percent = round(100 * unk_count / word_count, 2) if word_count else 0.0

print("Total number of words in summaries and articles:", word_count)
print("Total number of UNKs in summaries and articles:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

Total number of words in headlines: 132496893
Total number of UNKs in headlines: 3811664
Percent of words that are UNK: 2.88%


In [16]:
def create_lengths(text):
    '''Return a one-column DataFrame ('counts') holding len() of every item in text'''
    return pd.DataFrame(list(map(len, text)), columns=['counts'])
     

In [17]:

# Sequence-length statistics for the integer-encoded summaries and articles
lengths_summaries, lengths_texts = (
    create_lengths(sequences) for sequences in (int_summaries, int_texts)
)

print(
    "Summaries:",
    lengths_summaries.describe(),
    "",
    "Texts:",
    lengths_texts.describe(),
    sep="\n",
)
     

Summaries:
              counts
count  287113.000000
mean       52.170463
std        21.464734
min         0.000000
25%        39.000000
50%        49.000000
75%        61.000000
max      1296.000000

Texts:
              counts
count  287113.000000
mean      410.309488
std       197.996908
min         9.000000
25%       265.000000
50%       375.000000
75%       517.000000
max      2055.000000


In [18]:

# Inspect the upper tail of the article lengths
for pct in (90, 95, 99):
    print(np.percentile(lengths_texts.counts, pct))

687.0
805.0
995.0


In [19]:
# Inspect the upper tail of the summary lengths
for pct in (90, 95, 99):
    print(np.percentile(lengths_summaries.counts, pct))

78.0
91.0
121.0


In [20]:
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def sentence_similarity(sent1, sent2, vectorizer):
    """Return the cosine similarity between two sentences.

    Both sentences are vectorized with the already-fitted ``vectorizer``; the
    off-diagonal entry of the resulting 2x2 similarity matrix is returned.
    """
    pair_vectors = vectorizer.transform([sent1, sent2])
    return cosine_similarity(pair_vectors)[0][1]

def build_sentence_graph(sentences, vectorizer, threshold=0.2):
    """Build an undirected similarity graph over the sentences.

    Args:
        sentences: List of sentence strings; node ``i`` stands for ``sentences[i]``.
        vectorizer: A fitted vectorizer whose ``transform`` turns the sentences
            into vectors.
        threshold: Two sentences are linked only when their cosine similarity
            is strictly greater than this value (default 0.2).

    Returns:
        An ``nx.Graph`` with one node per sentence and a ``weight`` attribute
        (the similarity) on every edge.
    """
    G = nx.Graph()
    # Register every sentence up front so that sentences with no sufficiently
    # similar neighbour still exist in the graph and receive a score.
    G.add_nodes_from(range(len(sentences)))
    if len(sentences) < 2:
        return G

    # Vectorize once and compute all pairwise similarities in one call instead of
    # re-transforming each sentence for every pair.
    similarities = cosine_similarity(vectorizer.transform(sentences))
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            similarity = similarities[i, j]
            if similarity > threshold:
                G.add_edge(i, j, weight=float(similarity))
    return G

def textrank_scores(sentences, vectorizer):
    """Run PageRank over the sentence graph and return {sentence index: score}.

    A ``nx.NetworkXError`` is reported on stdout and turned into an empty dict.
    """
    try:
        return nx.pagerank(build_sentence_graph(sentences, vectorizer))
    except nx.NetworkXError as e:
        print(f"Error in building sentence graph: {e}")
        return {}

def extractive_summarization(sentences, vectorizer, num_sentences=5):
    """Pick the ``num_sentences`` highest-scoring sentences, best first.

    When no scores are available, or anything raises along the way, a
    single-element list holding a fixed failure message is returned instead.
    """
    fallback = ["Unable to generate summary. Check input document."]
    try:
        scores = textrank_scores(sentences, vectorizer)
        # An empty score dict means there was nothing to rank
        if not scores:
            return fallback

        ranked_indices = sorted(scores, key=scores.get, reverse=True)
        return [sentences[idx] for idx in ranked_indices[:num_sentences]]
    except Exception as e:
        print(f"Error in extractive summarization: {e}")
        return fallback

# Example usage
article = clean_article[0]
# Split on '. ' and discard fragments that are empty or whitespace-only
sentences = list(filter(str.strip, article.split('. ')))

# Fit the TF-IDF vocabulary once, outside sentence_similarity, to avoid empty vocabulary issues
vectorizer = TfidfVectorizer().fit(sentences)

summary_sentences = extractive_summarization(sentences, vectorizer)
summary = '. '.join(summary_sentences)

print("Original Article:", article, "\nExtractive Summary:", summary, sep="\n")


Original Article:
. associated press . published: . 14:11 est, 25 october 2013 . | . updated: . 15:36 est, 25 october 2013 . bishop fargo catholic diocese north dakota exposed potentially hundreds church members fargo, grand forks jamestown hepatitis virus late september early october. state health department issued advisory exposure anyone attended five churches took communion. bishop john folda (pictured) fargo catholic diocese north dakota exposed potentially hundreds church members fargo, grand forks jamestown hepatitis . state immunization program manager molly howell says risk low, officials feel important alert people possible exposure. diocese announced monday bishop john folda taking time diagnosed hepatitis a. diocese says contracted infection contaminated food attending conference newly ordained bishops italy last month. symptoms hepatitis include fever, tiredness, loss appetite, nausea abdominal discomfort. fargo catholic diocese north dakota (pictured) bishop located .

Ex

# VALIDATION DATASET


In [21]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Hand-maintained English stopword list, grouped loosely by word class
stopwords_set = {
    # Pronouns and possessives
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    # Interrogatives and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # Auxiliary verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    # Articles, conjunctions and prepositions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    # Adverbs, quantifiers and negations
    "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    # Modal words and the fragments left behind when contractions are split on apostrophes
    "s", "t", "can", "will", "just", "don", "should", "now",
    "d", "ll", "m", "o", "re", "ve", "y",
    "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn",
    "ma", "mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn",
}

# The `contractions` dict and the `clean_text` function defined earlier in this notebook are reused below.

# Load the validation split (unparseable rows are skipped) and remove its 'id' column
df_validation = pd.read_csv('validation.csv', engine="python", on_bad_lines='skip')
df_validation = df_validation.drop(columns=['id'])

# Clean the validation highlights, keeping stopwords
clean_highlights_validation = [
    clean_text(summary, remove_stopwords=False) for summary in df_validation.highlights
]
print("Validation Summaries are complete.")

# Clean the validation articles with the default settings (stopwords removed)
clean_article_validation = [clean_text(body) for body in df_validation.article]
print("Validation Texts are complete.")

# Create a function for the entire process of extractive summarization
def extractive_summarization_full(article_text, highlights_text, vectorizer, num_sentences=5):
    """Clean an article, split it into sentences and return an extractive summary.

    Args:
        article_text: The article to summarize; it is passed through clean_text
            here, so callers do not need to clean it first.
        highlights_text: Reference summary. It is accepted so existing call
            sites keep working, but it does not influence the result.
        vectorizer: Vectorizer used for sentence similarity. It is re-fitted
            IN PLACE on this article's sentences, so the caller's object no
            longer reflects whatever it was fitted on before the call.
        num_sentences: Maximum number of sentences in the summary (default 5).

    Returns:
        The selected sentences joined with '. ', or a fixed failure message if
        any step raises.
    """
    try:
        # Local names are deliberately different from the module-level
        # clean_article / clean_highlights lists to avoid shadowing them.
        cleaned_article = clean_text(article_text)

        # Split article into sentences, dropping empty fragments
        sentences = [sent for sent in cleaned_article.split('. ') if sent.strip()]

        # Fit the vectorizer on this article's sentences (mutates the argument)
        vectorizer.fit(sentences)

        # Perform extractive summarization
        summary_sentences = extractive_summarization(sentences, vectorizer, num_sentences)
        return '. '.join(summary_sentences)
    except Exception as e:
        print(f"Error in extractive summarization: {e}")
        return "Unable to generate summary. Check input document."

# Example usage for validation set
for i in range(5):  # Widen the range to process more validation samples
    article_validation, highlights_validation = (
        clean_article_validation[i],
        clean_highlights_validation[i],
    )

    summary_validation = extractive_summarization_full(article_validation, highlights_validation, vectorizer)

    print(
        f"\nValidation Example #{i + 1}",
        "Original Validation Article:",
        article_validation,
        "\nExtractive Summary:",
        summary_validation,
        sep="\n",
    )


Validation Summaries are complete.
Validation Texts are complete.

Validation Example #1
Original Validation Article:
sally forrest, actress-dancer graced silver screen throughout 40s 50s mgm musicals films 1956 noir city sleeps died march 15 home beverly hills, california. forrest, whose birth name katherine feeney, 86 long battled cancer. publicist, judith goffin, announced news thursday. scroll video . actress: sally forrest 1951 ida lupino-directed film hard, fast beautiful (left) 1956 fritz lang movie city sleeps san diego native, forrest became protege hollywood trailblazer ida lupino, cast starring roles films including critical commercial success wanted, never fear hard, fast beautiful. forrest film credits included bannerline, son sinbad, excuse dust, according imdb page. page also indicates forrest multiple climax! rawhide television episodes. forrest appeared episode ed sullivan show three episodes dinah shore chevy show, imdb page says. also starred broadway production seve

# TEST DATASET 

In [22]:
# Load the test split from 'test.csv'; rows the parser cannot handle are skipped
test_df = pd.read_csv('test.csv', engine="python", on_bad_lines='skip')

# Remove the 'id' column from the test DataFrame
test_df = test_df.drop(columns=['id'])

# Clean the test highlights, keeping stopwords
clean_test_highlights = [
    clean_text(summary, remove_stopwords=False) for summary in test_df.highlights
]
print("Test summaries are complete.")

# Clean the test articles with the default settings (stopwords removed)
clean_test_article = [clean_text(body) for body in test_df.article]
print("Test texts are complete.")

# Convert test data to integers, restarting both running counters from zero
word_count, unk_count = 0, 0

int_test_summaries, word_count, unk_count = convert_to_ints(clean_test_highlights, word_count, unk_count)
int_test_texts, word_count, unk_count = convert_to_ints(clean_test_article, word_count, unk_count, eos=True)

# Example usage of the extractive summarization function on the first test article
test_article = clean_test_article[0]
test_sentences = [sent for sent in test_article.split('. ') if sent.strip()]

# Fit a dedicated vectorizer on this article's own sentences. The shared `vectorizer`
# was last fitted on a different document, so its vocabulary and IDF weights do not
# describe these sentences and would distort the similarity scores.
test_vectorizer = TfidfVectorizer()
test_vectorizer.fit(test_sentences)

summary_sentences_test = extractive_summarization(test_sentences, test_vectorizer)
summary_test = '. '.join(summary_sentences_test)

print("Original Test Article:")
print(test_article)
print("\nExtractive Test Summary:")
print(summary_test)


Test summaries are complete.
Test texts are complete.
Original Test Article:
ever noticed plane seats appear getting smaller smaller? increasing numbers people taking skies, experts questioning packed planes putting passengers risk. say shrinking space aeroplanes uncomfortable - putting health safety danger. squabbling arm rest, shrinking space planes putting health safety danger? week, u.s consumer advisory group set department transportation said public hearing government happy set standards animals flying planes, stipulate minimum amount space humans. world animals rights space food humans, said charlie leocha, consumer representative committee. time dot faa take stand humane treatment passengers. could crowding planes lead serious issues fighting space overhead lockers, crashing elbows seat back kicking? tests conducted faa use planes 31 inch pitch, standard airlines decreased . many economy seats united airlines 30 inches room, airlines offer little 28 inches . cynthia corbertt, h

# FLASK APP

In [23]:
!pip install flask flask-ngrok




In [None]:


from flask import Flask

# Create the Flask application object for this module
app = Flask(__name__)

# Register hello() as the handler for the root URL
@app.route('/')
def hello():
    """Return a fixed plain-text greeting for requests to '/'."""
    return 'Hello, World!'

# Start the server only when this code runs as the main module. The recorded output
# shows it serving on http://127.0.0.1:5000 until interrupted with CTRL+C.
if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
