In [None]:
!pip3 install torch
!pip3 install transformers
!pip3 install flair

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 18.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 2.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 36.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 23.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

In [None]:
import torch
import transformers
import flair
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.corpus import words
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import traceback
# Fetch the NLTK resources used later in the notebook: 'punkt' is the sentence
# tokenizer data for sent_tokenize, while 'words' and 'stopwords' are the
# corpora read in NewsParsing.__init__.
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Text Summarization by Extractive Process.

In [None]:
class NewsParsing(object):
    """Extractive summarizer for news articles.

    An article is split into sentences, every pair of sentences is scored with
    the cosine similarity of their TF-IDF vectors, the resulting matrix is
    turned into a graph, and the sentences with the highest PageRank scores
    are joined into the summary.
    """

    def __init__(self):
        # The first three attributes are not used by the methods of this class;
        # they are kept because code outside this class may read them.
        self.regex_pattern = r'[^A-Za-z0-9.]'
        self.stopwords_list = stopwords.words('english')
        self.words_list = list(words.words())
        # Re-fitted on every sentence pair in generate_similarity_score.
        self.vectorizer = TfidfVectorizer()
        # Directory prefix (with trailing slash) prepended to file names.
        self.file_path = "/content/sample_data/"

    # function to read data...
    def read_data(self, file_name):
        """Load the articles workbook and normalize the 'News_Article' column.

        Args:
            file_name: Name of the Excel file, appended to ``self.file_path``.

        Returns:
            The DataFrame with 'News_Article' stripped of newlines and
            surrounding whitespace and lower-cased, or ``None`` when reading
            or cleaning fails (the traceback is printed in that case).
        """
        try:
            df_text = pd.read_excel(self.file_path + file_name)
            # NOTE(review): newlines are removed without inserting a space, so
            # words on either side of a line break are fused together.
            df_text['News_Article'] = df_text['News_Article'].apply(
                lambda x: x.replace('\n', '').strip().lower()
            )
            return df_text

        except Exception:
            # format_exc() takes no exception argument; it reads the exception
            # currently being handled.
            print(traceback.format_exc())
            return None

    # function to tokenize the data...
    def tokenize_data(self, news_data):
        """Split an article into a list of sentences with NLTK's sent_tokenize."""
        return sent_tokenize(news_data)

    # function to build similarity matrix...
    def generate_similarity_score(self, sentence1, sentence2):
        """Return the cosine similarity of the TF-IDF vectors of two sentences.

        The vectorizer is fitted on just these two sentences.

        Args:
            sentence1: First sentence.
            sentence2: Second sentence.

        Returns:
            A float similarity score. ``0.0`` is returned when either vector
            has zero length or when vectorization fails (the traceback is
            printed), so the caller always receives a number.
        """
        try:
            tfidf = self.vectorizer.fit_transform([sentence1, sentence2])
            vectors = np.asarray(tfidf.toarray(), dtype=float)

            # Row 0 belongs to sentence1 and row 1 to sentence2.
            vector1, vector2 = vectors[0], vectors[1]

            norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
            if norm_product == 0.0:
                # A sentence with no weighted terms is similar to nothing.
                return 0.0

            return float(np.dot(vector1, vector2) / norm_product)

        except Exception:
            print(traceback.format_exc())
            return 0.0

    # function to create a similarity matrix...
    def generate_similarity_matrix(self, tokenized_article):
        """Build the square sentence-to-sentence similarity matrix.

        Args:
            tokenized_article: List of sentences.

        Returns:
            A ``(n, n)`` numpy array with ``1.0`` on the diagonal and the
            pairwise similarity scores elsewhere.
        """
        sentence_count = len(tokenized_article)
        similarity_matrix = np.zeros((sentence_count, sentence_count))

        for index1 in range(sentence_count):
            similarity_matrix[index1][index1] = 1.0
            # The score is symmetric, so each pair is computed once and
            # mirrored across the diagonal.
            for index2 in range(index1 + 1, sentence_count):
                score = self.generate_similarity_score(
                    tokenized_article[index1], tokenized_article[index2]
                )
                similarity_matrix[index1][index2] = score
                similarity_matrix[index2][index1] = score

        return similarity_matrix

    # function to rank the sentences...
    def rank_sentences(self, similarity_matrix, tokenized_article, top_n=5):
        """Pick the highest-ranked sentences and join them into a summary.

        Args:
            similarity_matrix: Square matrix from generate_similarity_matrix.
            tokenized_article: Sentences, in the same order as the matrix rows.
            top_n: Maximum number of sentences in the summary.

        Returns:
            The selected sentences joined by single spaces, highest PageRank
            score first. Articles shorter than ``top_n`` sentences contribute
            all of their sentences; an empty article yields ``""``.
        """
        if not tokenized_article:
            return ""

        # Create a sentence similarity graph...
        sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(sentence_similarity_graph)
        ranked_sentence = sorted(
            ((scores[i], s) for i, s in enumerate(tokenized_article)),
            reverse=True,
        )

        # Slicing never runs past the end of the ranking.
        top_sentences = [sentence for _, sentence in ranked_sentence[:max(top_n, 0)]]

        # Collapse any run of whitespace into a single space.
        return " ".join(" ".join(top_sentences).split())

    # function to summarize the text...
    def summarize_text(self, complete_article):
        """Produce the extractive summary of one article.

        Args:
            complete_article: Full article text.

        Returns:
            The summary string built by rank_sentences.
        """
        # Step 1: Tokenize the data...
        tokenized_article = self.tokenize_data(complete_article)

        # Step 2: Generate Similarity Matrix...
        similarity_matrix = self.generate_similarity_matrix(tokenized_article)

        # Step 3: Rank the Sentences...
        return self.rank_sentences(similarity_matrix, tokenized_article)


In [None]:
file_name = "News_Articles.xlsx" # Excel workbook with the articles; read_data resolves it against NewsParsing.file_path

In [None]:
obj_parse = NewsParsing()  # Build the extractive summarizer (the constructor loads the NLTK stop-word and word corpora)

In [None]:
# Load the articles; read_data also normalizes the 'News_Article' column
# (newlines removed, whitespace stripped, lower-cased).
df_text = obj_parse.read_data(file_name)

In [None]:
# Extractive summary of every article, stored in a new column.
# The bound method is handed to apply directly; no wrapper lambda is needed.
df_text['Summary_Extractive_Process'] = df_text['News_Article'].apply(obj_parse.summarize_text)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
# Preview the first rows, including the new extractive summary column.
df_text.head()

Unnamed: 0,Article_ID,News_Article,Summary_Extractive_Process
0,1,the global forensic audit market reached a val...,"the report covers marketing channels, upstream..."
1,2,a small library that provides audio and video ...,any attacker sitting on the same network as a ...
2,3,the latest research report financial auditing ...,of the industry. this market analysis enables ...
3,4,enterprises have been loading more of their op...,"“varmour’s platform provides the visibility, c..."
4,5,let’s rewind to last june. the first lockdown ...,you get the idea: back to business as usual an...


### Summary by Abstractive Process.

In [None]:
import re

import flair
import torch
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoModelWithLMHead
from transformers import AutoTokenizer

In [None]:
# Tokenizer and model for abstractive summarization.
# T5 is an encoder-decoder model, so it is loaded through AutoModelForSeq2SeqLM;
# AutoModelWithLMHead is deprecated in transformers.
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True)
# NOTE(review): flair_sentiment and regex_pattern are not referenced by any
# other visible cell; they are kept in case hidden cells depend on them.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')
regex_pattern = r'[^A-Za-z0-9.]'

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

2022-04-04 16:33:40,383 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmpjnjhbnm4


100%|██████████| 265512723/265512723 [00:09<00:00, 27255876.62B/s]

2022-04-04 16:33:50,447 copying /tmp/tmpjnjhbnm4 to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt





2022-04-04 16:33:50,973 removing temp file /tmp/tmpjnjhbnm4
2022-04-04 16:33:51,017 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
def summarize_text_abstractive(complete_article, max_input_length=512, max_length=150, min_length=80):
    """Generate an abstractive summary of one article with the loaded T5 model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        complete_article: Full article text.
        max_input_length: Token limit for the encoded input; longer input is
            truncated.
        max_length: Upper bound on the generated summary length, in tokens.
        min_length: Lower bound on the generated summary length, in tokens.

    Returns:
        The decoded summary without special tokens such as ``<pad>`` and
        ``</s>``. An empty or whitespace-only article yields ``""``.
    """
    if not complete_article or not complete_article.strip():
        # Nothing to summarize; skip the model call.
        return ""

    # Create inputs encodings ("summarize: " is the T5 task prefix)...
    inputs = tokenizer.encode("summarize: " + complete_article,
                              return_tensors='pt',
                              max_length=max_input_length,
                              truncation=True)
    # summary ids..
    summary_ids = model.generate(inputs,
                                 max_length=max_length,
                                 min_length=min_length,
                                 length_penalty=5.,
                                 num_beams=2)
    # get the summary, dropping special tokens so they do not leak into the text...
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary.strip()

In [None]:
# Abstractive summary of every article, stored in a new column.
df_text['Summary_Abstractive_Process'] = df_text['News_Article'].apply(summarize_text_abstractive)

In [None]:
# Preview up to ten rows to compare the extractive and abstractive summaries.
df_text.head(10)

Unnamed: 0,Article_ID,News_Article,Summary_Extractive_Process,Summary_Abstractive_Process
0,1,the global forensic audit market reached a val...,"the report covers marketing channels, upstream...",<pad> the global forensic audit market reached...
1,2,a small library that provides audio and video ...,any attacker sitting on the same network as a ...,<pad> bug affects software development kit (sd...
2,3,the latest research report financial auditing ...,of the industry. this market analysis enables ...,<pad> the financial auditing professional serv...
3,4,enterprises have been loading more of their op...,"“varmour’s platform provides the visibility, c...",<pad> varmour provides ways to manage in real ...
4,5,let’s rewind to last june. the first lockdown ...,you get the idea: back to business as usual an...,<pad> piers morgan tweets photo of rishi sunak...
5,6,adroit market research has announced the relea...,the report was developed taking into account b...,<pad> adroit market research has released a fr...


In [None]:
df_text.to_excel("/content/sample_data/Article_Summary.xlsx")  # Save the articles with both summary columns to an Excel workbook

### Generating Metadata

* Top N-Grams
* Sentiment Analysis
* Entity Extraction

In [None]:
from flair.data import Sentence
from flair.models import TextClassifier
import spacy

In [None]:
# Models used by the metadata helpers below.
sentiment_classifier = TextClassifier.load('sentiment')  # used by generate_sentiment
nlp = spacy.load("en_core_web_sm")  # spaCy pipeline used by extract_entities
# NOTE(review): words_list is not referenced by any other visible cell.
words_list = list(words.words())

2022-04-04 04:24:22,116 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


In [None]:
# function to generate sentiment of the article...
def generate_sentiment(complete_article):
    """Classify the sentiment of one article with the loaded flair classifier.

    Uses the module-level ``sentiment_classifier``.

    Args:
        complete_article: Full article text.

    Returns:
        A ``(sentiment_type, sentiment_score)`` tuple, with the score rounded
        to three decimals. ``(None, None)`` is returned when the classifier
        produces no label or raises (the traceback is printed), so callers can
        always unpack the result.
    """
    try:
        obj_sentence = Sentence(complete_article)
        sentiment_classifier.predict(obj_sentence, mini_batch_size=32)

        if not obj_sentence.labels:
            # No prediction was attached to the sentence.
            return None, None

        top_label = obj_sentence.labels[0]
        sentiment_type = top_label.value
        sentiment_score = round(float(top_label.score), 3)

        return sentiment_type, sentiment_score
    except Exception:
        print(traceback.format_exc())
        return None, None

# function to extract entities...
def extract_entities(complete_article):
    """Extract selected named entities from an article with the spaCy pipeline.

    Only entities labelled PERSON, ORG, PRODUCT, GPE, LOC or MONEY are kept.

    Args:
        complete_article: Full article text.

    Returns:
        A string ``"<entities>;<labels>"`` in which both halves are
        comma-separated and aligned by position, e.g.
        ``"Acme,Paris;Organisation,Location"``. On any exception the traceback
        is printed and ``None`` is returned.
    """
    try:
        # spaCy label -> human-readable tag; the keys double as the filter.
        readable_tags = {
            'PERSON': 'Person',
            'ORG': 'Organisation',
            'PRODUCT': 'Product',
            'GPE': 'Location',
            'LOC': 'Location',
            'MONEY': 'Monetary Value',
        }

        parsed_doc = nlp(complete_article)

        # (entity text, readable tag) pairs, in document order.
        kept_pairs = [
            (str(ent), readable_tags[str(ent.label_)])
            for ent in parsed_doc.ents
            if ent.label_ in readable_tags
        ]

        entity_part = ','.join(text for text, _ in kept_pairs)
        label_part = ','.join(tag for _, tag in kept_pairs)

        return entity_part + ';' + label_part

    except Exception as e:
        print(traceback.format_exc())

# function to generate top_n_grams...
def top_n_grams(complete_article, top_n=5):
    """Return the highest-weighted bigrams of an article.

    The article is split into sentences, a bigram TF-IDF matrix is fitted on
    those sentences, and bigrams are ranked by their summed weight across all
    sentences.

    Args:
        complete_article: Full article text.
        top_n: Maximum number of bigrams to return.

    Returns:
        The top bigrams joined by commas, highest weight first. ``''`` is
        returned when the article has no sentences or yields no bigram
        vocabulary.
    """
    tokenized_text = sent_tokenize(complete_article)
    if not tokenized_text:
        return ''

    tfidf_bigram = TfidfVectorizer(sublinear_tf=True, norm='l2', ngram_range=(2,2), stop_words='english')

    try:
        bigram_matrix = tfidf_bigram.fit_transform(tokenized_text)
    except ValueError:
        # Raised by the vectorizer when no bigram survives tokenization and
        # stop-word removal (empty vocabulary).
        return ''

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; fall back to the old name where the new one is
    # missing.
    feature_getter = getattr(tfidf_bigram, 'get_feature_names_out', None) or tfidf_bigram.get_feature_names
    features = list(feature_getter())

    # Getting top ranking features: total weight of each bigram column.
    column_sums = np.asarray(bigram_matrix.sum(axis=0)).ravel()
    ranked = sorted(zip(features, column_sums), key=lambda pair: pair[1], reverse=True)

    return ','.join(str(term) for term, _ in ranked[:max(top_n, 0)])

# Function to generate metadata...
def generate_metadata(df_text):
    """Compute sentiment, top bigrams and entities for every article.

    Args:
        df_text: Table with a 'News_Article' column holding the article texts.

    Returns:
        Four lists aligned with the articles, in this order: sentiment types,
        sentiment scores, top n-grams, entities. When generate_sentiment
        returns ``None`` for an article, both sentiment entries for that
        article are ``None``.
    """
    list_sentiment_type = []
    list_sentiment_score = []
    list_top_n_grams = []
    list_entities = []

    for article in df_text['News_Article']:
        sentiment = generate_sentiment(article)
        # A failed sentiment call must not abort the whole run when unpacking.
        sentiment_type, sentiment_score = sentiment if sentiment is not None else (None, None)

        list_sentiment_type.append(sentiment_type)
        list_sentiment_score.append(sentiment_score)
        list_top_n_grams.append(top_n_grams(article))
        list_entities.append(extract_entities(article))

    return list_sentiment_type, list_sentiment_score, list_top_n_grams, list_entities

In [None]:
# Compute the per-article metadata; each list is aligned with the rows of df_text.
list_sentiment_type, list_sentiment_score, list_top_n_grams, list_entities = generate_metadata(df_text)



In [None]:
# Attach the metadata lists to the articles table as new columns.
df_text['Sentiment_Type'] = list_sentiment_type
df_text['Sentiment_Score'] = list_sentiment_score
df_text['Top_N_Grams'] = list_top_n_grams
df_text['Entities'] = list_entities

In [None]:
# Save the articles with summaries and metadata; the relative path resolves
# against the current working directory.
df_text.to_excel("Summary_Metadata.xlsx")