21011101112-Sanjai
21011101119-Shiva RK

In [None]:
import re

import networkx as nx
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Corpus Creation

In [None]:
# Load the articles CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; point it at your own
# copy of Articles.csv before running this cell elsewhere.
df = pd.read_csv("C:/Users/sanja/Downloads/Articles.csv")
# Raw values of the "content" column, one entry per CSV row
# (presumably the article body text — confirm against the dataset).
data = df["content"].values

# Report how many rows were loaded.
print("Number of Articles : ", len(data))

Number of Articles :  1303


#### Pre-Processing

In [None]:
# Preprocessing function
def pre_processing(text):
    """Split ``text`` into sentences and normalise each one.

    Every sentence produced by ``sent_tokenize`` has all characters other than
    ASCII letters, digits and whitespace removed, is stripped of surrounding
    whitespace and is lower-cased.

    Returns a list with one cleaned string per tokenised sentence, in the same
    order as the tokeniser produced them.
    """
    non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
    cleaned = []
    for sentence in sent_tokenize(text):
        without_punct = non_alnum.sub('', sentence)
        cleaned.append(without_punct.strip().lower())
    return cleaned

# Build the corpus: one flat list holding the cleaned sentences of every
# document, in document order.
corpus = [sentence for doc in data for sentence in pre_processing(doc)]

### Pre-Processing on Input Text

In [None]:
# Input text: the passage to be summarised.
input_text = """
Millions go missing at China bank.
Two senior officials at one of China's top commercial banks have reportedly disappeared after funds
worth up to $120m (£64m) went missing. The pair both worked at Bank of China in the northern city
of Harbin, the South China Morning Post reported.

The latest scandal at Bank of China will do nothing to reassure foreign investors that China's
big four banks are ready for international listings. Government policy sees the bank listings as vital
economic reforms. Bank of China is one of two frontrunners in the race to list overseas. The other is
China Construction Bank. Both are expected to list abroad during 2005. They shared a $45bn state
bailout in 2003, to help clean up their balance sheets in preparation for a foreign stock market debut.
"""

# Turn the hard line breaks of the literal into spaces so sentences that wrap
# across lines are seen as continuous text by the sentence tokenizer.
input_text = input_text.replace("\n", " ")
# Original (uncleaned) sentences; these are what the summary is assembled from.
sentences = sent_tokenize(input_text)
# Cleaned version of the same text. pre_processing() runs sent_tokenize on the
# same string, so input_tok[i] is the cleaned form of sentences[i].
input_tok = pre_processing(input_text)

# Expected (reference) summary used by rouge_metrics() for the ROUGE score.
# NOTE(review): it contains sentences that do not occur in input_text above,
# so it was presumably written from a longer version of the article — confirm.
expected = """
The other is China Construction Bank. The latest scandal at Bank of China will do nothing to reassure
foreign investors that China's big four banks are ready for international listings.
Bank of China is the country's biggest foreign exchange dealer, while China Construction Bank is the
largest deposit holder. Bank of China is one of two frontrunners in the race to list overseas.
Although he committed the offences whilst running Bank of China in New York, Mr. Wang was head of
China Construction Bank when the scandal broke. Earlier this month, a China Construction Bank branch
manager was jailed for life in a separate case. The pair both worked at Bank of China in the
northern city of Harbin, the South China Morning Post reported. The most high-profile case involved
the ex-president of Bank of China, Wang Xuebing, jailed for 12 years in 2003.
Two senior officials at one of China's top commercial banks have reportedly disappeared after funds
worth up to $120m (£64m) went missing. China’s banks used to act as cash offices for state
enterprises and did not require checks on credit worthiness.
"""

# Collapse the literal's line breaks to spaces and drop the leading/trailing blanks.
expected = expected.replace("\n", " ").strip()

### ROUGE Score

In [None]:
# Same import as in the notebook's first cell; repeating it is harmless.
from rouge_score import rouge_scorer

# Reference summary for ROUGE scoring. This rebinds `expected` to the same text
# that the input-text cell already assigned.
expected = """
The other is China Construction Bank. The latest scandal at Bank of China will do nothing to reassure
foreign investors that China's big four banks are ready for international listings.
Bank of China is the country's biggest foreign exchange dealer, while China Construction Bank is the
largest deposit holder. Bank of China is one of two frontrunners in the race to list overseas.
Although he committed the offences whilst running Bank of China in New York, Mr. Wang was head of
China Construction Bank when the scandal broke. Earlier this month, a China Construction Bank branch
manager was jailed for life in a separate case. The pair both worked at Bank of China in the
northern city of Harbin, the South China Morning Post reported. The most high-profile case involved
the ex-president of Bank of China, Wang Xuebing, jailed for 12 years in 2003.
Two senior officials at one of China's top commercial banks have reportedly disappeared after funds
worth up to $120m (£64m) went missing. China’s banks used to act as cash offices for state
enterprises and did not require checks on credit worthiness.
"""

# Collapse the literal's line breaks to spaces and drop the leading/trailing blanks.
expected = expected.replace("\n", " ").strip()

def rouge_metrics(summary):
    """Print the ROUGE-1 score of ``summary`` against the reference ``expected``.

    Args:
        summary: The generated summary as a single string.

    The score is printed, not returned. Stemming is enabled, so inflected
    forms of a word count as a match.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    # RougeScorer.score takes (target, prediction): the reference goes first and
    # the generated text second. Passing them the other way round reports
    # precision and recall exchanged (the F-measure is unaffected).
    scores = scorer.score(expected, summary)
    print("Rouge Score : ", scores, end="\n\n")

### Summarize Function

In [None]:
# ROUGE Score function
def rouge_metrics(summary):
    """Print the ROUGE-1 score of ``summary`` against the reference ``expected``.

    Args:
        summary: The generated summary as a single string.

    The score is printed, not returned. Stemming is enabled, so inflected
    forms of a word count as a match.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    # RougeScorer.score takes (target, prediction): the reference goes first and
    # the generated text second. Passing them the other way round reports
    # precision and recall exchanged (the F-measure is unaffected).
    scores = scorer.score(expected, summary)
    print("Rouge Score : ", scores, end="\n\n")

# Summarize Function
def summarize(input_vec, top_k=10, source_sentences=None):
    """Rank sentences by PageRank over their cosine similarities and summarise.

    Args:
        input_vec: One vector per sentence (row i belongs to sentence i), in any
            form ``cosine_similarity`` accepts.
        top_k: Maximum number of sentences to keep. Defaults to 10, the value
            that used to be hard-coded.
        source_sentences: The original sentences that the rows of ``input_vec``
            correspond to. Defaults to the notebook-level ``sentences`` list.

    Returns:
        The selected sentences as a list, ordered from highest to lowest
        PageRank score (not in document order).

    Side effects:
        Prints the ROUGE score of the joined summary (via ``rouge_metrics``)
        and then the joined summary itself.
    """
    if source_sentences is None:
        source_sentences = sentences

    # Pairwise sentence similarity becomes the weighted adjacency matrix of a
    # graph whose node i is sentence i.
    similarity_matrix = cosine_similarity(input_vec, input_vec)
    G = nx.from_numpy_array(similarity_matrix)
    pagerank_scores = nx.pagerank(G)

    # Node ids sorted by descending PageRank score.
    ranked_nodes = sorted(pagerank_scores, key=pagerank_scores.get, reverse=True)
    summary = [source_sentences[i] for i in ranked_nodes[:top_k]]

    summary_text = " ".join(summary)
    rouge_metrics(summary_text)
    print(summary_text)
    return summary

### Vectorization

In [None]:
# Bag of Words (BoW)
# The vocabulary is learned from the whole corpus; the input sentences are then
# encoded with that same vocabulary, giving one count vector per input sentence.
bag_of_words = CountVectorizer()
corpus_bow = bag_of_words.fit_transform(corpus)
input_bow = bag_of_words.transform(input_tok)
# Rank and summarise the input sentences from their count vectors.
summary_bow = summarize(input_bow)

# TF-IDF
# Vocabulary and IDF weights are learned from the whole corpus; the input
# sentences are then encoded with them, giving one TF-IDF vector per sentence.
tf_idf = TfidfVectorizer()
corpus_idf = tf_idf.fit_transform(corpus)
input_idf = tf_idf.transform(input_tok)
# Rank and summarise the input sentences from their TF-IDF vectors.
summary_idf = summarize(input_idf)

# Continuous Bag of Words (CBOW)
# Train word vectors on the tokenised corpus. No `sg` argument is passed, so
# gensim's default training algorithm is used (CBOW according to the gensim docs).
g_model = Word2Vec(sentences=[word_tokenize(sent) for sent in corpus], vector_size=200, window=5, workers=5, epochs=500)

def get_embeddings(sent_l):
    """Return the sum of the corpus-trained word vectors for a token list.

    Args:
        sent_l: List of word tokens for one sentence.

    Returns:
        A 1-D array of length ``g_model.wv.vector_size``. Tokens missing from
        the model's vocabulary contribute a zero vector; an empty token list
        yields an all-zero vector.
    """
    dim = g_model.wv.vector_size
    if not sent_l:
        # Summing an empty array over axis 0 would give a scalar and make the
        # stacked sentence matrix ragged, so return an explicit zero vector.
        return np.zeros(dim)
    vec = np.array([g_model.wv[word] if word in g_model.wv else np.zeros(dim) for word in sent_l])
    return vec.sum(axis=0)

# One summed embedding per cleaned input sentence.
input_cbow = np.array([get_embeddings(word_tokenize(sent)) for sent in input_tok])
summary_cbow = summarize(input_cbow)

# Pretrained word vectors (GloVe, "glove-wiki-gigaword-200" from gensim's downloader)
import gensim.downloader as api
model = api.load("glove-wiki-gigaword-200")

# NOTE: this rebinds get_embeddings, so from here on it uses the pretrained
# vectors in `model` instead of the corpus-trained `g_model`.
def get_embeddings(sent_l):
    """Return the sum of the pretrained word vectors for a token list.

    Args:
        sent_l: List of word tokens for one sentence.

    Returns:
        A 1-D array of length ``model.vector_size``. Tokens missing from the
        pretrained vocabulary contribute a zero vector; an empty token list
        yields an all-zero vector.
    """
    dim = model.vector_size
    if not sent_l:
        # Summing an empty array over axis 0 would give a scalar and make the
        # stacked sentence matrix ragged, so return an explicit zero vector.
        return np.zeros(dim)
    vec = np.array([model[word] if word in model else np.zeros(dim) for word in sent_l])
    return vec.sum(axis=0)

# One summed embedding per cleaned input sentence.
input_wv = np.array([get_embeddings(word_tokenize(sent)) for sent in input_tok])
summary_wv = summarize(input_wv)

Rouge Score :  {'rouge1': Score(precision=0.5340314136125655, recall=0.7555555555555555, fmeasure=0.6257668711656441)}

Bank of China is one of two frontrunners in the race to list overseas. The pair both worked at Bank of China in the northern city of Harbin, the South China Morning Post reported. The latest scandal at Bank of China will do nothing to reassure foreign investors that China's big four banks are ready for international listings. The other is China Construction Bank.  Millions go missing at China bank. Two senior officials at one of China's top commercial banks have reportedly disappeared after funds worth up to $120m (£64m) went missing. Government policy sees the bank listings as vital economic reforms. They shared a $45bn state bailout in 2003, to help clean up their balance sheets in preparation for a foreign stock market debut. Both are expected to list abroad during 2005.
Rouge Score :  {'rouge1': Score(precision=0.5340314136125655, recall=0.7555555555555555, fmeasu

In [None]:
# Sentences selected by all three of the BoW, TF-IDF and CBOW summaries.
# NOTE(review): summary_wv (pretrained vectors) is not part of this
# intersection — confirm whether that is intentional.
shared_sentences = set(summary_bow) & set(summary_idf) & set(summary_cbow)

# Emit the shared sentences in their original document order. Turning a set
# straight into a list gives an arbitrary order that can change between runs.
# dict.fromkeys drops repeated sentences while keeping first-occurrence order.
common_sentences = [sent for sent in dict.fromkeys(sentences) if sent in shared_sentences]
final_summary = " ".join(common_sentences)
print(final_summary)