In [86]:
import pandas as pd
import numpy as np
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [88]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SAUMITRA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SAUMITRA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [90]:
df = pd.read_csv('bbc_text_cls.csv')

In [92]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [94]:
doc = df[df.labels == 'business']['text'].sample(random_state=42)

In [96]:
def wrap(x):
  return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

In [98]:
print(wrap(doc.iloc[0]))

Christmas sales worst since 1981

UK retail sales fell in December,
failing to meet expectations and making it by some counts the worst
Christmas since 1981.

Retail sales dropped by 1% on the month in
December, after a 0.6% rise in November, the Office for National
Statistics (ONS) said.  The ONS revised the annual 2004 rate of growth
down from the 5.9% estimated in November to 3.2%. A number of
retailers have already reported poor figures for December.  Clothing
retailers and non-specialist stores were the worst hit with only
internet retailers showing any significant growth, according to the
ONS.

The last time retailers endured a tougher Christmas was 23 years
previously, when sales plunged 1.7%.

The ONS echoed an earlier
caution from Bank of England governor Mervyn King not to read too much
into the poor December figures.  Some analysts put a positive gloss on
the figures, pointing out that the non-seasonally-adjusted figures
showed a performance comparable with 2003. The Novembe

In [100]:
print(doc.iloc[0].split("\n", 1)[1])


UK retail sales fell in December, failing to meet expectations and making it by some counts the worst Christmas since 1981.

Retail sales dropped by 1% on the month in December, after a 0.6% rise in November, the Office for National Statistics (ONS) said. The ONS revised the annual 2004 rate of growth down from the 5.9% estimated in November to 3.2%. A number of retailers have already reported poor figures for December. Clothing retailers and non-specialist stores were the worst hit with only internet retailers showing any significant growth, according to the ONS.

The last time retailers endured a tougher Christmas was 23 years previously, when sales plunged 1.7%.

The ONS echoed an earlier caution from Bank of England governor Mervyn King not to read too much into the poor December figures. Some analysts put a positive gloss on the figures, pointing out that the non-seasonally-adjusted figures showed a performance comparable with 2003. The November-December jump last year was roughl

In [102]:
sents = nltk.sent_tokenize(doc.iloc[0].split("\n", 1)[1])

In [104]:
featurizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    norm='l1')

In [106]:
X = featurizer.fit_transform(sents)

In [108]:
# compute similarity matrix
S = cosine_similarity(X) # we want to define a relation between each sentence and the remainig

In [110]:
S.shape #We have 17 sentences

(17, 17)

In [28]:
len(sents)

17

In [132]:
''' Method 1'''
# normalize similarity matrix to make it a markov matrix
S /= S.sum(axis=1, keepdims=True)

In [134]:
S[0].sum()

1.0

In [136]:
# uniform transition matrix (to apply add one smoothing)
U = np.ones_like(S) / len(S)

In [138]:
U[0].sum()

1.0

In [140]:
# smoothed similarity matrix
factor = 0.15
S = (1 - factor) * S + factor * U

In [142]:
S[0].sum()

1.0

In [144]:
# find the limiting / stationary distribution
eigenvals, eigenvecs = np.linalg.eig(S.T) #We transpose as we want eigen values row-wise

In [146]:
eigenvals

array([1.        , 0.20608646, 0.61291969, 0.57497504, 0.2957161 ,
       0.29254706, 0.32868514, 0.34283527, 0.35367287, 0.37602804,
       0.54323499, 0.53173273, 0.50084186, 0.48834525, 0.41234689,
       0.43629783, 0.45029066])

In [148]:
eigenvecs[:,0]

array([-0.24054885, -0.26365315, -0.22652984, -0.27600203, -0.24940965,
       -0.24747007, -0.26969325, -0.22218512, -0.22749246, -0.23094004,
       -0.21806384, -0.22132132, -0.23646867, -0.24197939, -0.23647324,
       -0.28193641, -0.21927795])

In [150]:
eigenvecs[:,0].dot(S) #checking if our eigen values are correct

array([-0.24054885, -0.26365315, -0.22652984, -0.27600203, -0.24940965,
       -0.24747007, -0.26969325, -0.22218512, -0.22749246, -0.23094004,
       -0.21806384, -0.22132132, -0.23646867, -0.24197939, -0.23647324,
       -0.28193641, -0.21927795])

In [152]:
eigenvecs[:,0] / eigenvecs[:,0].sum() #To make sum = 1

array([0.0585356 , 0.06415784, 0.05512419, 0.06716284, 0.06069181,
       0.06021982, 0.06562765, 0.05406694, 0.05535844, 0.05619738,
       0.05306406, 0.05385674, 0.05754272, 0.05888371, 0.05754384,
       0.06860693, 0.0533595 ])

In [154]:
''' Method 2 '''
limiting_dist = np.ones(len(S)) / len(S)
threshold = 1e-8
delta = float('inf')
iters = 0
while delta > threshold:
  iters += 1

  # Markov transition
  p = limiting_dist.dot(S)

  # compute change in limiting distribution
  delta = np.abs(p - limiting_dist).sum()

  # update limiting distribution
  limiting_dist = p

print(iters)

27


In [156]:
limiting_dist

array([0.0585356 , 0.06415784, 0.05512419, 0.06716284, 0.0606918 ,
       0.06021982, 0.06562765, 0.05406694, 0.05535843, 0.05619737,
       0.05306406, 0.05385674, 0.05754272, 0.05888371, 0.05754384,
       0.06860693, 0.0533595 ])

In [158]:
limiting_dist.sum()

1.0000000000000009

In [160]:
np.abs(eigenvecs[:,0] / eigenvecs[:,0].sum() - limiting_dist).sum()

1.5113477545847243e-08

In [162]:
scores = limiting_dist

In [164]:
sort_idx = np.argsort(-scores)

In [166]:
# Many options for how to choose which sentences to include:

# 1) top N sentences
# 2) top N words
# 3) top X% sentences or top X% words
# 4) sentences with scores > average score
# 5) sentences with scores > factor * average score

# You also don't have to sort. May make more sense in order.

print("Generated summary:")
for i in sort_idx[:5]:
  print(wrap("%.2f: %s" % (scores[i], sents[i])))

Generated summary:
0.07: "The retail sales figures are very weak, but as Bank of England
governor Mervyn King indicated last night, you don't really get an
accurate impression of Christmas trading until about Easter," said Mr
Shaw.
0.07: A number of retailers have already reported poor figures for
December.
0.07: The ONS echoed an earlier caution from Bank of England governor
Mervyn King not to read too much into the poor December figures.
0.06: Retail sales dropped by 1% on the month in December, after a
0.6% rise in November, the Office for National Statistics (ONS) said.
0.06: Clothing retailers and non-specialist stores were the worst hit
with only internet retailers showing any significant growth, according
to the ONS.


In [66]:
doc.iloc[0].split("\n")[0]

'Christmas sales worst since 1981'

In [68]:
def summarize(text, factor = 0.15):
  # extract sentences
  sents = nltk.sent_tokenize(text)

  # perform tf-idf
  featurizer = TfidfVectorizer(
      stop_words=stopwords.words('english'),
      norm='l1')
  X = featurizer.fit_transform(sents)

  # compute similarity matrix
  S = cosine_similarity(X)

  # normalize similarity matrix
  S /= S.sum(axis=1, keepdims=True)

  # uniform transition matrix
  U = np.ones_like(S) / len(S)

  # smoothed similarity matrix
  S = (1 - factor) * S + factor * U

  # find the limiting / stationary distribution
  eigenvals, eigenvecs = np.linalg.eig(S.T)

  # compute scores
  scores = eigenvecs[:,0] / eigenvecs[:,0].sum()
  
  # sort the scores
  sort_idx = np.argsort(-scores)

  # print summary
  for i in sort_idx[:5]:
    print(wrap("%.2f: %s" % (scores[i], sents[i])))

In [70]:
doc = df[df.labels == 'entertainment']['text'].sample(random_state=123)
summarize(doc.iloc[0].split("\n", 1)[1])

0.11: Goodrem, Green Day and the Black Eyed Peas took home two awards
each.
0.10: As well as best female, Goodrem also took home the Pepsi Viewers
Choice Award, whilst Green Day bagged the prize for best rock video
for American Idiot.
0.10: Other winners included Green Day, voted best group, and the
Black Eyed Peas.
0.10: The Black Eyed Peas won awards for best R 'n' B video and
sexiest video, both for Hey Mama.
0.10: Local singer and songwriter Missy Higgins took the title of
breakthrough artist of the year, with Australian Idol winner Guy
Sebastian taking the honours for best pop video.


In [72]:
doc.iloc[0].split("\n")[0]

'Goodrem wins top female MTV prize'