In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt') # one time execution
nltk.download('stopwords')# one time execution
import re
import os,glob


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jahan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jahan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [51]:
# Extract word vectors
word_embeddings = {}
f = open("../model/glove.6B.50d.txt",'r', errors = 'ignore', encoding='utf8')


#f = open('../model/glove.6B.50d.txt', encoding='windows-1252')
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        f.__next__()
    word_embeddings[word] = coefs
f.close()

StopIteration: 

In [39]:
# text cleaning
def preprocessing(sentences):
    # remove punctuations, numbers and special characters
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

    # make alphabets lowercase
    clean_sentences = [s.lower() for s in clean_sentences]
    return clean_sentences

In [4]:
# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [45]:
def create_sentence_vectors(clean_sentences):
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((50,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((50,))
        sentence_vectors.append(v)
    return sentence_vectors

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

# find similarity between sentences using cosine-similarity
def create_similarity_matrix(sentences, sentence_vectors):
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,50), sentence_vectors[j].reshape(1,50))[0,0]
            
    return sim_mat

In [58]:
import networkx as nx

# convert similarity matrix into a graph using page rank algorithm. 
# Nodes of the graph will be sentences and endges will be similarity scores
# extract the top N scored sentences
def page_rank(sim_mat, sentences, summary_length):
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
    final_sentences=''
    for i in range(summary_length):
        final_sentences += ''.join(ranked_sentences[i][1])
    return final_sentences
    

In [125]:
# split the the text in the articles into sentences
# Read the CSV file
import json
from time import sleep


def create_summary(filename, summary_length):
        sentences = []
        title = []
        for line in open(filename, 'r'):
            json_entry = (json.loads(line))
            if line != '':
                    #break the sentences into individual sentences
                    sentences.append(sent_tokenize(json_entry['text']))
                    title.append(json_entry['title'])
        # flatten the list
        sentences = [y for x in sentences for y in x]
        summary_length = int((summary_length/100) * len(sentences))
        clean_sentences = preprocessing(sentences)
        sentences_vector = create_sentence_vectors(clean_sentences)
        sim_matrix = create_similarity_matrix(sentences, sentences_vector)
        summary = page_rank(sim_matrix, sentences, summary_length)
        
        return title[0], summary

In [126]:
title, summary = create_summary("./huffpost4491/huffpost8.json", 50)

In [128]:
title

"Mike Myers Reveals He'd 'Like To' Do A Fourth Austin Powers Film"

In [129]:
summary

'In an interview with People, Myers says he’s having the “happiest time” of his life raising his three kids, but that even they are eager for him to get back into movies, specifically as his character Austin Powers.The “Saturday Night Live” alum, who turned 55 on Friday, last played the British spy in 2002′s “Goldmember.” Myers said it was his son Spike who asked him if there was going to be a new Austin Powers movie.“I think I may have found the right balance now, and I’m super thrilled.”\n\nEarlier this month, Myers appeared on Jimmy Kimmel, performing as Trump’s physician Dr. Harold Bornstein, and he killed it.Actor Mike Myers is currently starring with Margot Robbie in the film “Terminal,” and after about a five-year break from films, it looks like he’s here to stay.“I have a lot of stuff coming,” said Myers.'