In [None]:
nltk.download('punkt') # one time execution

### Implementation of the TextRank Algorithm

In [1]:
#Import Required Libraries
import numpy as np
import pandas as pd
import nltk
import re

In [2]:
#read data
df = pd.read_csv("cleandata.csv")

In [3]:
#inspect the data
df.head()

Unnamed: 0,Text,cleaned_text
0,\nWhether you can’t get motivated to clean you...,whether get motivate clean house feel motivate...
1,\nSome people just seem to have a knack for ma...,people seem knack make conversation others str...
2,"\nEmotional intelligence, sometimes referred t...",emotional intelligence sometimes refer emotion...
3,\nIndividuals with social intelligence can sen...,individuals social intelligence sense people f...
4,\nExpertise is what separates the amateur from...,expertise separate amateur true master almost ...


In [4]:
def split_text(row):
    sentence_list = nltk.sent_tokenize(row)
    return sentence_list
df['sentences'] = df['Text'].apply(split_text)

In [5]:
df

Unnamed: 0,Text,cleaned_text,sentences
0,\nWhether you can’t get motivated to clean you...,whether get motivate clean house feel motivate...,[\nWhether you can’t get motivated to clean yo...
1,\nSome people just seem to have a knack for ma...,people seem knack make conversation others str...,[\nSome people just seem to have a knack for m...
2,"\nEmotional intelligence, sometimes referred t...",emotional intelligence sometimes refer emotion...,"[\nEmotional intelligence, sometimes referred ..."
3,\nIndividuals with social intelligence can sen...,individuals social intelligence sense people f...,[\nIndividuals with social intelligence can se...
4,\nExpertise is what separates the amateur from...,expertise separate amateur true master almost ...,[\nExpertise is what separates the amateur fro...
...,...,...,...
311,\nAnti-racism is a process of actively identif...,anti racism process actively identify oppose r...,[\nAnti-racism is a process of actively identi...
312,\nAs the nation wrestles with institutional ra...,nation wrestle institutional racism attempt he...,[\nAs the nation wrestles with institutional r...
313,"\nIn a large-scale, multi-country study lookin...",large scale multi country study look cardiovas...,"[\nIn a large-scale, multi-country study looki..."
314,\nThe Women's Preventive Services Initiative (...,women preventive service initiative wpsi recen...,[\nThe Women's Preventive Services Initiative ...


In [7]:
#importation for token
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

In [8]:
def clean_sent(l):
    new = []
    for item in l:
        #ch = re.sub(r'http\s+', '',item)
        #ch = re.sub('[^a-zA-Z]',' ',ch)
        ch = str(item).lower()
        tokens = word_tokenize(ch)
        tokens = [item for item in tokens if item not in stop_words]
        tokens = [lemma.lemmatize(word=w, pos='v') for w in tokens]
        tokens = [i for i in tokens if len(i) > 2]
        ch1 = ' '.join(tokens)
        new.append(ch1)
    return new

df['cleaned_sent'] = df['sentences'].apply(clean_sent)
df['cleaned_sent'][0]

['whether get motivate clean house feel motivate lose weight lack motivation biggest obstacle reach goals',
 'motivation complete task even start one consider possible reason struggle',
 'develop plan help motivate get',
 'keep mind every strategy work everyone—or every situation',
 'perform behavioral experiment see strategies best help reach goals',
 'sometimes motivation problem',
 'time merely symptom bigger problem',
 'example perfectionist lack motivation may stem fear complete task flawlessly',
 'address need perfect motivation likely increase',
 'time lack motivation may cause procrastinate',
 'procrastinate less motivate feel',
 'case improve motivation get work help feel better perform better',
 'important take minutes consider might trouble motivate',
 'common reason lack motivation common reason people sometimes lack motivation',
 'might find lack motivation stem issue like fear people think desire please everyone',
 'carefully consider underlie thoughts feel affect drive',

In [11]:
type(df['cleaned_sent'][0])

list

In [12]:
l =df['cleaned_sent'][0]
l[0]

'whether get motivate clean house feel motivate lose weight lack motivation biggest obstacle reach goals'

In [14]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [15]:
len(word_embeddings)

400000

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [18]:
def summary(row):
    # create vectors for our sentences
    sentence_vectors = []
    for i in row:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)
    # similarity matrix
    sim_mat = np.zeros([len(row), len(row)])
    #initialize the matrix with cosinus similarity
    for i in range(len(row)):
        for j in range(len(row)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]
    #Applying PageRank Algorithm
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    #Summary Extraction
    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(row)), reverse=True)
    # Extract top 10 sentences as the summary
    for i in range(10):
        summ = ranked_sentences[i][1]
    return summ

In [19]:
import swifter

In [20]:
df['summary'] = df['cleaned_sent'].apply(summary)

In [21]:
df

Unnamed: 0,Text,cleaned_text,sentences,cleaned_sent,summary
0,\nWhether you can’t get motivated to clean you...,whether get motivate clean house feel motivate...,[\nWhether you can’t get motivated to clean yo...,[whether get motivate clean house feel motivat...,keep mind people underestimate long something ...
1,\nSome people just seem to have a knack for ma...,people seem knack make conversation others str...,[\nSome people just seem to have a knack for m...,[people seem knack make conversation others st...,best discussions involve mixture ask question ...
2,"\nEmotional intelligence, sometimes referred t...",emotional intelligence sometimes refer emotion...,"[\nEmotional intelligence, sometimes referred ...",[emotional intelligence sometimes refer emotio...,probably things job love things hate.try focus...
3,\nIndividuals with social intelligence can sen...,individuals social intelligence sense people f...,[\nIndividuals with social intelligence can se...,[individuals social intelligence sense people ...,tactful appropriate humorous sincere conversat...
4,\nExpertise is what separates the amateur from...,expertise separate amateur true master almost ...,[\nExpertise is what separates the amateur fro...,[expertise separate amateur true master almost...,simply rehearse skills make better areas n't l...
...,...,...,...,...,...
311,\nAnti-racism is a process of actively identif...,anti racism process actively identify oppose r...,[\nAnti-racism is a process of actively identi...,[anti-racism process actively identify oppose ...,example merriam-webster dictionary define raci...
312,\nAs the nation wrestles with institutional ra...,nation wrestle institutional racism attempt he...,[\nAs the nation wrestles with institutional r...,[nation wrestle institutional racism attempt h...,finally check letter black live multilingual r...
313,"\nIn a large-scale, multi-country study lookin...",large scale multi country study look cardiovas...,"[\nIn a large-scale, multi-country study looki...",[large-scale multi-country study look cardiova...,recent study find link depression higher risk ...
314,\nThe Women's Preventive Services Initiative (...,women preventive service initiative wpsi recen...,[\nThe Women's Preventive Services Initiative ...,[women preventive service initiative wpsi rece...,preventive mental health care important preven...


In [23]:
df.to_csv("summary.csv", index = False)