In [None]:
import nltk

'''
word_tokenize - function to tokenize a given sentence into a list of words
sent_tokenize - function to tokenize a given paragraph into a list of sentences
stopwords - corpus reader that provides the list of stopwords for a given language
PorterStemmer - class implementing the Porter algorithm, which reduces a given word to its root stem
string - standard-library module whose punctuation constant is used to remove punctuation from the given text
'''
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string

In [None]:
def preprocess(givenFile):
    '''
    Tokenise, clean and stem a piece of text.

    Despite its name, givenFile is the text itself (a string): it is passed
    directly to sent_tokenize, so it must not be an open file object.

    The text is processed in this order:
      1. split into sentences,
      2. every character found in string.punctuation is deleted,
      3. each sentence is split into words,
      4. English stop words are dropped (compared case-insensitively),
      5. each remaining word is reduced to its Porter stem.

    Returns a list with one entry per sentence; each entry is the list of
    stemmed words of that sentence (possibly empty).
    '''

    # Sentence splitting happens before the punctuation is removed.
    webPageSentences = sent_tokenize(givenFile)

    # One translation table deletes all punctuation characters in a single
    # pass over each sentence.
    removePunctuation = str.maketrans("", "", string.punctuation)
    webPageRemovedPunctuations = [s.translate(removePunctuation) for s in webPageSentences]

    # Each sentence becomes the list of words that make it up.
    webPageWords = [word_tokenize(n) for n in webPageRemovedPunctuations]

    # NLTK's English stop-word list is lower-case, so the comparison is made
    # on the lower-cased word; otherwise capitalised stop words ("The",
    # "This", ...) would slip through. A set gives constant-time lookups.
    stop_words = set(stopwords.words("english"))
    webPageFilteredWords = [[word for word in s if word.lower() not in stop_words] for s in webPageWords]

    # Each remaining word is converted to its root stem.
    porter = PorterStemmer()
    webPageStemmed = [[porter.stem(word) for word in s] for s in webPageFilteredWords]

    return webPageStemmed
    

In [None]:
import csv

# Fields of the article that gets loaded below; they stay empty when the CSV
# contains no second row.
fileTitle = fileSummary = fileContent = ""

# check counts the rows skipped so far; only the very first row is skipped
# (presumably the column header - confirm against the dataset).
check = 0
with open('wikihowAll.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        if not check:
            check += 1
            continue
        # Second row of the file: its first three columns are read as
        # summary, title and content, in that order.
        fileSummary = row[0]
        fileTitle = row[1]
        fileContent = row[2]
        break

print("ARTICLE TITLE : \n {} \n\n ACTUAL CONTENT : \n {} \n\n SUMMARY : \n {} \n\n".format(fileTitle, fileContent, fileSummary))
        


In [None]:
# Stemmed, stop-word-free words of the article, one list per sentence.
preProcessedContent = preprocess(fileContent)
print(preProcessedContent)

# Average sentence length, in words per sentence. The total word count is
# divided by the number of sentences (the earlier version kept only the
# total); an article without sentences gets an average of 0.
if preProcessedContent:
    averageSentLength = sum(len(words) for words in preProcessedContent) / len(preProcessedContent)
else:
    averageSentLength = 0

# Vocabulary of the document: every distinct word over all sentences.
bagOfWords = {word for words in preProcessedContent for word in words}



In [None]:
def _vocabulary_overlap(first, second):
    '''Return len(first & second) / len(first | second) for two sets,
    or 0.0 when both sets are empty (avoids a division by zero).'''
    union = first | second
    if not union:
        return 0.0
    return len(first & second) / len(union)


def feature_vector(sentence):
    '''
    Compute the features of one preprocessed sentence.

    sentence must be one of the word lists stored in the module-level
    preProcessedContent. The function also reads the module-level fileTitle
    and averageSentLength and calls preprocess.

    Returns the tuple (f1, f4, f6, f7). Only these four of the planned ten
    features are implemented so far.

    Raises ValueError if sentence is not an element of preProcessedContent.
    '''

    # f1 = Sentence position. The first sentences are assumed to be the most
    # important, so only the first 5 positions score: 5/5, 4/5, ... 1/5.
    # index() returns the first match, so a repeated sentence is scored at
    # the position of its first occurrence.
    positionOfSentence = preProcessedContent.index(sentence)
    f1 = (5 - positionOfSentence) / 5 if positionOfSentence < 5 else 0

    # f4 = Sentence centrality: vocabulary overlap between this sentence and
    # the OTHER sentences of the document. The sentence at positionOfSentence
    # is left out of the comparison vocabulary; including it would make the
    # intersection trivially equal to the sentence's own vocabulary.
    distinctWordsInSent = set(sentence)
    wordsInOtherSentences = {
        word
        for position, other in enumerate(preProcessedContent)
        if position != positionOfSentence
        for word in other
    }
    f4 = _vocabulary_overlap(distinctWordsInSent, wordsInOtherSentences)

    # f6 = Sentence resemblance to the title: vocabulary overlap between this
    # sentence and the document title. The title goes through the same
    # preprocessing as the sentences so that stemmed words are compared with
    # stemmed words (set(fileTitle) would be a set of characters).
    distinctWordsInTitle = {word for titleWords in preprocess(fileTitle) for word in titleWords}
    f6 = _vocabulary_overlap(distinctWordsInSent, distinctWordsInTitle)

    # f7 = Sentence relative length, meant to penalize sentences that are too
    # short to belong to the summary.
    # NOTE(review): this multiplies by the module-level averageSentLength
    # exactly as before; confirm that a product (rather than a ratio) is the
    # intended formula.
    f7 = len(sentence) * averageSentLength

    return (f1, f4, f6, f7)
    
    
    
    
    
    

In [None]:
# Call feature_vector on the first preprocessed sentence of the article; the
# call stays the last expression of the cell.
firstSentence = preProcessedContent[0]
feature_vector(firstSentence)