# Importing libraries

In [None]:
import glob

In [None]:
import nltk,re,string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Lemmatizing the text

In [None]:
from nltk.stem import WordNetLemmatizer 

class LemmaTokenizer:
    """Callable tokenizer that splits text into words and lemmatizes each one.

    Instances can be passed wherever a ``tokenizer`` callable is expected:
    calling the instance with a string returns a list of lemmatized tokens.
    """

    def __init__(self):
        # One lemmatizer is created up front and reused for every call.
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the lemmatized word tokens of the string ``articles``."""
        lemmatize = self.lemmatizer.lemmatize
        return list(map(lemmatize, word_tokenize(articles)))

# Merging acronyms in a given sentence

In [None]:
def merge_acronyms(s):
    """Collapse dotted uppercase acronyms in ``s``, e.g. ``M.I.T`` -> ``MIT``.

    The period is removed after every single uppercase ASCII letter that is
    not directly preceded by a word character. All other text, including
    lowercase dotted sequences, is returned unchanged.
    """
    # The lookbehind keeps ordinary words that merely end in a capital letter
    # followed by a period (e.g. "AB.") from being touched.
    dotted_capital = re.compile(r'(?<!\w)([A-Z])\.')
    return dotted_capital.sub(lambda match: match.group(1), s)

# Preprocessing to remove unwanted punctuation

In [None]:
def preprocessing(line):
    """Normalise one piece of text before tokenization.

    Steps, in order: replace every digit with a space, merge dotted acronyms
    (``M.I.T`` -> ``MIT``), lowercase the text, then delete brackets, hyphens,
    ellipses and underscores.

    Args:
        line: The raw text to clean.

    Returns:
        The cleaned, lowercased text.
    """
    # Digits carry no meaning for the summary; a space keeps neighbours apart.
    line = re.sub(r"[0-9]", " ", line)
    # Acronyms must be merged while the capitals are still present:
    # merge_acronyms only matches uppercase letters, so lowercasing first
    # (as this function used to) made the merge a silent no-op.
    line = merge_acronyms(line)
    line = line.lower()
    # Removed one after another, in this order, so that an ellipsis formed by
    # an earlier removal (e.g. ".-..") is also stripped.
    for unwanted in ('[', ']', '{', '}', '(', ')', '-', '...', '_'):
        line = line.replace(unwanted, '')
    return line

# Ranking the sentences according to the Tf-idf scores

In [None]:
def maxTfIdfIndex(sentence_list, feature_names, tfidf):
    """Return the position in ``sentence_list`` of the best-scoring sentence.

    Each term's weight is its tf-idf value summed over every row of
    ``tfidf``. A sentence's score is the sum of the weights of its lemmatized
    tokens (one contribution per occurrence) divided by the sum of all term
    weights:

        score = sum(tf-idf of the sentence's words)
                / sum(tf-idf of all words of the document)

    Args:
        sentence_list: Raw sentences; each one is cleaned with
            ``preprocessing`` and tokenized with ``LemmaTokenizer`` here.
        feature_names: Sequence mapping a column index of ``tfidf`` to its
            term.
        tfidf: Matrix of tf-idf values supporting ``.nonzero()`` and
            ``[row, column]`` indexing.

    Returns:
        The index, into ``sentence_list``, of the first sentence with the
        highest score. When no sentence contains a weighted term every score
        is 0 and index 0 is returned.

    Raises:
        ValueError: If ``sentence_list`` is empty.
    """
    if len(sentence_list) == 0:
        raise ValueError("sentence_list must contain at least one sentence")

    # Aggregate term weights over the whole matrix. Reading only the first
    # row would give the first sentence every weighted term by construction.
    word_frequencies = {}
    rows, cols = tfidf.nonzero()
    for row, col in zip(rows, cols):
        term = feature_names[col]
        word_frequencies[term] = word_frequencies.get(term, 0.0) + tfidf[row, col]

    sum_all_words = sum(word_frequencies.values())
    lemma = LemmaTokenizer()

    # One score per input sentence, in input order, so the winning position
    # can be used directly as an index into sentence_list (and into any
    # parallel per-sentence list the caller holds).
    sentence_scores = []
    for sent in sentence_list:
        tokens = lemma(preprocessing(sent))
        score = sum(word_frequencies.get(token, 0.0) for token in tokens)
        if sum_all_words:
            # Skipped when the total is 0: all scores are 0 then anyway.
            score /= sum_all_words
        sentence_scores.append(score)

    # max() keeps the first of equal candidates, i.e. the earliest sentence.
    return max(range(len(sentence_scores)), key=sentence_scores.__getitem__)

# Determining the optimal number of clusters using Silhouette Analysis (the rule-of-thumb upper bound is currently disabled)

In [None]:
from sklearn.metrics import silhouette_score
import math

def cluster_no(X, text_size):
    """Choose the K-Means cluster count with the best silhouette score.

    K-Means is fitted on ``X`` once for every candidate ``k`` in
    ``range(2, text_size)``. Each clustering is scored with the euclidean
    silhouette score, the scores are plotted, and the best ``k`` is returned.
    Every candidate is printed as it is tried.

    Args:
        X: Feature matrix to cluster; it is what K-Means is fitted on and
            what the silhouette score is computed over.
        text_size: Exclusive upper bound for the candidate cluster counts.

    Returns:
        The candidate ``k`` (always >= 2) with the highest silhouette score,
        the smallest such ``k`` on ties. Returns 1 when ``text_size`` is too
        small to offer any candidate.
    """
    no_of_clusters = text_size
    candidates = range(2, no_of_clusters)

    # Positions 0 and 1 are placeholders so the plot's x-axis starts at zero;
    # they are never eligible as a result.
    score_list = [0] * no_of_clusters
    for k in candidates:
        print(k, end=" ")
        kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1).fit(X)
        # Label the same matrix that was fitted and is scored below, rather
        # than relying on a global variable that happens to hold it.
        y_pred = list(kmeans.predict(X))
        score_list[k] = silhouette_score(X, y_pred, metric='euclidean')

    # Visualizing the Silhouette score.
    x_values = list(range(no_of_clusters))
    plt.scatter(x_values, score_list, c="r")
    plt.plot(x_values, score_list, c="r")
    plt.xlabel("Number of clusters")
    plt.ylabel("Silhouette Score")
    plt.title("Silhouette Score vs no. of clusters")
    plt.show()

    if not candidates:
        return 1
    # Only real candidates compete: if every silhouette score is negative the
    # placeholder zeros must not win, since 0 is not a valid cluster count.
    return max(candidates, key=score_list.__getitem__)

# Reading multiple documents from a user-supplied directory

In [None]:
# Load every non-empty .txt file from a user-supplied directory.
all_doc_data = []
path = input("Enter path:")
# glob returns matches in arbitrary order; sorting keeps the document
# numbering printed by later cells stable between runs.
for filename in sorted(glob.glob(path+'/*.txt')):
    # An explicit encoding avoids platform-dependent decoding; undecodable
    # bytes are replaced so one bad file does not abort the whole batch.
    with open(filename, 'r', encoding='utf-8', errors='replace') as file:
        text = file.read()
    if text:
        all_doc_data.append(text)
print(len(all_doc_data))

# Performing the actual summarization of each individual document according to its TF-IDF scores

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Summarize each document: drop sentences already seen, cluster the rest with
# K-Means on their tf-idf vectors, and keep the cluster that contains the
# top-ranked sentence.
clusters_list = []      # cluster count chosen per document; None when not clustered
count_lines_doc = 0     # total sentence count over all documents, before de-duplication
sent_lines = []         # every distinct sentence seen so far, in first-seen order
final_sum = []          # sentences selected for the combined summary
# Set twin of sent_lines so the duplicate check is O(1) instead of a list scan.
seen_sentences = set()

for doc_idx, doc_text in enumerate(all_doc_data):
    print("Document  {}".format(doc_idx+1))
    print()
    sentenc_list = nltk.sent_tokenize(doc_text)
    count_lines_doc += len(sentenc_list)
    print("Number of Lines in Actual text")
    print(len(sentenc_list))

    # Keep only sentences not already seen in this or an earlier document.
    sentences_list = []
    for sentence in sentenc_list:
        if sentence not in seen_sentences:
            seen_sentences.add(sentence)
            sent_lines.append(sentence)
            sentences_list.append(sentence)

    if len(sentences_list) <= 2:
        # Too few new sentences to cluster: keep all of them as the summary.
        final_sum.extend(sentences_list)
        print("Number of Lines in Summarized text ")
        print(len(sentences_list))
        print(' '.join(sentences_list))
        clusters_list.append(None)
        continue

    # min_df=1 keeps every term, exactly like the former min_df=0, but is
    # also accepted by scikit-learn releases that validate the parameter.
    vectorizer = TfidfVectorizer(analyzer='word', min_df=1, preprocessor=preprocessing,
                                 stop_words='english', tokenizer=LemmaTokenizer(),
                                 sublinear_tf=True)
    tfidf = vectorizer.fit_transform(sentences_list)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Clustering the sentences using KMeans clustering
    # Guard against a non-positive result, which KMeans would reject.
    n_cluster = max(1, cluster_no(tfidf, len(sentences_list)))
    clusters_list.append(n_cluster)
    print("Optimal cluster number is ",n_cluster)
    print()
    # A single fit_predict both fits the model and labels the sentences;
    # fitting twice only repeated the work with a fresh random start.
    kmeans = KMeans(n_clusters=n_cluster, init='k-means++', max_iter=100, n_init=1)
    kmean_indices = list(kmeans.fit_predict(tfidf))

    # The summary is the whole cluster that holds the top-ranked sentence.
    ind = maxTfIdfIndex(sentences_list, feature_names, tfidf)
    index = kmean_indices[ind]
    sent = [sentence for sentence, label in zip(sentences_list, kmean_indices)
            if label == index]

    print("Number of Lines in Summarized text ")
    print(len(sent))
    print()

    final_sum.extend(sent)
    summary = ' '.join(sent)
    print(summary)
    print()

print("No. of lines in actual document is ",count_lines_doc)
print("No of lines in summarized document is ",len(final_sum))
print("Final summary of the Multi-Document is ")
print(' '.join(final_sum))

# Visualizing the data

In [None]:
from sklearn.decomposition import PCA

# Plot, for every clustered document, its sentences and the K-Means centroids
# projected onto the first two principal components of the tf-idf vectors.
# K-Means is re-fitted here without a fixed random_state, so the clusters
# drawn may differ from the ones produced when the summary was built.
for doc_idx, doc_text in enumerate(all_doc_data):
    print("Document  {}".format(doc_idx+1))
    print()
    n_clusters = clusters_list[doc_idx]
    if n_clusters is None:
        print("Visualization is not possible for text with redundant data")
        continue

    sentences_list = nltk.sent_tokenize(doc_text)

    # vectorizing the Document
    # min_df=1 keeps every term, exactly like the former min_df=0, but is
    # also accepted by scikit-learn releases that validate the parameter.
    vectorizer = TfidfVectorizer(analyzer='word', min_df=1, preprocessor=preprocessing,
                                 stop_words='english', tokenizer=LemmaTokenizer(),
                                 sublinear_tf=True)
    tfidf = vectorizer.fit_transform(sentences_list)

    # Clustering the sentences using KMeans clustering (fitted once)
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1)
    kmean_indices = kmeans.fit_predict(tfidf)

    # PCA needs a dense array; toarray() returns an ndarray, whereas
    # todense() returns np.matrix, which recent scikit-learn rejects.
    pca = PCA(n_components=2)
    scatter_plot_points = pca.fit_transform(tfidf.toarray())
    x_axis = scatter_plot_points[:, 0]
    y_axis = scatter_plot_points[:, 1]

    # Blue is reserved for the centroids. White is left out because it would
    # be invisible on the default white background.
    palette = ["r", "m", "g", "c", "y", "k", "tab:orange"]
    point_colors = [palette[label % len(palette)] for label in kmean_indices]
    fig, ax = plt.subplots(figsize=(8,4))
    ax.scatter(x_axis, y_axis, c=point_colors)

    # Plotting centroids in the subplot
    centers2D = pca.transform(kmeans.cluster_centers_)
    ax.scatter(centers2D[:,0], centers2D[:,1], marker='x', s=200, linewidths=3, c="b")

    plt.show()

#  Deleting variables after use

In [None]:
# Drop the large per-run collections so their memory can be reclaimed.
# Targets are deleted left to right, in the same order as before.
del all_doc_data, clusters_list, count_lines_doc, sent_lines, final_sum

In [None]:
import gc

In [None]:
# Run a garbage collection pass and report the number gc.collect() returns.
collected = gc.collect()
report = "Garbage collector: collected {} objects.".format(collected)
print(report)