In [50]:
import pandas as pd
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [51]:
from pathlib import Path
import re

file_paths = ['savedrecs1.txt', 'savedrecs2.txt', 'savedrecs3.txt', 'savedrecs4.txt', 'savedrecs5.txt']

texts = ''
for file_path in file_paths:
    text = Path(file_path).read_text(encoding='utf-8')
    texts += text

In [52]:
final_text = texts.strip('\ufeffPT\tDT\tAU\tAA\tED\tCA\tSP\tPN\tAE\tTI\tFT\tSO\tLA\tLS\tU1\tU2\tAB\tC1\tRI\tOI\tPA\tSC\tPI\tSS\tID\tCN\tPY\tVL\tIS\tBP\tEP\tSN\tBN\tNR\tPG\tDI\tOA\tHC\tHP\tDA\tUT\nJ\t\t')

In [53]:
new2 = final_text.split('.\t\t\t\t\t\t\t\t\t\t')
list1 = [new2[0]]
list2 = []
for i in range(1, len(new2)-1):
    list1.append(new2[i].split('\nJ\t\t')[1])
    if i == len(new2)-2:
        list1.append(new2[i].split('\nJ\t\t')[2])
        list1.append(new2[i].split('\nJ\t\t')[3])
        

for i in list1:
    list2.append(re.sub(r"[\t]+", "\t", i))

In [54]:
data = {
    'Authors': [],
    'Journal': [],
    'Article': [],
    'Abstract': []
}

In [55]:
for string in list2:
    section = string.split('\t')
    data['Authors'].append(section[0])
    data['Article'].append(section[1])
    data['Journal'].append(section[2])
    data['Abstract'].append(section[3])

In [56]:
df = pd.DataFrame(data)

In [8]:
df

Unnamed: 0,Authors,Journal,Article,Abstract
0,u-Xiao Wang; Yue Xin; Jun-Yi Yin; Xiao-Jun Hua...,Food Chemistry,Revealing the architecture and solution proper...,Macrolepiota albuminosa (Berk.) Pegler is abun...
1,Yu-Xiao Wang; Yue Xin; Xiao-Jun Huang; Jun-Yi ...,Food Chemistry,A branched galactoglucan with flexible chains ...,A homogeneous galactoglucan was purified from ...
2,Yu-Xiao Wang; Ting Zhang; Jun-Yi Yin; Xiao-Jun...,Food Hydrocolloids,Structural characterization and rheological pr...,A homogeneous beta-glucan (JHMP-70) was obtain...
3,Yu-Xin Gu; Tian-Ci Yan; Zi-Xuan Yue; Min-Hui L...,Food Analytical Methods,Dispersive micro-solid-phase extraction of aca...,A novel dispersive micro-solid-phase extractio...
4,Yu-Xue Xu; Ze-Dong Jiang; Xi-Ping Du; Ming-Jin...,Food Chemistry,The identification of biotransformation pathwa...,The yeast Saccharomyces cerevisiae is effectiv...
...,...,...,...,...
3121,Anbuhkani Muniandy; Patnarin Benyathiar; Dharm...,Foods,Dynamic thermal properties estimation using se...,Thermal conductivity determination of food at ...
3122,Anchalee Ruengdech; Ubonrat Siripatrawan,LWT -- Food Science and Technology,Application of catechin nanoencapsulation with...,The efficiency of ultrasonic nanoemulsion to i...
3123,"Ancora, D.; Milavec, J.; Gradisek, A.; Cifelli...",Journal of Agricultural and Food Chemistry,Sensitivity of proton NMR relaxation and proto...,"Olive oils and, in particular, extra-virgin ol..."
3124,"Andersen, B. V.; Chan, R. C. K.; Byrne, D. V.",Foods,A conceptual framework for multi-dimensional m...,"In modern times, the majority of food intake i..."


#  -----------------------------------------------------------------------------------------------------------------
## Summary

In [9]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

nlp = spacy.load('en_core_web_lg')

In [10]:
summaries = []
for abstracts in list(df.Abstract.values):
    
    s=' '
    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r'\w+\.*')
    new_text = s.join(tokenizer.tokenize(abstracts))
    new_text = nlp(new_text)
    
    tokens = [token.text for token in new_text]

    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer(stop_words='english')

    vec = tfidf.fit_transform([new_text.text])
    feature_names = tfidf.get_feature_names()

    sentence_tokens = [sent for sent in new_text.sents]
    vec.toarray()[0]                              #returns the tfidf values of the words
    #feature_names.index(word.text)                  #this returns the index of that word(word.text)
    #vec.toarray()[0][feature_names.index(word.text) together it returns the tfidf value of that particular word

    sentence_scores = {}
    for sent in sentence_tokens:
        score=0
        for word in sent:
            if word.text in feature_names:
                score = score + vec.toarray()[0][feature_names.index(word.text)]
                sentence_scores[sent] = score

    from heapq import nlargest
    select_length = int(len(sentence_tokens)*0.25)
    summary = nlargest(select_length, sentence_scores, key = sentence_scores.get)
#     for sent in summary:
#         print(str(sent))

    sorted_summary=[]

    for i in sentence_tokens:
        if i in summary:
            sorted_summary.append(i.text)
    
    sorted_summary = ''.join(sorted_summary)
    
    summaries.append(sorted_summary)

In [11]:
df['Summary'] = summaries

In [12]:
df

Unnamed: 0,Authors,Journal,Article,Abstract,Summary
0,u-Xiao Wang; Yue Xin; Jun-Yi Yin; Xiao-Jun Hua...,Food Chemistry,Revealing the architecture and solution proper...,Macrolepiota albuminosa (Berk.) Pegler is abun...,In this study water extracted polysaccharides ...
1,Yu-Xiao Wang; Yue Xin; Xiao-Jun Huang; Jun-Yi ...,Food Chemistry,A branched galactoglucan with flexible chains ...,A homogeneous galactoglucan was purified from ...,A homogeneous galactoglucan was purified from ...
2,Yu-Xiao Wang; Ting Zhang; Jun-Yi Yin; Xiao-Jun...,Food Hydrocolloids,Structural characterization and rheological pr...,A homogeneous beta-glucan (JHMP-70) was obtain...,Results indicated JHMP 70 was a branched beta ...
3,Yu-Xin Gu; Tian-Ci Yan; Zi-Xuan Yue; Min-Hui L...,Food Analytical Methods,Dispersive micro-solid-phase extraction of aca...,A novel dispersive micro-solid-phase extractio...,A novel dispersive micro solid phase extractio...
4,Yu-Xue Xu; Ze-Dong Jiang; Xi-Ping Du; Ming-Jin...,Food Chemistry,The identification of biotransformation pathwa...,The yeast Saccharomyces cerevisiae is effectiv...,Sensory evaluation GC MS GC MS O and odor acti...
...,...,...,...,...,...
3121,Anbuhkani Muniandy; Patnarin Benyathiar; Dharm...,Foods,Dynamic thermal properties estimation using se...,Thermal conductivity determination of food at ...,The objective of this study was to determine t...
3122,Anchalee Ruengdech; Ubonrat Siripatrawan,LWT -- Food Science and Technology,Application of catechin nanoencapsulation with...,The efficiency of ultrasonic nanoemulsion to i...,The catechin nanoemulsions CaNE were fabricate...
3123,"Ancora, D.; Milavec, J.; Gradisek, A.; Cifelli...",Journal of Agricultural and Food Chemistry,Sensitivity of proton NMR relaxation and proto...,"Olive oils and, in particular, extra-virgin ol...",Self diffusion coefficients on this set of oli...
3124,"Andersen, B. V.; Chan, R. C. K.; Byrne, D. V.",Foods,A conceptual framework for multi-dimensional m...,"In modern times, the majority of food intake i...",Various factors have been found to influence t...


#  -----------------------------------------------------------------------------------------------------------------
## Topic modelling

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

from sklearn.decomposition import LatentDirichletAllocation
LDA = LatentDirichletAllocation(n_components=1,random_state=42)  #n_components is the number of topics
                                                                 #and each word has value(kind of probabilty) assigned to each of those topics

In [14]:

final_topics = []
for abstracts in list(df.Abstract.values):
    vec = tfidf.fit_transform([abstracts])
    LDA.fit(vec)
    
    single_topic = LDA.components_
    single_topic.argsort()[0][-7:]             #grabing the top five words with highest values 
    
    top_word_indices = single_topic.argsort()[0][-7:]
    potential_topics = []
    for index in top_word_indices:
        
        potential_topics.append(tfidf.get_feature_names()[index])
    final_topics.append(potential_topics)

In [15]:
df['Potential_topics'] = final_topics


In [16]:
df

Unnamed: 0,Authors,Journal,Article,Abstract,Summary,Potential_topics
0,u-Xiao Wang; Yue Xin; Jun-Yi Yin; Xiao-Jun Hua...,Food Chemistry,Revealing the architecture and solution proper...,Macrolepiota albuminosa (Berk.) Pegler is abun...,In this study water extracted polysaccharides ...,"[composition, albuminosa, properties, solution..."
1,Yu-Xiao Wang; Yue Xin; Xiao-Jun Huang; Jun-Yi ...,Food Chemistry,A branched galactoglucan with flexible chains ...,A homogeneous galactoglucan was purified from ...,A homogeneous galactoglucan was purified from ...,"[structure, polysaccharides, albuminosa, beta,..."
2,Yu-Xiao Wang; Ting Zhang; Jun-Yi Yin; Xiao-Jun...,Food Hydrocolloids,Structural characterization and rheological pr...,A homogeneous beta-glucan (JHMP-70) was obtain...,Results indicated JHMP 70 was a branched beta ...,"[chain, concentration, viscosity, behavior, jh..."
3,Yu-Xin Gu; Tian-Ci Yan; Zi-Xuan Yue; Min-Hui L...,Food Analytical Methods,Dispersive micro-solid-phase extraction of aca...,A novel dispersive micro-solid-phase extractio...,A novel dispersive micro solid phase extractio...,"[adsorbent, analytes, dmspe, good, method, ext..."
4,Yu-Xue Xu; Ze-Dong Jiang; Xi-Ping Du; Ming-Jin...,Food Chemistry,The identification of biotransformation pathwa...,The yeast Saccharomyces cerevisiae is effectiv...,Sensory evaluation GC MS GC MS O and odor acti...,"[dehydrogenases, pathways, pentylfuran, produc..."
...,...,...,...,...,...,...
3121,Anbuhkani Muniandy; Patnarin Benyathiar; Dharm...,Foods,Dynamic thermal properties estimation using se...,Thermal conductivity determination of food at ...,The objective of this study was to determine t...,"[dependent, models, r12b10t1, experiments, tem..."
3122,Anchalee Ruengdech; Ubonrat Siripatrawan,LWT -- Food Science and Technology,Application of catechin nanoencapsulation with...,The efficiency of ultrasonic nanoemulsion to i...,The catechin nanoemulsions CaNE were fabricate...,"[antioxidant, activity, catechin, ph, cane, cm..."
3123,"Ancora, D.; Milavec, J.; Gradisek, A.; Cifelli...",Journal of Agricultural and Food Chemistry,Sensitivity of proton NMR relaxation and proto...,"Olive oils and, in particular, extra-virgin ol...",Self diffusion coefficients on this set of oli...,"[measured, nmr, diffusion, vegetable, oil, oli..."
3124,"Andersen, B. V.; Chan, R. C. K.; Byrne, D. V.",Foods,A conceptual framework for multi-dimensional m...,"In modern times, the majority of food intake i...",Various factors have been found to influence t...,"[eating, response, related, scale, hedonic, pl..."


#  -----------------------------------------------------------------------------------------------------------------
## Authors with most publicaitons 

In [57]:
df['Authors'] = df['Authors'].str.split(';')

df2 = df.explode('Authors')

In [18]:
df4 = df2.Authors.value_counts(ascending = False).nlargest(10)

In [19]:
df4

 Puente, S. L.      69
 Petkova, M.        69
 Kouba, M.          69
 Marcon, F.         68
 Durjava, M. F.     68
 Christensen, H.    68
 Mayo, B.           68
Bampidis, V.        68
 Sanz, Y.           68
 Dusemund, B.       68
Name: Authors, dtype: int64

In [20]:
df4.to_csv('author.csv')

#  -----------------------------------------------------------------------------------------------------------------
## Most number of Journals 

In [21]:
csv_file = pd.DataFrame(df.Journal.value_counts().nlargest(10))

In [22]:
list4 = list(csv_file.index)

In [23]:
list4

['Food Chemistry',
 'LWT -- Food Science and Technology',
 'Foods',
 'Food & Function',
 'Journal of Food Processing and Preservation',
 'Food Research International',
 'Journal of Agricultural and Food Chemistry',
 'Food Hydrocolloids',
 'Journal of the Science of Food and Agriculture',
 'Food Control']

In [24]:
# csv_file.to_csv('journal.csv')

#  -----------------------------------------------------------------------------------------------------------------
## Similarity between abstracts

### Cosine similarity with BERT

In [25]:
df100 = df.head(100)

In [26]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(text1, text2):
    embeddings = model.encode([text1, text2])
    similarity_score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return similarity_score

def find_similar_articles(df, threshold):
    similar_pairs = []
    n = len(df)

    for i in range(n):
        for j in range(i+1, n):
            abstract1 = df.loc[i, 'Abstract']
            abstract2 = df.loc[j, 'Abstract']

            similarity_score = calculate_similarity(abstract1, abstract2)

            if similarity_score > threshold:
                article1 = df.loc[i, 'Article']
                article2 = df.loc[j, 'Article']
                similar_pairs.append((article1, article2))

    return similar_pairs

model = SentenceTransformer('bert-base-nli-mean-tokens')

threshold = 0.70
similar_article_pairs = find_similar_articles(df100, threshold)
print(similar_article_pairs)

[('Revealing the architecture and solution properties of polysaccharide fractions from Macrolepiota albuminosa (Berk.) Pegler.', 'A branched galactoglucan with flexible chains from the basidioma of Macrolepiota albuminosa (Berk.) Pegler.'), ('Revealing the architecture and solution properties of polysaccharide fractions from Macrolepiota albuminosa (Berk.) Pegler.', 'Structural characterization and rheological properties of an alkali-extracted beta-glucan from Hypsizygus marmoreus.'), ('Revealing the architecture and solution properties of polysaccharide fractions from Macrolepiota albuminosa (Berk.) Pegler.', 'The identification of biotransformation pathways for removing fishy malodor from Bangia fusco-purpurea using fermentation with Saccharomyces cerevisiae.'), ('Revealing the architecture and solution properties of polysaccharide fractions from Macrolepiota albuminosa (Berk.) Pegler.', 'Major royal jelly proteins alleviate non-alcoholic fatty liver disease in mice model by regulati

In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    similarity_matrix = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    cosine_similarity_score = similarity_matrix[0][0]
    return cosine_similarity_score

def find_similar_articles(df):
    similar_pairs = []
    n = len(df)
    
    for i in range(n):
        for j in range(i+1, n):
            abstract1 = df.loc[i, 'Abstract']
            abstract2 = df.loc[j, 'Abstract']
            
            similarity_score = calculate_cosine_similarity(abstract1, abstract2)
            
            if similarity_score > 0.70:
                article1 = df.loc[i, 'Article']
                article2 = df.loc[j, 'Article']
                similar_pairs.append((article1, article2))
        
    return similar_pairs

similar_article_pairs = find_similar_articles(df100)
print(similar_article_pairs)

[]


### Jaccard similarity

In [28]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

def preprocess_text(text):
    text = text.lower()
    
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    words = nltk.word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    preprocessed_text = ' '.join(words)
    
    return preprocessed_text

def jaccard_similarity(set1, set2):
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)

def find_similar_articles(df, threshold):
    similar_pairs = []
    n = len(df)
    
    preprocessed_abstracts = [preprocess_text(abstract) for abstract in df['Abstract']]
    
    for i in range(n):
        for j in range(i+1, n):
            abstract1 = preprocessed_abstracts[i]
            abstract2 = preprocessed_abstracts[j]
            
            similarity_score = jaccard_similarity(set(abstract1.split()), set(abstract2.split()))
            
            if similarity_score > threshold:
                article1 = df.loc[i, 'Article']
                article2 = df.loc[j, 'Article']
                similar_pairs.append((article1, article2))
    
    return similar_pairs

threshold = 0.70
similar_article_pairs = find_similar_articles(df100, threshold)
print(similar_article_pairs)


[]


In [29]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import spacy

def calculate_similarity(text1, text2):
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    similarity_score = doc1.similarity(doc2)
    return similarity_score

def find_similar_articles(df, threshold):
    similar_pairs = []
    n = len(df)
    
    for i in range(n):
        for j in range(i+1, n):
            abstract1 = df.loc[i, 'Abstract']
            abstract2 = df.loc[j, 'Abstract']
            
            similarity_score = calculate_similarity(abstract1, abstract2)
            
            if similarity_score > threshold:
                article1 = df.loc[i, 'Article']
                article2 = df.loc[j, 'Article']
                similar_pairs.append((article1, article2))
    
    return similar_pairs

threshold = 0.70
similar_article_pairs = find_similar_articles(df100, threshold)
print(similar_article_pairs)


[('Revealing the architecture and solution properties of polysaccharide fractions from Macrolepiota albuminosa (Berk.) Pegler.', 'A branched galactoglucan with flexible chains from the basidioma of Macrolepiota albuminosa (Berk.) Pegler.'), ('Revealing the architecture and solution properties of polysaccharide fractions from Macrolepiota albuminosa (Berk.) Pegler.', 'Structural characterization and rheological properties of an alkali-extracted beta-glucan from Hypsizygus marmoreus.'), ('Revealing the architecture and solution properties of polysaccharide fractions from Macrolepiota albuminosa (Berk.) Pegler.', 'Dispersive micro-solid-phase extraction of acaricides from fruit juice and functional food using cucurbituril as sorbent.'), ('Revealing the architecture and solution properties of polysaccharide fractions from Macrolepiota albuminosa (Berk.) Pegler.', 'The identification of biotransformation pathways for removing fishy malodor from Bangia fusco-purpurea using fermentation with 

#  -----------------------------------------------------------------------------------------------------------------
## Clustering

In [30]:
df50 = df.head(50)

In [31]:
abstracts = df50.Abstract.tolist()

In [32]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
import string

def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    
    tokens = [token for token in tokens if token not in string.punctuation]
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    processed_text = ' '.join(tokens)
    
    return processed_text

def cluster_abstracts(abstracts, num_clusters):
    processed_abstracts = [preprocess_text(abstract) for abstract in abstracts]
    
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_abstracts)

    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)
    labels = kmeans.labels_

    clusters = {}
    for i, label in enumerate(labels):
        if label not in clusters:
            clusters[label] = []
        clusters[label].append(abstracts[i])

    return clusters

num_clusters = 10
result = cluster_abstracts(abstracts, num_clusters)

for label, cluster in result.items():
    print(f"Cluster {label+1}:")
    for abstract in cluster:
        print(abstract)
    print()


Cluster 9:
Macrolepiota albuminosa (Berk.) Pegler is abundant in active polysaccharides, but little is known about their structures and solution properties. In this study, water-extracted polysaccharides from M. albuminosa (MAWP) were purified into three fractions with structural heterogeneity, which was attributed to the diversity in molecular weight, monosaccharide composition and linkage patterns, further affecting their solution properties. Methylation and NMR analysis revealed MAWP-60p and MAWP-70 were a 3-O-methylated glucomannogalactan and a previously unreported glucomannogalactan, whereas MAWP-80 was elucidated as a branched galactoglucan. Besides, three fractions exhibited random coil conformation in aqueous solution, while MAWP-60p had the highest viscosity due to its highest molecular weight, mean square radius of gyration (Rg) and O-methyl group attached to the backbone. The molecular weight, monosaccharide composition and glycosidic linkages might be the major contributor

In [33]:
final_clusters = list(dict(result.items()).values())
abstract_clusters = []
for i in final_clusters:
    abstract_clusters.append(' '.join(i))

In [34]:
abstract_clusters

['Macrolepiota albuminosa (Berk.) Pegler is abundant in active polysaccharides, but little is known about their structures and solution properties. In this study, water-extracted polysaccharides from M. albuminosa (MAWP) were purified into three fractions with structural heterogeneity, which was attributed to the diversity in molecular weight, monosaccharide composition and linkage patterns, further affecting their solution properties. Methylation and NMR analysis revealed MAWP-60p and MAWP-70 were a 3-O-methylated glucomannogalactan and a previously unreported glucomannogalactan, whereas MAWP-80 was elucidated as a branched galactoglucan. Besides, three fractions exhibited random coil conformation in aqueous solution, while MAWP-60p had the highest viscosity due to its highest molecular weight, mean square radius of gyration (Rg) and O-methyl group attached to the backbone. The molecular weight, monosaccharide composition and glycosidic linkages might be the major contributors to the 

In [35]:
summaries = []
final_topics = []
for abstracts in abstract_clusters:
    
    s=' '
    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r'\w+\.*')
    new_text = s.join(tokenizer.tokenize(abstracts))
    new_text = nlp(new_text)
    
    tokens = [token.text for token in new_text]

    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer(stop_words='english')

    vec = tfidf.fit_transform([new_text.text])
    
    feature_names = tfidf.get_feature_names()

    sentence_tokens = [sent for sent in new_text.sents]
    vec.toarray()[0]                              #returns the tfidf values of the words
    #feature_names.index(word.text)                  #this returns the index of that word(word.text)
    #vec.toarray()[0][feature_names.index(word.text) together it returns the tfidf value of that particular word

    sentence_scores = {}
    for sent in sentence_tokens:
        score=0
        for word in sent:
            if word.text in feature_names:
                score = score + vec.toarray()[0][feature_names.index(word.text)]
                sentence_scores[sent] = score

    from heapq import nlargest
    select_length = int(len(sentence_tokens)*0.25)
    summary = nlargest(select_length, sentence_scores, key = sentence_scores.get)
#     for sent in summary:
#         print(str(sent))

    sorted_summary=[]

    for i in sentence_tokens:
        if i in summary:
            sorted_summary.append(i.text)
    '''----------------------------------------------------------------------------------------------------------------------'''
    vec2 = tfidf.fit_transform([abstracts])
    
    LDA.fit(vec2)
    
    single_topic = LDA.components_
    single_topic.argsort()[0][-3:]             #grabing the top five words with highest values 
    
    top_word_indices = single_topic.argsort()[0][-5:]
    potential_topics = []
    for index in top_word_indices:
        
        potential_topics.append(tfidf.get_feature_names()[index])
    final_topics.append(potential_topics)
    '''----------------------------------------------------------------------------------------------------------------------'''
    sorted_summary = ''.join(sorted_summary)
    
    summaries.append(sorted_summary)

In [36]:
summaries

['In this study water extracted polysaccharides from M. albuminosa MAWP were purified into three fractions with structural heterogeneity which was attributed to the diversity in molecular weight monosaccharide composition and linkage patterns further affecting their solution properties.Besides three fractions exhibited random coil conformation in aqueous solution while MAWP 60p had the highest viscosity due to its highest molecular weight mean square radius of gyration Rg and O methyl group attached to the backbone.All rights reserved Elsevier A homogeneous beta glucan JHMP 70 was obtained from Hypsizygus marmoreus by alkali extraction and ethanol precipitation and had an apparent molecular weight of 394 kDa.Results indicated JHMP 70 was a branched beta 1 6 glucan substituted at O 3 by the non reducing beta Glcp and or beta 1 3 linked beta Glcp side chains at a ratio of 1 1.This work can provide a basis theory to expand the application of the polysaccharides from H. marmoreus in food i

In [37]:
new_df = pd.DataFrame({'Potential_topics': final_topics, 'summary_of_clusters': summaries})

In [38]:
new_df

Unnamed: 0,Potential_topics,summary_of_clusters
0,"[addition, molecular, fcns, gluten, beta]",In this study water extracted polysaccharides ...
1,"[moisture, content, aps, esters, mcpd]",A novel dispersive micro solid phase extractio...
2,"[levels, oa, acid, high, ps]",The removal of fishy malodor was related to th...
3,"[microbiota, gut, acid, ape, mice]",Major royal jelly proteins MRJPs the water sol...
4,"[dtsb, expression, formation, ye, biofilm]",All rights reserved Elsevier This study invest...
5,"[water, inulin, pe, surimi, inu]",This work investigated the effects of inulin o...
6,"[screening, analysis, kernels, food, maize]",Considering the richness of data and progress ...
7,"[apples, sdtw, ppm, coli, phage]",Although transduction of E. coli by Stx phages...
8,"[emulsions, epa, asta, astaxanthin, cs]",In this study the ability of astaxanthin loade...
9,"[ethyl, rutinoside, aglycones, bran, tb]",Keng shells PASs and to isolate and identify t...


In [39]:
new_df.to_csv('final.csv')

#  -----------------------------------------------------------------------------------------------------------------

In [60]:
df2 = df2.drop(columns = ['Article', "Abstract"])

In [61]:
df3 = pd.DataFrame(df2.groupby('Journal').Authors.value_counts().sort_values(ascending=False))

In [62]:
df3.rename(columns={'Authors': 'Count'}, inplace=True)

In [63]:
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
Journal,Authors,Unnamed: 2_level_1
Food Technology,"Buss, D.",7
Food Technology,"Brewster, E.",7
Food & Function,Jianxin Zhao,6
Food & Function,Wei Chen,6
Food & Function,Hao Zhang,6
...,...,...
Journal of Food Processing and Preservation,Yookyung Kim,1
Journal of Food Processing and Preservation,Youling Xiong,1
Journal of Food Processing and Preservation,Yousef Ramezan,1
Journal of Food Processing and Preservation,Yu Liu,1


In [64]:
df3.head(10).to_csv('top_author2.csv')

#  -----------------------------------------------------------------------------------------------------------------

In [65]:
ordered_df = df3.loc[list4, :]

In [66]:
top_authors_df = ordered_df.groupby('Journal').apply(lambda x: x.nlargest(5, 'Count'))

In [67]:
top_authors_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Count
Journal,Journal,Authors,Unnamed: 3_level_1
Food & Function,Food & Function,Jianxin Zhao,6
Food & Function,Food & Function,Wei Chen,6
Food & Function,Food & Function,Hao Zhang,6
Food & Function,Food & Function,"Vos, P. de",3
Food & Function,Food & Function,Lu Zhang,3
Food Chemistry,Food Chemistry,Xiaosong Hu,4
Food Chemistry,Food Chemistry,Shanbai Xiong,4
Food Chemistry,Food Chemistry,Bin Li,4
Food Chemistry,Food Chemistry,Changhu Xue,3
Food Chemistry,Food Chemistry,Jinyuan Sun,3


In [68]:
top_authors_df.to_csv('final2.csv')

#  -----------------------------------------------------------------------------------------------------------------

In [69]:
df2.Authors.nunique()

14990

In [70]:
df2.Journal.nunique()

217

In [71]:
df.shape

(3126, 4)