In [1]:
#Installing Packages
!pip install Bio
!pip install advertools
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Bio
  Downloading bio-1.5.2-py3-none-any.whl (273 kB)
[K     |████████████████████████████████| 273 kB 29.8 MB/s 
Collecting biopython==1.79
  Downloading biopython-1.79-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
[K     |████████████████████████████████| 2.7 MB 64.2 MB/s 
[?25hCollecting mygene
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting biothings-client>=0.2.6
  Downloading biothings_client-0.2.6-py2.py3-none-any.whl (37 kB)
Installing collected packages: biothings-client, mygene, biopython, Bio
Successfully installed Bio-1.5.2 biopython-1.79 biothings-client-0.2.6 mygene-3.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting advertools
  Downloading advertools-0.13.2-py2.py3-none-any.whl (310 kB)
[K     |████████████████████████████████| 310 kB 28.5 MB/s 
[?25hColl

In [2]:
#Calling libraries
import urllib
import pandas as pd
import numpy as np
from Bio import Entrez, Medline
import advertools as adv
import spacy
import time

In [5]:
#Identifying myself to NCBI Entrez API
Entrez.email = 'turkiabdelwaheb@hotmail.fr'

In [3]:
#Retrieving files
urllib.request.urlretrieve("https://github.com/SisonkeBiotik-Africa/AfriBioML/blob/main/XLS/scopus_2020.xls?raw=true", "scopus_2020.xls")
urllib.request.urlretrieve("https://github.com/SisonkeBiotik-Africa/AfriBioML/blob/main/XLS/scopus_2021.xls?raw=true", "scopus_2021.xls")
urllib.request.urlretrieve("https://github.com/SisonkeBiotik-Africa/AfriBioML/blob/main/XLS/scopus_2022.xls?raw=true", "scopus_2022.xls")
urllib.request.urlretrieve("https://github.com/SisonkeBiotik-Africa/AfriBioML/blob/main/XLS/scopus_Until2019.xls?raw=true", "scopus_Until2019.xls")

('scopus_Until2019.xls', <http.client.HTTPMessage at 0x7f8a94d11790>)

In [4]:
#Creating dataframes
df2020 = pd.read_excel("scopus_2020.xls")
df2021 = pd.read_excel("scopus_2021.xls")
df2022 = pd.read_excel("scopus_2022.xls")
df2019 = pd.read_excel("scopus_Until2019.xls")

In [5]:
#Merging dataframes
df = df2020
df = df.append(df2021, ignore_index=True)
df = df.append(df2022, ignore_index=True)
df = df.append(df2019, ignore_index=True)

In [6]:
df.shape

(3772, 54)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 54 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   ï»¿Authors                     3772 non-null   object 
 1   Author(s) ID                   3772 non-null   object 
 2   Title                          3772 non-null   object 
 3   Year                           3772 non-null   int64  
 4   Source title                   3772 non-null   object 
 5   Volume                         3045 non-null   object 
 6   Issue                          1552 non-null   object 
 7   Art. No.                       1666 non-null   object 
 8   Page start                     2120 non-null   object 
 9   Page end                       2105 non-null   object 
 10  Page count                     2 non-null      float64
 11  Cited by                       2549 non-null   float64
 12  DOI                            3659 non-null   o

In [8]:
#Eliminating useless columns from the dataframe
considered = np.r_[2:4, 11, 15:19, 35, 38, 40, 42]
dfrestricted = df.iloc[:,considered]

In [9]:
dfrestricted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Title                      3772 non-null   object 
 1   Year                       3772 non-null   int64  
 2   Cited by                   2549 non-null   float64
 3   Authors with affiliations  3772 non-null   object 
 4   Abstract                   3772 non-null   object 
 5   Author Keywords            3170 non-null   object 
 6   Index Keywords             3149 non-null   object 
 7   PubMed ID                  948 non-null    float64
 8   Document Type              3772 non-null   object 
 9   Open Access                1945 non-null   object 
 10  EID                        3772 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 324.3+ KB


In [10]:
#Identifying highly cited papers
dfrestricted["Highly Cited"] = dfrestricted["Cited by"] > 10
dfrestricted["Highly Cited"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfrestricted["Highly Cited"] = dfrestricted["Cited by"] > 10


False    3011
True      761
Name: Highly Cited, dtype: int64

In [11]:
#Identifying North Africa papers
dfrestricted["North Africa"] = dfrestricted["Authors with affiliations"].str.lower().str.find("tunisia")+dfrestricted["Authors with affiliations"].str.lower().str.find("algeria")+dfrestricted["Authors with affiliations"].str.lower().str.find("morocco")+dfrestricted["Authors with affiliations"].str.lower().str.find("libya")+dfrestricted["Authors with affiliations"].str.lower().str.find("egypt") != -5
dfrestricted["North Africa"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfrestricted["North Africa"] = dfrestricted["Authors with affiliations"].str.lower().str.find("tunisia")+dfrestricted["Authors with affiliations"].str.lower().str.find("algeria")+dfrestricted["Authors with affiliations"].str.lower().str.find("morocco")+dfrestricted["Authors with affiliations"].str.lower().str.find("libya")+dfrestricted["Authors with affiliations"].str.lower().str.find("egypt") != -5


True     2426
False    1346
Name: North Africa, dtype: int64

In [12]:
#Identifying Open Access papers
dfrestricted["OA"] = dfrestricted["Open Access"].notna()
dfrestricted["OA"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfrestricted["OA"] = dfrestricted["Open Access"].notna()


True     1945
False    1827
Name: OA, dtype: int64

In [13]:
dfrestricted["Journal Article"] = dfrestricted["Document Type"] == "Article"
dfrestricted["Journal Article"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfrestricted["Journal Article"] = dfrestricted["Document Type"] == "Article"


True     2212
False    1560
Name: Journal Article, dtype: int64

In [17]:
#Defining a function to extract MeSH Keywords from PubMed ID
def get_records(id):
    if id != "":
      handle = Entrez.efetch(db="pubmed", id=id, rettype="medline", retmode="text")
      records2 = Medline.parse(handle)
      for r in records2:
          MeSHTerms = r.get("MH", [])
      time.sleep(5)
    return MeSHTerms

In [None]:
#Extracting MeSH Keywords for every PubMed ID
id_list = list(dfrestricted["PubMed ID"])
records = []
for id in id_list:
  if not(np.isnan(id)): 
    records.append(get_records(id))
    print(len(records))
  else:
    records.append([])



[['Antibodies, Neutralizing/blood/immunology', 'Antibody Formation', 'COVID-19/blood/*immunology', 'COVID-19 Serological Testing', 'Cross Reactions', 'Cryoelectron Microscopy', '*Epitope Mapping', 'Epitopes/chemistry/genetics/*immunology', 'Female', 'Humans', 'Male', 'Protein Conformation', 'SARS-CoV-2/*immunology', 'Seroconversion', '*Severity of Illness Index']]
[['Antibodies, Neutralizing/blood/immunology', 'Antibody Formation', 'COVID-19/blood/*immunology', 'COVID-19 Serological Testing', 'Cross Reactions', 'Cryoelectron Microscopy', '*Epitope Mapping', 'Epitopes/chemistry/genetics/*immunology', 'Female', 'Humans', 'Male', 'Protein Conformation', 'SARS-CoV-2/*immunology', 'Seroconversion', '*Severity of Illness Index'], [], [], ['Adult', 'Aged', 'Aged, 80 and over', 'Algorithms', 'Betacoronavirus', 'COVID-19', 'Coronavirus Infections/*diagnostic imaging', 'Female', 'Humans', '*Machine Learning', 'Male', 'Middle Aged', 'Pandemics', 'Pneumonia, Viral/*diagnostic imaging', 'Radiograph

In [24]:
#Completing the list of MeSH Keywords after Exception
n = 0
for id in id_list:
  n += 1
  if n > len(records):
    if not(np.isnan(id)): 
      records.append(get_records(id))
      print(len(records))
    else:
      records.append([])

2105
2106
2121
2122
2123
2125
2127
2130
2134
2137
2139
2140
2141
2143
2150
2153
2154
2168
2169
2170
2171
2175
2176
2177
2178
2180
2181
2185
2187
2188
2189
2190
2191
2193
2194
2201
2204
2206
2209
2210
2211
2218
2219
2220
2222
2223
2224
2225
2227
2238
2239
2259
2260
2261
2264
2270
2274
2281
2297
2298
2300
2301
2302
2304
2308
2310
2314
2315
2317
2318
2328
2332
2336
2337
2347
2350
2354
2355
2357
2361
2369
2370
2372
2374
2378
2379
2390
2401
2403
2406
2414
2416
2418
2419
2428
2430
2431
2432
2433
2434
2438
2439
2440
2441
2448
2457
2459
2460
2466
2470
2471
2472
2473
2478
2479
2480
2481
2482
2484
2485
2487
2491
2492
2494
2497
2498
2505
2507
2508
2527
2531
2536
2537
2559
2562
2563
2564
2566
2568
2569
2570
2574
2577
2605
2607
2610
2613
2615
2616
2618
2624
2627
2642
2643
2659
2673
2674
2681
2687
2693
2701
2717
2719
2732
2736
2737
2740
2746
2747
2771
2772
2773
2774
2784
2785
2786
2792
2793
2794
2797
2807
2810
2814
2816
2826
2831
2832
2844
2849
2850
2853
2857
2862
2865
2866
2875
2879
2889
2896
2916


In [26]:
dfrestricted["raw_mesh"] = records

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfrestricted["raw_mesh"] = records


In [25]:
len(records)

3772

In [27]:
#Cleaning MeSH Keywords for Keyword Analysis
clean_mesh = []
for i in range(len(dfrestricted)):
  if dfrestricted["raw_mesh"][i] == []: 
    clean_mesh.append([])
  else:
    wordlist = []
    for word in dfrestricted["raw_mesh"][i]:
      if (word.find("/") == -1):
        if (word.find("*") == -1): wordlist.append(word)
        if (word.find("*") > -1): wordlist.append(word.replace("*","", word.count("*")))
      else:
        wordlist01 = word.split("/")
        wordlist01 = [w.replace("*","", word.count("*")) for w in wordlist01]
        wordlist += wordlist01
    clean_mesh.append(wordlist)
len(clean_mesh)

3772

In [28]:
dfrestricted["MeSH Keywords"] = clean_mesh

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfrestricted["MeSH Keywords"] = clean_mesh


In [14]:
#Defining Spacy model and stopwords for extracting noun phrases from titles and abstracts
model = "en_core_web_sm"
stopwords = adv.stopwords["english"]

In [15]:
#Defining raw noun phrases from titles and abstract
def raw_noun_phrases(model, texts):
    s = []
    if (model != ""):
        nlp = spacy.load(model)
        
        # Process each text in the column
        for text in texts:
            text = text.lower()
            doc = nlp(text)
            s.append([chunk.text for chunk in doc.noun_chunks])
            
    return s

In [16]:
#Finding raw noun phrases from titles
dfrestricted["raw_title"] = raw_noun_phrases(model, dfrestricted['Title'])
#Finding raw noun phrases from abstracts
dfrestricted["raw_abstract"] = raw_noun_phrases(model, dfrestricted['Abstract'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfrestricted["raw_title"] = raw_noun_phrases(model, dfrestricted['Title'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfrestricted["raw_abstract"] = raw_noun_phrases(model, dfrestricted['Abstract'])


In [17]:
#Defining a function to clean raw noun phrases
def clean_noun_phrases(raw, stopwords, model):
    #Eliminating stop words and punctuations from noun phrases
    sf = []
    for i in raw:
        word_tokens = i.split(" ")
        filtered_ngram = [w for w in word_tokens if not w.lower() in stopwords]
        ss = " ".join(filtered_ngram)
        for j in [",", ";", "(", ")", "[", "]", "{", "}", "."]:
            ss = ss.replace(j,"")
        #Singularizing noun phrases
        if (ss != ""):
            nlp = spacy.load(model)
            prep = nlp(ss)
            sing = [chunk1.lemma_ for chunk1 in prep.noun_chunks]
            if (sing != []): ss = sing[0]
        if (ss != ""): sf.append(ss)
    return sf

In [None]:
#Extracting clean noun phrases from titles
clean_titles = []
for i in range(len(dfrestricted)):
  clean_titles.append(clean_noun_phrases(dfrestricted["raw_title"][i], stopwords, model))
  print(i)
dfrestricted["Title Keywords"] = clean_titles

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
#Extracting clean noun phrases from abstracts
clean_abstracts = []
for i in range(len(dfrestricted)):
  clean_abstracts.append(clean_noun_phrases(dfrestricted["raw_abstract"][i], stopwords, model))
  print(i)
dfrestricted["Abstract Keywords"] = clean_abstracts

In [None]:
dfrestricted.info()

In [29]:
dfrestricted.head()

Unnamed: 0,ï»¿Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,...,Funding Text 1,Funding Text 2,Funding Text 3,Funding Text 4,Funding Text 5,Funding Text 6,Funding Text 7,Funding Text 8,Funding Text 9,Funding Text 10
0,"Shrock E., Fujimura E., Kula T., Timms R.T., L...",56980000900;56585925100;55305960800;3623947680...,Viral epitope profiling of COVID-19 patients r...,2020,Science,370,6520.0,eabd4250,,,...,,,,,,,,,,
1,"Loey M., Smarandache F., Khalifa N.E.M.",57191889140;6506230265;57205754339;,Within the lack of chest COVID-19 X-ray datase...,2020,Symmetry,12,4.0,651,,,...,,,,,,,,,,
2,"Ali F., El-Sappagh S., Islam S.M.R., Kwak D., ...",56645835400;55233800700;36129968800;5563475710...,A smart healthcare monitoring system for heart...,2020,Information Fusion,63,,,208.0,222.0,...,,,,,,,,,,
3,"Elaziz M.A., Hosny K.M., Salah A., Darwish M.M...",57195591068;57205214086;55586510800;5720522912...,New machine learning method for imagebased dia...,2020,PLoS ONE,15,6.0,e0235187,,,...,,,,,,,,,,
4,"Al-Dhabyani W., Gomaa M., Khaled H., Fahmy A.",57209182206;55065442900;7003807559;57212923429;,Dataset of breast ultrasound images,2020,Data in Brief,28,,104863,,,...,,,,,,,,,,


In [30]:
#Adding Author Keywords to Keyword Dataset
with open("keywords.csv", "w") as f:
  f.write("Keyword;EID;Year;Type;Highly Cited;North Africa;OA;Journal Article\n")
  for item in range(len(dfrestricted["Author Keywords"])):
    if item in list(dfrestricted[dfrestricted["Author Keywords"].notna() == True].index):
      for keyword in dfrestricted["Author Keywords"][item].split("; "):
        f.write(keyword+";"+dfrestricted["EID"][item]+";"+str(dfrestricted["Year"][item])+";Author;"+str(dfrestricted["Highly Cited"][item])+";"+str(dfrestricted["North Africa"][item])+";"+str(dfrestricted["OA"][item])+";"+str(dfrestricted["Journal Article"][item])+"\n")

In [31]:
#Adding Index Keywords to Keyword Dataset
with open("keywords.csv", "a") as f1:
  f1.write("Keyword;EID;Year;Type;Highly Cited;North Africa;OA;Journal Article\n")
  for item in range(len(dfrestricted["Index Keywords"])):
    if item in list(dfrestricted[dfrestricted["Index Keywords"].notna() == True].index):
      for keyword in dfrestricted["Index Keywords"][item].split("; "):
        f1.write(keyword+";"+dfrestricted["EID"][item]+";"+str(dfrestricted["Year"][item])+";Index;"+str(dfrestricted["Highly Cited"][item])+";"+str(dfrestricted["North Africa"][item])+";"+str(dfrestricted["OA"][item])+";"+str(dfrestricted["Journal Article"][item])+"\n")

In [None]:
#Adding Title Keywords to Keyword Dataset
with open("keywords.csv", "a") as f1:
  f1.write("Keyword;EID;Year;Type;Highly Cited;North Africa;OA;Journal Article\n")
  for item in range(len(dfrestricted["Title Keywords"])):
    if item in list(dfrestricted[dfrestricted["Title Keywords"].notna() == True].index):
      for keyword in dfrestricted["Title Keywords"][item]:
        f1.write(keyword+";"+dfrestricted["EID"][item]+";"+str(dfrestricted["Year"][item])+";Title;"+str(dfrestricted["Highly Cited"][item])+";"+str(dfrestricted["North Africa"][item])+";"+str(dfrestricted["OA"][item])+";"+str(dfrestricted["Journal Article"][item])+"\n")

In [None]:
#Adding Abstract Keywords to Keyword Dataset
with open("keywords.csv", "a") as f1:
  f1.write("Keyword;EID;Year;Type;Highly Cited;North Africa;OA;Journal Article\n")
  for item in range(len(dfrestricted["Abstract Keywords"])):
    if item in list(dfrestricted[dfrestricted["Abstract Keywords"].notna() == True].index):
      for keyword in dfrestricted["Abstract Keywords"][item]:
        f1.write(keyword+";"+dfrestricted["EID"][item]+";"+str(dfrestricted["Year"][item])+";Abstract;"+str(dfrestricted["Highly Cited"][item])+";"+str(dfrestricted["North Africa"][item])+";"+str(dfrestricted["OA"][item])+";"+str(dfrestricted["Journal Article"][item])+"\n")

In [32]:
#Adding MeSH Keywords to Keyword Dataset
with open("keywords.csv", "a") as f1:
  f1.write("Keyword;EID;Year;Type;Highly Cited;North Africa;OA;Journal Article\n")
  for item in range(len(dfrestricted["MeSH Keywords"])):
    if item in list(dfrestricted[dfrestricted["MeSH Keywords"].notna() == True].index):
      for keyword in dfrestricted["MeSH Keywords"][item]:
        f1.write(keyword+";"+dfrestricted["EID"][item]+";"+str(dfrestricted["Year"][item])+";MeSH;"+str(dfrestricted["Highly Cited"][item])+";"+str(dfrestricted["North Africa"][item])+";"+str(dfrestricted["OA"][item])+";"+str(dfrestricted["Journal Article"][item])+"\n")