In [1]:
import os
import re
import numpy
import pandas as pd
# import PyPDF2
import requests
from bs4 import BeautifulSoup
from googlesearch import search

In [2]:

# Stem shared by the reference text file and its companion CSV.
file = "Quantum_Machine_Learning_for_6G_Communication_Networks:_State-of-the-Art_and_Vision_for_the_Future"
# <cwd>/references/<file>.txt
path = "/".join([os.getcwd(), "references", file + ".txt"])
# <cwd>/data/<file>.csv
csv_path = "/".join([os.getcwd(), "data", file + ".csv"])

In [3]:
class ResearchPaperList:
    """Builds a small citation database from a paper's plain-text reference list.

    Three CSV tables under ``<cwd>/data`` are maintained:

    * ``id_data``   (``Research_Paper_List.csv``)  -- one row per known paper:
      numeric id, DOI, title.
    * ``csv_data``  (``Research_Paper_Data.csv``)  -- paper metadata, indexed by id.
    * ``link_data`` (``<paper id>.csv``)           -- for the paper selected with
      ``set_data``: one row per reference number, holding the referenced
      paper's id (``-1`` when it could not be resolved) and its title.

    The reference list itself is read from ``<cwd>/references/<name>.txt``.
    """

    # Header of the paper table, as shown when the table is displayed in this notebook.
    CSV_COLUMNS = ["title", "author", "date", "INSPEC", "DOI",
                   "citations", "publisher", "url", "keywords"]
    # Header used when no id table exists yet. Only the "DOI" name and the
    # position of the id (first column) are relied on by the code below.
    # NOTE(review): the other two names are assumed -- confirm against an existing file.
    ID_COLUMNS = ["id", "DOI", "title"]

    def __init__(self):
        """Set default paths and load the id and paper tables if they exist."""
        self.paper_data = ["title","author","date",0,"DOI",0,"publisher","url"]
        self.ref_path = os.getcwd() + "/references/.txt"
        self.csv_path = os.getcwd() + "/data/Research_Paper_Data.csv"
        self.id_path = os.getcwd() + "/data/Research_Paper_List.csv"
        self.link_path = ""
        self.ref_data = ""
        self.link_data = ""
        # Start from empty tables so that a missing CSV does not leave the
        # attributes undefined (later calls would fail with AttributeError).
        self.id_data = pd.DataFrame(columns=self.ID_COLUMNS)
        self.csv_data = pd.DataFrame(columns=self.CSV_COLUMNS)
        self.open_id()
        self.open_csv()

    # ------------------------------------------------------------------
    # CSV tables
    # ------------------------------------------------------------------
    def open_id(self):
        """Load the id table from ``self.id_path``.

        Returns:
            bool: True when the file was read, False when it does not exist
            (the current ``self.id_data`` is then left untouched).
        """
        if not os.path.exists(self.id_path):
            print("csv file not exists")
            return False
        self.id_data = pd.read_csv(self.id_path)
        return True

    def open_csv(self):
        """Load the paper table from ``self.csv_path`` (first column is the index).

        Returns:
            bool: True when the file was read, False when it does not exist.
        """
        if not os.path.exists(self.csv_path):
            print("csv file not exists")
            return False
        self.csv_data = pd.read_csv(self.csv_path, index_col = 0)
        return True

    def open_link(self):
        """Load the link table from ``self.link_path`` (first column is the index).

        Returns:
            bool: True when the file was read, False when it does not exist.
        """
        if not os.path.exists(self.link_path):
            print("csv file not exists")
            return False
        self.link_data = pd.read_csv(self.link_path, index_col = 0)
        return True

    def save_id_data(self):
        """Write the id table to ``self.id_path`` without the DataFrame index."""
        self.id_data.to_csv(self.id_path, sep=',', na_rep='NaN', index = False)

    def save_csv_data(self):
        """Write the paper table to ``self.csv_path``."""
        self.csv_data.to_csv(self.csv_path,sep=',', na_rep='NaN')

    def save_link_data(self):
        """Write the link table to ``self.link_path``."""
        self.link_data.to_csv(self.link_path,sep=',', na_rep='NaN')

    def get_id_by_DOI(self,doi):
        """Return the id of the paper whose ``DOI`` column equals ``doi``.

        Args:
            doi: DOI string to look up.

        Returns:
            The value of the first column of the first matching row, or ``-1``
            when there is no match or ``doi`` is empty.
        """
        # An empty DOI identifies nothing; matching on it would merge every
        # paper whose DOI could not be extracted into one id.
        if not doi:
            return -1
        # Query this instance's table (not a notebook-level variable).
        matches = self.id_data.loc[self.id_data["DOI"] == doi].values.tolist()
        if not matches:
            return -1
        return matches[0][0]

    def create_id(self, doi, title):
        """Append ``[new id, doi, title]`` to the id table and return the new id.

        The new id is the current number of rows in the id table. The table is
        only changed in memory; call ``save_id_data`` to persist it.
        """
        new_id = len(self.id_data)
        self.id_data.loc[new_id] = [new_id, doi, title]
        return new_id

    # ------------------------------------------------------------------
    # Reference text file
    # ------------------------------------------------------------------
    def open_ref(self):
        """(Re)open ``self.ref_path`` for reading as UTF-8.

        Returns:
            bool: True when the file was opened, False when it does not exist.
        """
        if not os.path.exists(self.ref_path):
            print("file not exists")
            return False
        if self.ref_data:
            self.ref_data.close()
        self.ref_data =  open(self.ref_path,"r", encoding='utf-8')
        return True

    def print_ref(self, head_count = 5000):
        """Print up to ``head_count`` lines from the reference file.

        Reading continues from the file's current position. A negative
        ``head_count`` never reaches zero, so the rest of the file is printed.
        """
        if not self.ref_data:
            self.open_ref()
        if not self.ref_data:
            return
        while head_count != 0:
            line = self.ref_data.readline()
            if not line:
                break
            print(line)
            head_count -= 1

    def close_ref(self):
        """Close the reference file, if one is open, and forget the handle."""
        if self.ref_data:
            self.ref_data.close()
        # Reset to the "not open" marker used by __init__ so that later calls
        # reopen the file instead of reading from a closed handle.
        self.ref_data = ""

    def set_data(self, paper_data, reference_list):
        """Select the paper whose reference list is going to be processed.

        Args:
            paper_data: list of 8 values in the order title, author, date,
                INSPEC, DOI, citations, publisher, url.
            reference_list: stem of the reference file,
                ``<cwd>/references/<reference_list>.txt``.

        Side effects: opens the reference file; registers the paper in the id
        table (and saves it) when its DOI is unknown; loads the paper's link
        table, or creates it together with the paper's metadata row and saves
        both when no link file exists yet.
        """
        self.paper_data = paper_data
        self.ref_path = os.getcwd() + "/references/" + reference_list +".txt"
        self.open_ref()
        paper_id = self.get_id_by_DOI(paper_data[4])
        if paper_id == -1:
            paper_id = self.create_id(paper_data[4], paper_data[0])
            print(paper_id)
            self.save_id_data()
        self.link_path = os.getcwd() + "/data/"+ str(paper_id) +".csv"
        if not self.open_link():
            self.link_data = pd.DataFrame(columns=["id","title"])
            # Row 0 of the link table describes the citing paper itself.
            self.link_data.loc[0] = [paper_id] + [paper_data[0]]
            # The trailing "" fills the ninth ("keywords") column.
            self.csv_data.loc[paper_id] = paper_data + [""]
            self.save_csv_data()
            self.save_link_data()

    def check_index(self, line,index):
        """Return True when ``line`` starts with the reference marker ``[index]``.

        The closing bracket is part of the check so that ``[12]`` is not
        mistaken for reference 1.
        """
        return line.startswith("[" + str(index) + "]")

    # ------------------------------------------------------------------
    # Web lookup and scraping
    # ------------------------------------------------------------------
    def get_paper_html(self, search_url, timeout = 10):
        """Download ``search_url`` and parse it with BeautifulSoup.

        Args:
            search_url: URL of the paper page.
            timeout: seconds to wait for the server before giving up.

        Returns:
            The parsed document, or ``-1`` when the request fails or the
            response status is not 200.
        """
        try:
            response = requests.get(search_url, timeout=timeout)
        except requests.RequestException:
            # Treat transport errors like a non-200 answer.
            return -1
        if response.status_code == 200:
            return BeautifulSoup(response.text, 'html.parser')
        return -1

    @staticmethod
    def _extract_field(html, key, default = ""):
        """Return the text following ``"<key>":"`` up to the next ``"``, or ``default``."""
        _, marker, rest = str(html).partition('"' + key + '":"')
        if not marker:
            return default
        return rest.split('"')[0]

    @staticmethod
    def _to_int(text):
        """Convert ``text`` to int, returning 0 when it is missing or not a number."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return 0

    def get_citations(self, html):
        """Return the ``citationCount`` value found in ``html`` as int (0 if absent)."""
        return self._to_int(self._extract_field(html, "citationCount", None))

    def get_INSPEC(self, html):
        """Return the ``accessionNumber`` value found in ``html`` as int (0 if absent)."""
        return self._to_int(self._extract_field(html, "accessionNumber", None))

    def get_DOI(self, html):
        """Return the ``doi`` value found in ``html`` ("" if absent)."""
        return self._extract_field(html, "doi")

    def get_date(self, html):
        """Return the ``dateOfInsertion`` value found in ``html`` ("" if absent)."""
        return self._extract_field(html, "dateOfInsertion")

    # ------------------------------------------------------------------
    # Reference parsing
    # ------------------------------------------------------------------
    def handle_data(self, csv_data, current,index):
        """Parse one reference entry and record it in the link table.

        The entry is split on the curly quotes ``“`` and ``”``: the text before
        the opening quote is stored as author, the quoted text minus its last
        character as title (presumably dropping a trailing comma), and the
        remainder is checked for the word "IEEE". For IEEE entries a web search
        is run on the title and the first result whose URL contains ``ieee`` is
        fetched to extract the DOI; papers not yet in the id table are
        registered and their metadata row is added to ``self.csv_data``.

        Args:
            csv_data: unused; kept so existing callers keep working.
            current: text of the entry without its ``[n] `` prefix.
            index: reference number, used as row label in ``self.link_data``.

        Entries that do not contain both quotes are skipped.
        """
        temp = ["","","",0,"",0,"","",""]
        temp_link = [-1,""]
        current = current.replace("\n","")
        parse = re.split('“|”',current)
        # Three pieces (before, inside and after the quotes) are needed
        # because parse[2] is read below.
        if len(parse) < 3:
            return
        temp[0] = parse[1][:-1]
        temp_link[1] = temp[0]
        temp[1] = parse[0]
        is_url = False
        if "IEEE" in parse[2]:
            temp[6] = "IEEE"
            for search_url in search(temp[0], tld="co.in", num=5, stop=5, pause=1):
                if 'ieee' in search_url:
                    temp[7] = search_url
                    is_url = True
                    break
        if is_url:
            html = self.get_paper_html(temp[7])
            temp[4] = self.get_DOI(html)
            temp_link[0] = self.get_id_by_DOI(temp[4])
            if temp_link[0] == -1:
                # Unknown paper: register it and store its scraped metadata.
                temp_link[0] = self.create_id(temp[4], temp[0])
                temp[5] = self.get_citations(html)
                temp[3] = self.get_INSPEC(html)
                temp[2] = self.get_date(html)
                self.csv_data.loc[temp_link[0]] = temp
        print("idx ", index, " ", temp_link, " ", temp)
        self.link_data.loc[index] = temp_link

    def extract_reference(self):
        """Process every entry of the reference file, then save all three tables.

        Entries are expected to start with ``[1]``, ``[2]``, ... in order; lines
        that do not start the next expected entry are appended to the current
        one. The reference file is closed when the method ends, including when
        it is interrupted; the tables are written only after a complete pass.
        """
        if not self.ref_data and not self.open_ref():
            return
        index = 1
        current = ""
        try:
            while True:
                line = self.ref_data.readline()
                if not line:
                    break
                if self.check_index(line, index+1):
                    # 3 = "[" + "]" + the separator that follows the marker.
                    self.handle_data(self.csv_data,current[3+len(str(index)):],index)
                    current = line
                    index += 1
                else:
                    current += line
            # The final entry has no successor marker to trigger it, so it is
            # handled here once the end of the file is reached.
            if current.strip():
                self.handle_data(self.csv_data,current[3+len(str(index)):],index)
        finally:
            self.close_ref()
        self.save_id_data()
        self.save_csv_data()
        self.save_link_data()
    
    
                
            

In [4]:
# Create the database wrapper and select the paper whose references
# (references/ref1.txt) will be processed.
q1 = ResearchPaperList()
q1.set_data(
    paper_data=[
        # title
        "6G Wireless Communication Systems: Applications, Requirements, Technologies, Challenges, and Research Directions",
        # author
        "Mostafa Zaman Chowdhury, Md. Shahjalal, Shakil Ahmed, Yeong Min Jang",
        # date
        "20 July 2020",
        # INSPEC
        19955163,
        # DOI
        "10.1109/OJCOMS.2020.3010270",
        # citations
        172,
        # publisher
        "IEEE",
        # url
        "https://ieeexplore.ieee.org/document/9144301",
    ],
    reference_list="ref1",
)


In [5]:
# Print the number of rows in the id table, then the result of a DOI lookup,
# one value per line.
# d1 = q1.id_data.loc[q1.id_data["DOI"] == "10.1109/OJCOMS.2020.3010270"].values.tolist()
# print(d1)
# NOTE(review): the DOI below reads "20202", unlike the DOI passed to set_data;
# as written the lookup misses and yields -1 -- confirm this is intentional.
print(
    len(q1.id_data),
    q1.get_id_by_DOI("10.1109/OJCOMS.20202.3010270"),
    sep="\n",
)


15
-1


In [6]:
# Process the reference file opened by set_data, one entry at a time. Entries
# that mention IEEE trigger a web search (with a 1-second pause) and a page
# download, so this cell is slow. The tables are written to disk only after
# the whole file has been read.
q1.extract_reference()

idx  1   [1, 'Terahertz communication for vehicular networks']   ['Terahertz communication for vehicular networks', 'S. Mumtaz et al., ', '', 0, '10.1109/TVT.2017.2712878', 0, 'IEEE', 'https://ieeexplore.ieee.org/document/7982949', '']
idx  2   [-1, 'IMT traffic estimates for the years 2020 to 2030']   ['IMT traffic estimates for the years 2020 to 2030', '', '', 0, '', 0, '', '', '']
idx  3   [2, 'Quantum machine learning for 6G communica-tion networks: State-of-the-art and vision for the future']   ['Quantum machine learning for 6G communica-tion networks: State-of-the-art and vision for the future', 'S. J. Nawaz, S. K. Sharma, S. Wyne, M. N. Patwary, andM. Asaduzzaman, ', '', 0, '10.1109/ACCESS.2019.2909490', 0, 'IEEE', 'https://ieeexplore.ieee.org/document/8681450', '']
idx  4   [3, 'Toward 6G networks: Use cases and technologies']   ['Toward 6G networks: Use cases and technologies', 'M. Giordani, M. Polese, M. Mezzavilla, S. Rangan, and M. Zorzi,', '', 0, '10.1109/MCOM.001.1900411'

idx  42   [20, 'Acomparative survey of optical wireless technologies: Architectures andapplications']   ['Acomparative survey of optical wireless technologies: Architectures andapplications', 'M. Z. Chowdhury, M. T. Hossan, A. Islam, and Y. Min Jang, ', '13 March 2018', 17649076, '10.1109/ACCESS.2018.2792419', 218, 'IEEE', 'https://ieeexplore.ieee.org/document/8259465', '']
idx  43   [21, 'Wireless communications and applicationsabove 100 GHz: Opportunities and challenges for 6G and beyond']   ['Wireless communications and applicationsabove 100 GHz: Opportunities and challenges for 6G and beyond', 'T. S. Rappaport et al., ', '25 June 2019', 18770124, '10.1109/ACCESS.2019.2921522', 539, 'IEEE', 'https://ieeexplore.ieee.org/document/8732419', '']
idx  44   [-1, 'Integrated RF/optical wireless networks for improving QoS in indoorand transportation applications']   ['Integrated RF/optical wireless networks for improving QoS in indoorand transportation applications', 'M. Z. Chowdhury, M. T.

idx  70   [37, 'An incentive mechanism integrating jointpower, channel and link management for social-aware D2D contentsharing and proactive caching']   ['An incentive mechanism integrating jointpower, channel and link management for social-aware D2D contentsharing and proactive caching', 'C. Yi, S. Huang, and J. Cai, ', '05 March 2018', 17630120, '10.1109/TMC.2017.2741481', 49, 'IEEE', 'https://ieeexplore.ieee.org/document/8013127', '']


KeyboardInterrupt: 

In [9]:
# Display the first rows (at most 300) of the in-memory paper table.
# pd.set_option('display.max_rows', 300)
q1.csv_data.head(300)

Unnamed: 0,title,author,date,INSPEC,DOI,citations,publisher,url,keywords
0,6G Wireless Communication Systems: Application...,"Mostafa Zaman Chowdhury, Md. Shahjalal, Shakil...",20 July 2020,19955163,10.1109/OJCOMS.2020.3010270,172,IEEE,https://ieeexplore.ieee.org/document/9144301,


In [7]:
# Write the in-memory paper table back to q1.csv_path.
# NOTE(review): this cell carries execution count 7 but sits after cell 9, so
# the cells were run out of order -- re-check with Restart & Run All.
q1.save_csv_data()