In [1]:
import glob
import os
from io import StringIO

import nltk
import pandas as pd
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from tqdm.notebook import tqdm

# Read label data and create training dataframe

In [2]:

# Path of the hand-labelled workbook. Raw string: the backslashes are literal
# Windows path separators, not escape sequences.
lebel_data_path = r"D:\hustle\Sravani_text_to_text\9th_oct\Dataset_Lot 2.xlsx"
label_data = pd.read_excel(lebel_data_path)
label_data.columns = ['Article name', 'Context', 'Key insights', 'Definitions', 'Key findings']
# Empty cells become "" so that .split() below works on every cell.
label_data = label_data.fillna("")

# Columns whose text is exploded into sentences; the column name doubles as the label.
label_columns = ['Context', 'Key insights', 'Definitions', 'Key findings']

# Training df will have one line (sentence) per row together with its label
# (Context, Key insights, Definitions or Key findings) and its source article.
line_list = []
label_list = []
article_list = []
# Iterate rows directly instead of re-filtering label_data by article name:
# this visits each row once and does not break when an article name repeats.
for _, row in label_data.iterrows():
    article = row['Article name']
    for label in label_columns:
        # Blank lines separate paragraphs inside a cell; each paragraph is
        # then split into sentences.
        for paragraph in row[label].split("\n\n"):
            for sentence in nltk.sent_tokenize(paragraph):
                line_list.append(sentence)
                label_list.append(label)
                article_list.append(article)

training_df = pd.DataFrame({"article": article_list, "line": line_list, "label": label_list})
#training_df.to_csv("training_data.csv", index=False, encoding="utf-8")

In [3]:
# Preview the sentence-level training frame (columns: article, line, label).
training_df

Unnamed: 0,article,line,label
0,Rethinking the corporate digital divide,the existing literature on the corporate digit...,Context
1,Rethinking the corporate digital divide,"Thus, to close the aforementioned gap, this pa...",Context
2,Rethinking the corporate digital divide,"Hence, the objective of\nthis research is to d...",Context
3,Rethinking the corporate digital divide,The main idea behind the\ntheory of the diffus...,Key insights
4,Rethinking the corporate digital divide,"In the development of this process,\nthere is ...",Key insights
...,...,...,...
121,Digital Labour in the Platform Economy,They recognize that digital labour is exploite...,Key findings
122,Digital Labour in the Platform Economy,"Consequently, it hides\nthe reality of exploit...",Key findings
123,Digital Labour in the Platform Economy,"However, Fuchs\nand Sevignani tend to take too...",Key findings
124,Digital Labour in the Platform Economy,"Instead, they affirm that Facebook is\na realm...",Key findings


# Function to read pdf files and extract text

In [4]:
# Function to read pdf file
def parse_pdf(pdf_file):
    """Extract the text of every page of a PDF into a single string.

    Parameters
    ----------
    pdf_file : str or path-like
        Path of the PDF file; it is opened in binary mode.

    Returns
    -------
    str
        The text that pdfminer's ``TextConverter`` wrote for all pages,
        in the order the pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    device = TextConverter(rsrcmgr, sio, codec="utf-8", laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the file even if a page fails to parse.
        with open(pdf_file, "rb") as pdf_handle:
            for page in PDFPage.get_pages(pdf_handle):
                interpreter.process_page(page)
        # Read the buffer before the finally-clause closes it.
        return sio.getvalue()
    finally:
        # Free the converter and the buffer on success and on error alike.
        device.close()
        sio.close()



# Remove from the parsed PDF text every sentence that already has a label.
# Then label all remaining (unwanted) PDF sentences as 'Unknown'.

In [5]:
# Print each distinct article title found in training_df, with a progress bar.
article_titles = training_df["article"].unique()
for article in tqdm(article_titles):
    print(article)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Rethinking the corporate digital divide
The digital divide and its impact on the development of
The multi-dimensional digital divide
Evolution and determinants of digital divide in Brazil
Digital Labour in the Platform Economy



In [6]:
"""
article = "Rethinking the corporate digital divide"
pdf_path = os.path.join("Articles 5 nos", article+ ".pdf")
text = parse_pdf(pdf_path)
text = text.replace("\n", " ")
text = text.replace("\r", " ")

unknown_line = nltk.sent_tokenize(text)
unknown_label = ["Unknown"] * len(unknown_line)
unknown_article = [article] * len(unknown_line)

unknown_df = pd.DataFrame({"article": unknown_article, "line": unknown_line, "label": unknown_label})


for i in tqdm(training_df[training_df["article"] == article]["line"]):
    i = i.replace("\n", " ")
    print(i)
    unknown_df.drop(unknown_df.loc[unknown_df['line'].str.contains('the existing literature on the corporate digital', case=False)].index, inplace=True)
    print("-----------------------")

unknown_df.to_csv("unknown_data2.csv", index=False, encoding="utf-8")
"""

unknown_df_list = []
for article in tqdm(training_df["article"].unique()):
    print(article)
    pdf_path = os.path.join("Articles 5 nos", article+ ".pdf")
    text = parse_pdf(pdf_path)
    text = text.replace("\n", " ")
    text = text.replace("\r", " ")

    unknown_line = nltk.sent_tokenize(text)
    unknown_label = ["Unknown"] * len(unknown_line)
    unknown_article = [article] * len(unknown_line)

    unknown_df = pd.DataFrame({"article": unknown_article, "line": unknown_line, "label": unknown_label})


    for i in tqdm(training_df[training_df["article"] == article]["line"]):
        i = i.replace("\n", " ")
        #print(i)
        unknown_df.drop(unknown_df.loc[unknown_df['line'].str.contains('the existing literature on the corporate digital', case=False)].index, inplace=True)
        #print("-----------------------")

    unknown_df_list.append(unknown_df)

unknown_df = pd.concat(unknown_df_list, axis=0, ignore_index=True)
#unknown_df.to_csv("unknown_data3.csv", index=False, encoding="utf-8")


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Rethinking the corporate digital divide


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


The digital divide and its impact on the development of


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))


The multi-dimensional digital divide


HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))


Evolution and determinants of digital divide in Brazil


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Digital Labour in the Platform Economy


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))





In [7]:
# Preview the PDF sentences that carry the "Unknown" label.
unknown_df

Unnamed: 0,article,line,label
0,Rethinking the corporate digital divide,Contents lists available at ScienceDirect Te...,Unknown
1,Rethinking the corporate digital divide,"Motivated by innovation-diffusion, competence-...",Unknown
2,Rethinking the corporate digital divide,This research is conducted using a vast datase...,Unknown
3,Rethinking the corporate digital divide,"Among the key findings, the digital-innovation...",Unknown
4,Rethinking the corporate digital divide,The digital-divide concept has been explicate...,Unknown
...,...,...,...
2956,Digital Labour in the Platform Economy,"2016, 16, 51–74.",Unknown
2957,Digital Labour in the Platform Economy,70.,Unknown
2958,Digital Labour in the Platform Economy,© 2018 by the authors.,Unknown
2959,Digital Labour in the Platform Economy,"Licensee MDPI, Basel, Switzerland.",Unknown


In [8]:
# Stack the labelled sentences and the "Unknown" PDF sentences into one frame
# with a fresh 0..n-1 index.
frames_to_stack = [training_df, unknown_df]
final_training_df = pd.concat(frames_to_stack, ignore_index=True)
#final_training_df.to_csv("final_training_data.csv", index=False, encoding="utf-8")

# Add other training data to training dataframe

In [9]:
# Folder holding the additional labelled workbooks (one or more .xlsx files).
other_training_exel_path = "D:/hustle/Sravani_text_to_text/rest_labelled_data"
expected_columns = ['Article name', 'Context', 'Key insights', 'Definitions', 'Key findings']

df_path_list = []
# sorted(): glob returns files in an OS-dependent order; sorting keeps the
# row order of other_df the same from run to run.
for df_path in sorted(glob.glob(other_training_exel_path + "/*.xlsx")):
    temp_df = pd.read_excel(df_path)
    print(temp_df.head(2))
    # Normalise the headers per workbook, before concatenating: if two files
    # spell a header differently, concat would otherwise create extra columns
    # and a single five-name assignment afterwards would fail.
    temp_df.columns = expected_columns
    df_path_list.append(temp_df)
    print("-----------------------")

if not df_path_list:
    raise FileNotFoundError("No .xlsx files found in " + other_training_exel_path)

other_df = pd.concat(df_path_list, axis=0, ignore_index=True)
# Empty cells become "" so that later string operations work on every cell.
other_df = other_df.fillna("")

                 Article name  \
0  ABS definition _Employment   
1              Making the cut   

                                             Context  \
0  This article describes three categories of peo...   
1  Most skilled immigrants to Australia arrive th...   

                                        Key insights  \
0  However, there are different interpretations o...   
1  This study is positioned to draw attention to ...   

                                        Definitions   \
0  This means that any economic activity, of what...   
1  Perspective Human capital theory (HCT) Labour ...   

                                        Key findings  
0  The criteria used to define a person as being ...  
1  The findings show how regional employers’ recr...  
-----------------------
                 Article name  \
0  ABS definition _Employment   
1              Making the cut   

                                             Context  \
0  This article describes three categories of pe

In [10]:
# Preview the concatenated additional workbooks (one row per spreadsheet row).
other_df

Unnamed: 0,Article name,Context,Key insights,Definitions,Key findings
0,ABS definition _Employment,This article describes three categories of peo...,"However, there are different interpretations o...","This means that any economic activity, of what...",The criteria used to define a person as being ...
1,Making the cut,Most skilled immigrants to Australia arrive th...,This study is positioned to draw attention to ...,Perspective Human capital theory (HCT) Labour ...,The findings show how regional employers’ recr...
2,DarkNetExplorer,Timely identification of terrorist networks wi...,Jeub et al. [11] argue that one way to discove...,Dark networks are covert social networks [1] t...,Node 18 and Node 64 in C2 both\ntrained togeth...
3,sardna khan paper on rmg 2019,The RMG sector is an important contributor of ...,"In the case of a supervisor and a worker, it w...",ILO (2016) defines “decent work” as “the oppor...,"However, the data reveal that owners and manag..."
4,gap of governance,In this chapter we will discuss the role of pr...,"Beyond the infrastructure challenges, a compre...",,Our hypothesis is that production almost neces...
...,...,...,...,...,...
162,,,,Immigrant professionals are totally responsibl...,
163,,,,integrating and finding employment that aligns...,
164,,,,their skill level and the SU is seen as a fail...,
165,,,,"responsibility (Becker, 1971; Welch, 1975)",


In [11]:


# Training df will have one line and its label like context, key insights, definitions, key findings
# Flatten other_df into one sentence per row with its label. Every row is
# attributed to the placeholder article "Other".
label_columns = ("Context", "Key insights", "Definitions", "Key findings")
line_list = []
label_list = []
for _index, row in other_df.iterrows():
    # Same column order as the labels: Context, Key insights, Definitions, Key findings.
    for label in label_columns:
        # Blank lines separate paragraphs; each paragraph is sentence-tokenized.
        for paragraph in row[label].split("\n\n"):
            tokens = nltk.sent_tokenize(paragraph)
            line_list.extend(tokens)
            label_list.extend([label] * len(tokens))

article_list = ["Other"] * len(line_list)
training_df2 = pd.DataFrame({"article": article_list, "line": line_list, "label": label_list})
#training_df2.to_csv("training_data2.csv", index=False, encoding="utf-8")

In [13]:
# Preview the sentence-level frame built from the additional workbooks.
training_df2

Unnamed: 0,article,line,label
0,Other,This article describes three categories of peo...,Context
1,Other,The categories are:\npersons who are unemploye...,Context
2,Other,"However, there are different interpretations o...",Key insights
3,Other,Some commentators consider that the official m...,Key insights
4,Other,Recognising this interest in the broader conce...,Key insights
...,...,...,...
2481,Other,Immigrant professionals are totally responsibl...,Definitions
2482,Other,integrating and finding employment that aligns...,Definitions
2483,Other,their skill level and the SU is seen as a fail...,Definitions
2484,Other,"responsibility (Becker, 1971; Welch, 1975)",Definitions


In [12]:
# Assemble the final training set from its three sources and write it to CSV.
# It is built from the source frames rather than from final_training_df itself,
# so re-running this cell does not append training_df2 a second time.
final_training_df = pd.concat([training_df, unknown_df, training_df2], ignore_index=True)
final_training_df.to_csv("final_training_data.csv", index=False, encoding="utf-8")

In [5]:
from io import StringIO

# Sample byte string used for the round trip below.
string =b'Hello and welcome to GeeksForGeeks.'

# Decode the bytes to text, wrap the text in an in-memory file object and
# read the whole content back; the result is this cell's output.
StringIO(string.decode("utf-8")).read()

# Alternative kept for reference: wrap the value in a file object and fetch
# the content with getvalue().
#file = StringIO(string)
#print(file.getvalue())

'Hello and welcome to GeeksForGeeks.'