# Extraction of Contribution Sentences from PDF 

In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.




In [33]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading PyMuPDF-1.23.6-cp39-none-win_amd64.whl (3.5 MB)
Collecting PyMuPDFb==1.23.6
  Downloading PyMuPDFb-1.23.6-py3-none-win_amd64.whl (24.5 MB)
Installing collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.6 PyMuPDFb-1.23.6
Note: you may need to restart the kernel to use updated packages.




In [28]:
import os
import re
import PyPDF2
import pandas as pd
import fitz

#function to Identify contribution sentences
def identify_contribution_sentences(sentence):
    pronouns = ['we', 'this paper', 'in this paper', 'i', 'the authors', 'our', 'the study', 'the research']
    verbs = ['propose', 'contribute', 'introduce', 'describe', 'present', 'report', 'discuss', 'employ', 'study',
             'explore', 'aim', 'hypothesize', 'argue', 're-use', 'evaluate', 'compare', 'baseline', 'result',
             'approach', 'method', 'extend', 'ensemble', 'transfer learning', 'architecture', 'neural method',
             'translation systems', 'system development', 'domain adaptation', 'train', 'experiment']

    sentence_lower = sentence.lower()  # Convert the sentence to lowercase for case-insensitive matching

    words = sentence_lower.split()  # Split the sentence into words
    if words:
        for pronoun in pronouns:
            if words[0].startswith(pronoun):  # Check if the first word matches a pronoun
                for verb in verbs:
                    if verb in words:  # Check if any word in the sentence matches a verb
                        return True
    return False

'''
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        num_pages = len(pdf_reader.pages)
        text = ''
        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
    return text
'''
#fetching text from PDF file
def extract_text_from_pdf(pdf_path):
    text = ''
    try:
        pdf_document = fitz.open(pdf_path)
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            text += page.get_text()
    except Exception as e:
        print(f"Exception occurred while processing {pdf_path}: {e}")
    return text

#Process the PDf files to extract contribution and Non Contribution sentences
def process_pdf_files_in_folder(folder_path):
    contribution_sentences_all = []
    non_contribution_sentences_all = []

    for root, dirs, files in os.walk(folder_path):
        for file in files:
            #Opening and reading PDF file
            if file.endswith(".pdf"):
                print(file)
                pdf_path = os.path.join(root, file)
                pdf_text = extract_text_from_pdf(pdf_path)
                sentences = pdf_text.split('.')
                
                contribution_sentences = []
                non_contribution_sentences = []

                for sentence in sentences:
                    if identify_contribution_sentences(sentence):
                        contribution_sentences.append(sentence.strip().replace('\n', ''))
                        #contribution_sentences.append(sentence)
                    else:
                        non_contribution_sentences.append(sentence.strip().replace('\n', ''))
                        #non_contribution_sentences.append(sentence)

                contribution_sentences_all.extend(contribution_sentences)
                non_contribution_sentences_all.extend(non_contribution_sentences)

    # Create DataFrames
    contribution_df = pd.DataFrame({'text': contribution_sentences_all})
    non_contribution_df = pd.DataFrame({'text': non_contribution_sentences_all})
    non_contribution_df=non_contribution_df.head(contribution_df.shape[0])
    contribution_df['label']=1
    non_contribution_df['label']=0
    Research_df=pd.concat([contribution_df,non_contribution_df])
    Research_df=Research_df.sample(frac=1)
    Research_df['year']='2022'
    Research_df['conference']='CoNLL'
    Research_df.to_csv('all_contribution_sentences_2022.csv', index=False)

# Replace 'your_folder_path' with the path to your folder containing PDFs
process_pdf_files_in_folder('C://Users//prash//CoNLL_final//2022')


A Fine-grained Interpretability Evaluation Benchmark for Neural NLP.pdf
A Multilingual Bag-of-Entities Model for Zero-Shot Cross-Lingual Text Classification.pdf
An Alignment-based Approach to Text Segmentation Similarity Scoring.pdf
Causal Analysis of Syntactic Agreement Neurons in Multilingual Language Models.pdf
Characterizing Verbatim Short-Term Memory in Neural Language Models.pdf
Cognitive Simplification Operations Improve Text Simplification.pdf
Collateral facilitation in humans and language models.pdf
Combining Noisy Semantic Signals with Orthographic Cues Cognate Induction for the Indic Dialect Continuum.pdf
Computational cognitive modeling of predictive sentence processing in a second language.pdf
Continual Learning for Natural Language Generations with Transformer Calibration.pdf
Detecting Unintended Social Bias in Toxic Language Datasets.pdf
Enhancing the Transformer Decoder with Transition-based Syntax.pdf
Entailment Semantics Can Be Extracted from an Ideal Language Model.p

In [30]:
#Run above code for each year and combine all the years to create final Dataset using below code
df1=pd.read_csv('all_contribution_sentences_2013.csv')
df2=pd.read_csv('all_contribution_sentences_2014.csv')
df3=pd.read_csv('all_contribution_sentences_2015.csv')
df4=pd.read_csv('all_contribution_sentences_2016.csv')
df5=pd.read_csv('all_contribution_sentences_2017.csv')
df6=pd.read_csv('all_contribution_sentences_2018.csv')
df7=pd.read_csv('all_contribution_sentences_2019.csv')
df8=pd.read_csv('all_contribution_sentences_2020.csv')
df9=pd.read_csv('all_contribution_sentences_2021.csv')
df10=pd.read_csv('all_contribution_sentences_2022.csv')
df_final=pd.concat([df1, df2, df3,df4,df5,df6,df7,df8,df9,df10], ignore_index=True)

In [31]:
df_final.to_csv('all_contribution_sentences.csv', index=False)

In [36]:
contribution_df

Unnamed: 0,Contribution Sentences
0,We propose a multilingual bag-of-entities (M-\...
1,We evaluate the performance of the M-BoE\nmode...
2,We train the model using training data in the ...
3,Our contributions are as follows:\n•We present...
4,Our method successfully improves the per-\nfor...
5,Our method dynami-\ncally injects entity infor...
6,"In particular, the perfor-\nmance of our model..."
7,We attribute this to the num-\nber of entities...
8,We at-\ntribute this to the ability of our met...
9,We achieved state-of-the-art results on\nthree...


In [35]:
contribution_df

Unnamed: 0,Contribution Sentences
0,We propose a multilingual bag-of-entities (M-\...
1,We evaluate the performance of the M-BoE\nmode...
2,We train the model using training data in the ...
3,Our contributions are as follows:\n•We present...
4,Our method successfully improves the per-\nfor...
5,Our method dynami-\ncally injects entity infor...
6,"In particular, the perfor-\nmance of our model..."
7,We attribute this to the num-\nber of entities...
8,We at-\ntribute this to the ability of our met...
9,We achieved state-of-the-art results on\nthree...


# Combing Contribution sentences from other conferences

In [39]:
import pandas as pd
df_conll=pd.read_csv('all_contribution_sentences.csv')
df_jlmr=pd.read_csv('C://Users//prash//Downloads//Sujit-JMLR.csv')
df_emnlp=pd.read_csv("C://Users//prash//Downloads//EMNLP.csv//EMNLP.csv")
df_final_contributions=pd.concat([df_conll,df_jlmr,df_emnlp], ignore_index=True)

In [40]:
df_final_contributions.to_csv("final_contributions.csv",index=False)


In [10]:
import pandas as pd

missing_values_per_column = df_final_contributions.isnull().sum()

print(missing_values_per_column)
df_final_contributions.dropna(inplace=True)

text          9955
label            6
year             3
conference       3
dtype: int64


In [12]:
df_final_contributions.shape

(375267, 4)

In [32]:
# Display cleaned text DataFrame
print(df_final_contributions)

                                                     text  label    year  \
0       In addition tothe ﬁve comparison methods used ...    1.0  2013.0   
3       In this paper, we present a system that combin...    1.0  2013.0   
4       In addition, the selec-tion of classifiers, fe...    1.0  2013.0   
5       This paper proposes a boosting algorithm fora ...    0.0  2013.0   
6       We also compare to HTK,a ﬁxed-structure HMM wi...    1.0  2013.0   
...                                                   ...    ...     ...   
385215  It is suggested that such a\nvolume is enough ...    1.0  2022.0   
385216  In contrast,\nlong paragraph inputs describe t...    1.0  2022.0   
385218  Feature selection is often useful in boosting ...    0.0  2022.0   
385221  We compare models in terms of ofﬂine\nNLU perf...    1.0  2022.0   
385224  In addition to the distilled baseline, we also...    1.0  2022.0   

                                              conference  \
0                          

In [33]:
# Function to remove text less than 50 characters
def filter_text(text):
    if len(str(text)) >= 50:
        return text
    else:
        return None  # Replace shorter text with None/NaN

# Apply the function to the specified column
df_final_contributions['cleaned_text'] = df_final_contributions['cleaned_text'].apply(filter_text)

# Drop rows with None/NaN values in the column
df_final_contributions.dropna(inplace=True)

In [34]:
df_final_contributions.shape

(219000, 10)