In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

In [None]:
pip install transformers[torch]


In [63]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pdfplumber
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize NLTK tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Directory containing all PDFs
PDF_DIR = "papers"  # Replace with the path to your dataset folder
label_pdf="Reference"

# Define labeled (15) and unlabeled (135) PDFs

labeled_pdfs = [
    "P001.pdf","P002.pdf","P003.pdf","P004.pdf","P005.pdf","P006.pdf",
    "P007.pdf","P008.pdf","P009.pdf","P010.pdf","P011.pdf","P012.pdf",
    "P013.pdf","P014.pdf","P015.pdf"# Add all 15 filenames
]
unlabeled_pdfs = [pdf for pdf in os.listdir(PDF_DIR) if pdf not in labeled_pdfs]

# Preprocessing function for text
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = [word for word in text.split() if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Function to extract sections from a PDF
def extract_sections(pdf_path):
    sections = {'Abstract': '', 'Methodology': '', 'Results': '', 'Conclusion': ''}
    section_keywords = ['Abstract', 'Methodology', 'Results', 'Conclusion']
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue
            for keyword in section_keywords:
                if keyword.lower() in text.lower():
                    sections[keyword] += text
    return sections

# Extract features from PDFs
def extract_features(pdfs, pdf_dir):
    data = []
    for pdf in pdfs:
        pdf_path = os.path.join(pdf_dir, pdf)
        sections = extract_sections(pdf_path)
        features = {
            'filename': pdf,
            'abstract': preprocess_text(sections.get('Abstract', '')),
            'methodology': preprocess_text(sections.get('Methodology', '')),
            'results': preprocess_text(sections.get('Results', '')),
            'conclusion': preprocess_text(sections.get('Conclusion', ''))
        }
        data.append(features)
    return pd.DataFrame(data)

# Extract features for labeled and unlabeled data
labeled_data = extract_features(labeled_pdfs, PDF_DIR)
unlabeled_data = extract_features(unlabeled_pdfs, PDF_DIR)

# Load known labels for the labeled data
# Replace this with your actual labels: 0 = Non-Publishable, 1 = Publishable
labeled_data['label'] = [0,0,0,0,0,1,1,1,1,1,1,1,1,1,1]  # Example labels

# Combine text fields for vectorization
labeled_data['combined_text'] = labeled_data['abstract'] + labeled_data['methodology'] + labeled_data['results'] + labeled_data['conclusion']
unlabeled_data['combined_text'] = unlabeled_data['abstract'] + unlabeled_data['methodology'] + unlabeled_data['results'] + unlabeled_data['conclusion']

# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=500)
X_labeled = vectorizer.fit_transform(labeled_data['combined_text']).toarray()
y_labeled = labeled_data['label']

X_unlabeled = vectorizer.transform(unlabeled_data['combined_text']).toarray()

# Train classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_labeled, y_labeled)

# Predict on the remaining 135 papers
unlabeled_data['predicted_label'] = classifier.predict(X_unlabeled)

# Save predictions to a CSV
unlabeled_data[['filename', 'predicted_label']].to_csv('predicted_labels.csv', index=False)
print("Predictions saved to 'predicted_labels.csv'.")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\patet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\patet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Predictions saved to 'predicted_labels.csv'.


In [64]:
df=pd.read_csv("predicted_labels.csv")

In [65]:
# importing the above .csv file


In [66]:
data=pd.DataFrame(labeled_pdfs)

In [67]:
data['

SyntaxError: unterminated string literal (detected at line 1) (2152013325.py, line 1)

In [None]:
df=pd.read_csv('research_papers_publishability.csv',usecols=['Paper ID','Publishable'])

In [None]:
df

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
x_train,x_test,y_train,y_test=train_test_split(X_labeled,y_labeled,test_size=0.3,random_state=42)

In [53]:
classifier.fit(x_train,y_train)

In [54]:
y_pred=classifier.predict(x_test)

In [55]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.80      1.00      0.89         4

    accuracy                           0.80         5
   macro avg       0.40      0.50      0.44         5
weighted avg       0.64      0.80      0.71         5



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## accuracy 

In [56]:
from sklearn.model_selection  import cross_val_score
scores=cross_val_score(classifier,X_labeled,y_labeled,cv=5,scoring='accuracy')
print(f'Cross-validation Accuracy score:{scores}')

Cross-validation Accuracy score:[0.66666667 0.66666667 0.66666667 0.66666667 0.66666667]


In [57]:
print(f"mean Accuracy :{scores.mean():.2f}")

mean Accuracy :0.67


In [61]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize data
def tokenize_function(texts):
    return tokenizer(list(texts), padding=True, truncation=True, max_length=512)

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    labeled_data['combined_text'].tolist(),  # Ensure it's a list
    labeled_data['label'].tolist(),          # Ensure it's a list
    test_size=0.2,
    random_state=42
)

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Create datasets
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,  # Save only the last 2 checkpoints
    logging_dir='./logs',
    logging_steps=10
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer  # Add tokenizer for automatic preprocessing
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [None]:
labeled_data['combined_text'] = labeled_data['combined_text'].astype(str)


In [None]:
print(type(train_texts), type(val_texts))
print(train_texts[:5])  # Check the first few examples


In [46]:
train_texts = train_texts.tolist()
val_texts = val_texts.tolist()


In [60]:
pip install transformers[torch]
pip install accelerate>=0.26.0


SyntaxError: invalid syntax (766517327.py, line 1)