# ArIES Hackathon


In [1]:
!pip install -q scikit-learn pandas PyMuPDF textstat sentence-transformers seaborn

In [2]:
import os
import re
import string
import fitz  
import pandas as pd
import numpy as np
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
from scipy.sparse import hstack

In [3]:
def extract_text(pdf_path):
    doc = fitz.open(pdf_path)
    return " ".join(page.get_text() for page in doc if page.get_text())

In [4]:
def get_sections(text):
    intro = re.search(
        r'(introduction)(.*?)(methodology|methods)',
        text,
        re.IGNORECASE | re.DOTALL
    )
    concl = re.search(
        r'(conclusion)(.*?)(references|acknowledgments)',
        text,
        re.IGNORECASE | re.DOTALL
    )
    intro_text = intro.group(2) if intro else ""
    concl_text = concl.group(2) if concl else ""
    return intro_text, concl_text


In [5]:
def preprocess_text(text):
    text = text.lower()
    return text.translate(str.maketrans('', '', string.punctuation))

In [6]:
def extract_features(text, intro, concl, model):
    features = {}
    dc = textstat.dale_chall_readability_score(text)
    ari = textstat.automated_readability_index(text)
    wc = len(preprocess_text(text).split())
    features['dale_chall'] = dc * 2
    features['ari'] = ari * 2
    features['word_count'] = wc * 1.5
    if intro.strip() and concl.strip():
        vec_intro = model.encode(intro)
        vec_concl = model.encode(concl)
        similarity = np.dot(vec_intro, vec_concl)
        features['similarity'] = similarity * 0.5
    else:
        features['similarity'] = 0.0
    return features

In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2')
data = []
texts = []

directory = "/Users/parvkhanna/Downloads/Non-Publishable"  

for file in sorted(os.listdir(directory)):
    if file.endswith('.pdf') and file.startswith('R'):
        paper_id = int(file[1:4])
        label = 0 if paper_id <= 5 else 1
        path = os.path.join(directory, file)
        text = extract_text(path)
        if text:
            intro, concl = get_sections(text)
            feats = extract_features(text, intro, concl, model)
            feats['label'] = label
            feats['filename'] = file
            data.append(feats)
            texts.append(text)

directory = "/Users/parvkhanna/Downloads/Publishable" 

for file in sorted(os.listdir(directory)):
    if file.endswith('.pdf') and file.startswith('R'):
        paper_id = int(file[1:4])
        label = 0 if paper_id <= 5 else 1
        path = os.path.join(directory, file)
        text = extract_text(path)
        if text:
            intro, concl = get_sections(text)
            feats = extract_features(text, intro, concl, model)
            feats['label'] = label
            feats['filename'] = file
            data.append(feats)
            texts.append(text)


df = pd.DataFrame(data)
df['text'] = texts
df.head()

Unnamed: 0,dale_chall,ari,word_count,similarity,label,filename,text
0,16.92,66.6,13479.0,0.0,0,R001.pdf,Transdimensional Properties of Graphite in Rel...
1,15.28,53.0,15256.5,0.0,0,R002.pdf,Synergistic Convergence of Photosynthetic Path...
2,19.56,88.4,14946.0,0.0,0,R003.pdf,Deciphering the Enigmatic Properties of Metals...
3,18.18,43.8,4210.5,0.0,0,R004.pdf,AI-Driven Personalization in Online Education\...
4,15.14,44.2,10425.0,0.0,0,R005.pdf,Analyzing Real-Time Group Coordination in\nAug...


In [8]:
tfidf = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df['text'])

handcrafted = df[['dale_chall', 'ari', 'word_count', 'similarity']].reset_index(drop=True)
scaler = StandardScaler()
X_handcrafted = scaler.fit_transform(handcrafted)

X_combined = hstack([X_tfidf, X_handcrafted])
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, stratify=y, random_state=42)

clf1 = LogisticRegression(max_iter=1000)
clf2 = RandomForestClassifier(n_estimators=150, max_depth=10)
clf3 = GradientBoostingClassifier()

ensemble = VotingClassifier(estimators=[
    ('lr', clf1), ('rf', clf2), ('gb', clf3)
], voting='soft')

ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print(f"F1 Score: {f1_score(y_test, y_pred):.2f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.75      1.00      0.86         3

    accuracy                           0.80         5
   macro avg       0.88      0.75      0.76         5
weighted avg       0.85      0.80      0.78         5

F1 Score: 0.86
Accuracy: 0.80


## Output CSV in original format

In [9]:
results = []

for i in range(len(df)):
    text = df.loc[i, 'text']
    X_test_tfidf = tfidf.transform([text])
    handcrafted_feats = scaler.transform(df.loc[[i], ['dale_chall', 'ari', 'word_count', 'similarity']])
    X_final = hstack([X_test_tfidf, handcrafted_feats])
    prob = ensemble.predict_proba(X_final)[0]
    pred = np.argmax(prob)
    
    justification = (
        "Paper has structured methodology and scientific value" if pred == 1 else
        "Paper lacks structure, scientific evidence and incorrect assumptions."
    )
    results.append({
        "Paper ID": df.loc[i, 'filename'],
        "publishability": int(pred),
        "Rationale": justification
    })

pd.DataFrame(results).to_csv("result.csv", index=False)


In [10]:
unlabeled_dir = "/Users/parvkhanna/Downloads/Papers"  
unlabeled_results = []

for file in sorted(os.listdir(unlabeled_dir)):
    if file.endswith(".pdf"):
        path = os.path.join(unlabeled_dir, file)
        text = extract_text(path)
        if text:
            intro, concl = get_sections(text)
            feats = extract_features(text, intro, concl, model)
            tfidf_vec = tfidf.transform([text])
            handcrafted_feats = scaler.transform([[feats['dale_chall'], feats['ari'], feats['word_count'], feats['similarity']]])
            X = hstack([tfidf_vec, handcrafted_feats])
            prob = ensemble.predict_proba(X)[0]
            pred = np.argmax(prob)
            justification = (
                "Paper has structured methodology and scientific value." if pred == 1 else
                "Paper lacks structure, scientific evidence and incorrect assumptions."
            )
            unlabeled_results.append({
                "Paper ID": file,
                "Publishability": int(pred),
                "Rationale": justification
            })

df_unlabeled = pd.DataFrame(unlabeled_results)
df_unlabeled.to_csv("Results_23112070.csv", index=False)
df_unlabeled.head()



Unnamed: 0,Paper ID,Publishability,Rationale
0,P001.pdf,1,Paper has structured methodology and scientifi...
1,P002.pdf,0,"Paper lacks structure, scientific evidence and..."
2,P003.pdf,0,"Paper lacks structure, scientific evidence and..."
3,P004.pdf,1,Paper has structured methodology and scientifi...
4,P005.pdf,1,Paper has structured methodology and scientifi...
