In [1]:
import os
import re
import nltk
import spacy
import PyPDF2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import classification_report, accuracy_score


In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            content = page.extract_text()
            if content:
                text += content
    return text


In [None]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # remove extra whitespace
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    tokens = [lemmatizer.lemmatize(stemmer.stem(t)) for t in tokens]
    return ' '.join(tokens)


In [None]:
pdf_dir = "pdfs"

if not os.path.exists(pdf_dir):
    raise FileNotFoundError(f"Folder '{pdf_dir}' not found!")

label_map = {}
for file in os.listdir(pdf_dir):
    if file.endswith(".pdf"):
        if "resume" in file.lower():
            label_map[file] = "resume"
        elif "invoice" in file.lower():
            label_map[file] = "invoice"
        elif "report" in file.lower():
            label_map[file] = "report"

print("Total labeled PDFs:", len(label_map))


In [None]:
texts = []
labels = []

for file, label in label_map.items():
    path = os.path.join(pdf_dir, file)
    raw_text = extract_text_from_pdf(path)
    
    if not raw_text.strip():
        print(f"Warning: '{file}' is empty or unreadable.")
        continue

    clean_text = preprocess(raw_text)
    texts.append(clean_text)
    labels.append(label)


In [None]:
pd.Series(labels).value_counts().plot(kind='bar', title='Class Distribution')
plt.xlabel("Document Type")
plt.ylabel("Count")
plt.show()


In [None]:
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(texts)

y_labels = pd.Series(labels).astype('category')
y = y_labels.cat.codes  # 0,1,2


In [None]:
from sklearn.model_selection import train_test_split
from scipy.sparse import vstack

# Convert y to NumPy array
y_full = y.to_numpy()

# Simulate unlabeled data (e.g., hide 20% labels)
rng = np.random.default_rng(seed=42)
unlabeled_indices = rng.choice(len(y_full), size=int(0.2 * len(y_full)), replace=False)
y_semi = y_full.copy()
y_semi[unlabeled_indices] = -1  # -1 indicates unlabeled

# Train-test split only on labeled data
labeled_mask = y_semi != -1
X_labeled = X[labeled_mask]
y_labeled = y_semi[labeled_mask]

X_train, X_test, y_train, y_test = train_test_split(
    X_labeled, y_labeled, test_size=0.3, stratify=y_labeled, random_state=42
)

# Add back unlabeled data to training
X_unlabeled = X[~labeled_mask]
y_unlabeled = y_semi[~labeled_mask]

X_train = vstack([X_train, X_unlabeled])
y_train = np.concatenate([y_train, y_unlabeled])


In [None]:
base_model = MultinomialNB()
model = SelfTrainingClassifier(base_model)
model.fit(X_train, y_train)


In [None]:
y_pred = model.predict(X_test)
present_classes = sorted(np.unique(np.concatenate((y_test, y_pred))))
target_names = y_labels.cat.categories[present_classes]

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, labels=present_classes, target_names=target_names))


In [None]:
import joblib
joblib.dump(model, "pdf_text_classifier_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")


In [None]:
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))


In [None]:
import joblib
joblib.dump(model, "pdf_text_classifier_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
