In [1]:
import os
import re
import nltk
import spacy
import PyPDF2
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import classification_report, accuracy_score


In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            content = page.extract_text()
            if content:
                text += content
    return text


In [4]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(stemmer.stem(t)) for t in tokens]
    return ' '.join(tokens)


In [5]:
label_map = {
    "resume1.pdf": "resume",
    "invoice1.pdf": "invoice",
    "report1.pdf": "report"
}


In [6]:
pdf_dir = "pdfs/"
texts = []
labels = []

for file, label in label_map.items():
    path = os.path.join(pdf_dir, file)
    raw_text = extract_text_from_pdf(path)
    clean_text = preprocess(raw_text)
    texts.append(clean_text)
    labels.append(label)



In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(texts)

# Convert string labels to numbers (e.g., resume → 0, invoice → 1, report → 2)
y_labels = pd.Series(labels).astype('category')
y = y_labels.cat.codes


In [8]:
print(dict(enumerate(y_labels.cat.categories)))


{0: 'invoice', 1: 'report', 2: 'resume'}


In [11]:
from sklearn.model_selection import train_test_split
import numpy as np

# Ensure it's a writable NumPy array
y_semi = y.astype(int).copy()  # Or: y_labels.cat.codes.to_numpy(copy=True)
y_semi[0] = -1  # Simulate the first document as "unlabeled"

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_semi, test_size=0.3, random_state=42)


In [13]:
# Separate the one unlabeled sample
X_unlabeled = X[0]
y_unlabeled = -1

# Remaining labeled data
X_labeled = X[1:]
y_labeled = y[1:]

# Split only labeled data
from sklearn.model_selection import train_test_split
X_train_labeled, X_test, y_train_labeled, y_test = train_test_split(X_labeled, y_labeled, test_size=0.3, random_state=42)

# Add the unlabeled sample back to training set
from scipy.sparse import vstack  # Because X is sparse
X_train = vstack([X_unlabeled, X_train_labeled])
y_train = np.hstack(([y_unlabeled], y_train_labeled))


In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import SelfTrainingClassifier

base_model = MultinomialNB()
model = SelfTrainingClassifier(base_model)
model.fit(X_train, y_train)


In [15]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on test set
y_pred = model.predict(X_test)

# Compare only on originally labeled data
actual = y[y_semi != -1][:len(y_pred)]  # Filter out simulated unlabeled

print("Accuracy:", accuracy_score(actual, y_pred))
print(classification_report(actual, y_pred, target_names=y_labels.cat.categories))


Accuracy: 1.0


ValueError: Number of classes, 1, does not match size of target_names, 3. Try specifying the labels parameter