# **Notebook C**: Patent Classification with CNN
----



In [15]:
from google.colab import drive
drive.mount('/content/drive')

import os
DEFAULT_ROOT = "/content/drive/MyDrive/USPTO_data"   # 你要用的根目录
EMB_DIR = os.path.join(DEFAULT_ROOT, "Embeddings")
os.makedirs(EMB_DIR, exist_ok=True)

print("DEFAULT_ROOT:", DEFAULT_ROOT)
print("EMBEDDINGS DIR:", EMB_DIR)


MessageError: Error: credential propagation was unsuccessful

In [None]:
%%bash
EMB_DIR="/content/drive/MyDrive/USPTO_data/Embeddings"
mkdir -p "$EMB_DIR"
wget -c http://nlp.stanford.edu/data/glove.6B.zip -O /content/glove.6B.zip
unzip -o /content/glove.6B.zip -d /content/glove6B
mv -f /content/glove6B/glove.6B.50d.txt "$EMB_DIR/glove.6B.50d.txt"
ls -lh "$EMB_DIR/glove.6B.50d.txt"


In [None]:
%%bash
EMB_DIR="/content/drive/MyDrive/USPTO_data/Embeddings"
wget -c http://nlp.stanford.edu/data/glove.840B.300d.zip -O /content/glove.840B.300d.zip
unzip -o /content/glove.840B.300d.zip -d /content
mv -f /content/glove.840B.300d.txt "$EMB_DIR/glove.840B.300d.txt"
ls -lh "$EMB_DIR/glove.840B.300d.txt"


In [None]:
%%bash
EMB_DIR="/content/drive/MyDrive/USPTO_data/Embeddings"
wget -c https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip -O /content/wiki-news-300d-1M.vec.zip
unzip -o /content/wiki-news-300d-1M.vec.zip -d /content
mv -f /content/wiki-news-300d-1M.vec "$EMB_DIR/FastText.en.300.vec"
ls -lh "$EMB_DIR/FastText.en.300.vec"


In [None]:
!pip -q install gensim textblob nltk
import os, re, pandas as pd, nltk
from textblob import TextBlob
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

for pkg in ["punkt", "punkt_tab"]:
    try: nltk.data.find(f"tokenizers/{pkg}")
    except LookupError:
        try: nltk.download(pkg, quiet=True)
        except: pass

data_path = os.path.join(DEFAULT_ROOT, "Training Data", "4K Patents - AI 20p.csv")
assert os.path.exists(data_path), f"训练数据没找到：{data_path}"
df = pd.read_csv(data_path)
abstracts = df["abstract"].astype(str).tolist()

stemmer = PorterStemmer()
def string_cleaner(text: str):
    text = (text or "").lower()
    try:
        tokens = TextBlob(text).words
        return [t.stem() for t in tokens]
    except Exception:
        tokens = re.findall(r"[a-z]+", text)
        return [stemmer.stem(t) for t in tokens]

tokens_list = [string_cleaner(x) for x in abstracts]
bigram = Phrases(tokens_list, min_count=5, threshold=10)
bigram_phraser = Phraser(bigram)
tokens_bi = [bigram_phraser[doc] for doc in tokens_list]

w2v = Word2Vec(sentences=tokens_bi, vector_size=50, window=5, min_count=5, workers=4, sg=1, epochs=10)
out_path = os.path.join(EMB_DIR, "W2V Pat Abstracts 50 - AI Bigrams[Lemma].txt")
w2v.wv.save_word2vec_format(out_path, binary=False)
print("自训词向量已保存：", out_path)


In [None]:
must_have = [
    "glove.6B.50d.txt",
    # 如下载了再检查：
    # "glove.840B.300d.txt",
    # "FastText.en.300.vec",
    "W2V Pat Abstracts 50 - AI Bigrams[Lemma].txt",
]
for fn in must_have:
    p = os.path.join(EMB_DIR, fn)
    print("✔" if os.path.exists(p) else "✘", fn, "->", p)


# C.1. Load Packages
---

In [1]:
# === Cell 1 ===
# === General Packages ===
import os
import sys
import warnings
import numpy as np
import pandas as pd

# Progress bars
from tqdm.auto import tqdm

# Reproducibility
import random
import tensorflow as tf
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
)

# Keras / TensorFlow (modern APIs)
from tensorflow.keras import layers, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Text processing
try:
    from textblob import TextBlob
except ModuleNotFoundError:
    raise ModuleNotFoundError("TextBlob is required. Install with: pip install textblob")


In [3]:
# Optional: quiet some warnings
warnings.filterwarnings("ignore")
print("✅ Environment ready (TensorFlow", tf.__version__, ")")

✅ Environment ready (TensorFlow 2.19.0 )


In [9]:

# === Cell 2 ===
# === NLTK tokenizers required by TextBlob ===
import nltk
# Try to ensure both 'punkt' (older) and 'punkt_tab' (NLTK 3.8+) exist.
for pkg in ["punkt", "punkt_tab"]:
    try:
        nltk.data.find(f"tokenizers/{pkg}")
    except LookupError:
        try:
            nltk.download(pkg, quiet=True)
            print(f"Downloaded NLTK resource: {pkg}")
        except Exception as e:
            print(f"Note: Could not download NLTK resource '{pkg}' ({e}). Will use fallback tokenizer if needed.")


Downloaded NLTK resource: punkt_tab


In [5]:
# === (Optional) Mount Google Drive if running in Colab ===
try:
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive")
    IN_COLAB = True
    print("Mounted Google Drive.")
except Exception:
    IN_COLAB = False
    print("Not in Colab; proceeding without Drive mount.")

Mounted at /content/drive
Mounted Google Drive.


# C.2. Load Training Data ##
----------------

We are going to use the data on the Google drive. This is in a csv file, and so we are going to load the data as a dataframe, and then convert the main data (Patent Ids, Indicator for AI / Non-AI, Patent Abstract) from a Pandas DataFrame to a list (which is more easily used in later sections).

In [6]:
# === Working Directory & Data Loading ===
# Adjust this path to your own folder structure
DEFAULT_ROOT = "/content/drive/MyDrive/USPTO_data/" if IN_COLAB else os.getcwd()

data_path = os.path.join(DEFAULT_ROOT, "Training_Data", "4K Patents - AI 20p.csv")
if not os.path.exists(data_path):
    print("⚠️ Could not find the training data at:", data_path)
    print("   Update `data_path` to point to your CSV file.")
else:
    print("Loading:", data_path)

TrainingData = pd.read_csv(data_path)

# Store Data in Lists for Text Classification
IDs = np.array(TrainingData['app number'].values.tolist())
Abstract_Text = TrainingData['abstract'].astype(str).values.tolist()
Classes = TrainingData['actual'].astype(int).values.tolist()

print(f"Loaded {len(Classes)} records.")

Loading: /content/drive/MyDrive/USPTO_data/Training_Data/4K Patents - AI 20p.csv
Loaded 4000 records.


In [7]:
# === Tokenizer & String Cleaner ===
n_words = 2000  # vocabulary cap for tokenizer

# Keras tokenizer
tokenizer = Tokenizer(
    num_words=n_words,
    lower=True,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\\t\\n',
    char_level=False
)

# Text cleaner: stem words using TextBlob (as in original)
def string_cleaner(str_input: str):
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens]
    # If you prefer full words (no stemming), use:
    # words = [token for token in tokens]
    return words


In [10]:
# === Preprocess abstracts & fit tokenizer ===
Abstracts_Lemmatized = []
for x in Abstract_Text:
    segments = string_cleaner(x)
    Abstracts_Lemmatized.append(" ".join(segments))

Abstracts = Abstracts_Lemmatized

# Fit tokenizer to get word index
tokenizer.fit_on_texts(Abstracts)
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)


Vocabulary size: 6301


Once we have the list of words that occur in our corpus of abstracts (i.e. word index), then we can try to map those words to embedding vectors. Below we define the functions that will go through each of the words in our word index and extract the coresponding embedding vector and save it to an embedding matrix that will be used as a layer in a subsequent convolutional neural network (CNN) model.

In [11]:
# === Embedding Matrix Helpers ===
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Create embedding matrix from a text embedding file (e.g., GloVe)."""
    vocab_size = len(word_index) + 1  # reserve 0 for padding
    embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32)

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) <= embedding_dim:
                continue
            word = parts[0]
            vector = np.asarray(parts[1:1+embedding_dim], dtype=np.float32)
            idx = word_index.get(word)
            if idx is not None and idx < vocab_size:
                embedding_matrix[idx] = vector
    return embedding_matrix

def create_empty_matrix(word_index, embedding_dim):
    """Create an embedding matrix initialized to ones (no external embeddings)."""
    vocab_size = len(word_index) + 1
    return np.ones((vocab_size, embedding_dim), dtype=np.float32)

In [12]:
# === Embedding Configurations ===
# Name, path, embedding_dim
CLASSIFIERS = [
    ["No Embeddings", "NONE", 50],
    ["GLOVE (6B - 50)", os.path.join(DEFAULT_ROOT, "Embeddings", "glove.6B.50d.txt"), 50],
    ["GLOVE (840B - 300)", os.path.join(DEFAULT_ROOT, "Embeddings", "glove.840B.300d.txt"), 300],
    ["Doc2Vec USPTO Patent Embeddings", os.path.join(DEFAULT_ROOT, "Embeddings", "W2V Pat Abstracts 50 - AI Bigrams[Lemma].txt"), 50],
    ["FastText", os.path.join(DEFAULT_ROOT, "Embeddings", "FastText.en.300.vec"), 300],
]



In [13]:
# === CNN + LSTM Model Training (Cross-Validation) ===

maxlen = 200
batch_size = 50
epochs = 20
NUM_OF_SPLITS = 5

# Where to save outputs
OUTPUT_DIR_PERF = os.path.join(DEFAULT_ROOT, "Output", "Model Performance")
OUTPUT_DIR_PRED = os.path.join(DEFAULT_ROOT, "Output", "Classification Output")
os.makedirs(OUTPUT_DIR_PERF, exist_ok=True)
os.makedirs(OUTPUT_DIR_PRED, exist_ok=True)

# Storage
RESULTS = []
Classified_Values = []

for name, emb_path, embedding_dim in tqdm(CLASSIFIERS, desc="Loop Through Embeddings"):
    print(f"\n=== Training: {name} ===")

    # Prepare sequences for all texts once (we'll pick train/test indices per fold)
    sequences = tokenizer.texts_to_sequences(Abstracts)
    sequences = pad_sequences(sequences, padding="post", maxlen=maxlen)

    # Build the embedding matrix
    if emb_path == "NONE":
        embedding_matrix = create_empty_matrix(tokenizer.word_index, embedding_dim)
    else:
        if not os.path.exists(emb_path):
            print(f"⚠️  Embedding file not found: {emb_path}. Skipping this embedding.")
            continue
        embedding_matrix = create_embedding_matrix(emb_path, tokenizer.word_index, embedding_dim)

    y_actual_all = np.array(Classes, dtype=int)
    ids_all = np.array(IDs)

    y_actual = []
    y_predicted = []
    y_proba = []
    id_s = []

    skf = StratifiedKFold(n_splits=NUM_OF_SPLITS, shuffle=True, random_state=42)
    for fold_idx, (train_idx, test_idx) in enumerate(tqdm(skf.split(sequences, y_actual_all), total=NUM_OF_SPLITS, leave=False, desc="Cross-Validating"), start=1):
        X_train = sequences[train_idx]
        X_test = sequences[test_idx]
        y_train = y_actual_all[train_idx]
        y_test = y_actual_all[test_idx]
        Test_IDs = ids_all[test_idx].tolist()

        # Build model
        model = Sequential()
        model.add(layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            weights=None if emb_path == "NONE" else [embedding_matrix],
            input_length=maxlen,
            trainable=True
        ))
        model.add(layers.Dropout(0.2))
        model.add(layers.Conv1D(filters=64, kernel_size=2, activation="relu"))
        model.add(layers.MaxPooling1D(pool_size=4))
        # Dense applied timewise over sequence (as in original)
        model.add(layers.Dense(512, activation="sigmoid"))
        model.add(layers.LSTM(100))
        model.add(layers.Dense(1, activation="sigmoid"))

        model.compile(
            optimizer="adam",
            loss="binary_crossentropy",
            metrics=[tf.keras.metrics.Recall(name="recall")]
        )

        model.fit(
            X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            verbose=0,
            validation_data=(X_test, y_test)
        )

        y_pred_p = model.predict(X_test, verbose=0).ravel()
        y_pred = (y_pred_p >= 0.5).astype(int)

        # accumulate
        y_actual.extend(y_test.tolist())
        y_predicted.extend(y_pred.tolist())
        y_proba.extend(y_pred_p.tolist())
        id_s.extend(Test_IDs)

    # After CV: compute metrics
    Share = np.round(np.mean(y_predicted), 3)
    Accuracy = np.round(accuracy_score(y_actual, y_predicted), 3)
    try:
        ROC = np.round(roc_auc_score(y_actual, y_proba), 3)  # use probabilities
    except Exception:
        ROC = np.nan

    Precision = np.round(precision_score(y_actual, y_predicted, zero_division=0), 3)
    Recall = np.round(recall_score(y_actual, y_predicted, zero_division=0), 3)
    F1 = np.round(f1_score(y_actual, y_predicted, zero_division=0), 3)

    CM = confusion_matrix(y_actual, y_predicted)
    # Protect against zero division
    denom_pos = (CM[0][1] + CM[1][1]) if (CM[0][1] + CM[1][1]) > 0 else 1
    denom_neg = (CM[0][0] + CM[1][0]) if (CM[0][0] + CM[1][0]) > 0 else 1
    FP = np.round(CM[0][1] / denom_pos, 3)
    TN = np.round(CM[1][0] / denom_neg, 3)
    TP = np.round(CM[1][1] / denom_pos, 3)
    FN = np.round(CM[0][0] / denom_neg, 3)

    RESULTS.append([name, Share, TP, FN, FP, TN, Accuracy, ROC, Precision, Recall, F1])

    # Save per-model classified values
    Classified_Values.append(list(zip([name]*len(id_s), id_s, y_actual, y_predicted)))


Loop Through Embeddings:   0%|          | 0/5 [00:00<?, ?it/s]


=== Training: No Embeddings ===


Cross-Validating:   0%|          | 0/5 [00:00<?, ?it/s]


=== Training: GLOVE (6B - 50) ===


Cross-Validating:   0%|          | 0/5 [00:00<?, ?it/s]


=== Training: GLOVE (840B - 300) ===
⚠️  Embedding file not found: /content/drive/MyDrive/USPTO_data/Embeddings/glove.840B.300d.txt. Skipping this embedding.

=== Training: Doc2Vec USPTO Patent Embeddings ===
⚠️  Embedding file not found: /content/drive/MyDrive/USPTO_data/Embeddings/W2V Pat Abstracts 50 - AI Bigrams[Lemma].txt. Skipping this embedding.

=== Training: FastText ===
⚠️  Embedding file not found: /content/drive/MyDrive/USPTO_data/Embeddings/FastText.en.300.vec. Skipping this embedding.


In [None]:
# === Results Table ===
RESULTS_TABLE = pd.DataFrame(
    RESULTS,
    columns=["Name", "Share", "True-Positives", "False-Negatives", "False-Positives",
             "True-Negatives", "Accuracy", "AUC", "Precision", "Recall", "F1"]
)
RESULTS_TABLE["Type"] = "CNN"
RESULTS_TABLE = RESULTS_TABLE[
    ["Name", "Type", "Share", "True-Positives", "False-Negatives", "False-Positives",
     "True-Negatives", "Accuracy", "AUC", "Precision", "Recall", "F1"]
]

perf_path = os.path.join(OUTPUT_DIR_PERF, "CNN Model Classification Performance.csv")
RESULTS_TABLE.sort_values("Accuracy", ascending=False).to_csv(perf_path, index=False)
print("Saved performance to:", perf_path)

RESULTS_TABLE.sort_values("Accuracy", ascending=False)

In [None]:
# === Out-of-sample predictions (per model) wide table ===
Final = None
for i in range(len(Classified_Values)):
    Temp = pd.DataFrame(
        Classified_Values[i], columns=["Model", "id", "Actual", "Predicted"]
    )
    if i == 0:
        name = Temp.head(1)["Model"].iloc[0]
        Temp = Temp[["id", "Actual", "Predicted"]].copy()
        Temp.columns = ["id", "Actual", name]
        Final = Temp
    else:
        name = Temp.head(1)["Model"].iloc[0]
        Temp = Temp[["id", "Predicted"]].copy()
        Temp.columns = ["id", name]
        Final = Final.merge(Temp, on=["id"], how="outer")

pred_path = os.path.join(OUTPUT_DIR_PRED, "CNN Classification Results.csv")
Final.to_csv(pred_path, index=False)
print("Saved predictions to:", pred_path)