In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf 
from pathlib import Path
from sklearn import linear_model
from nltk import ngrams
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from bs4 import BeautifulSoup
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re 

%matplotlib inline

In [2]:
# Location of the patent-claims CSV, relative to the notebook's working directory
file_path = Path("G06_full_cleaned_more_stopwords_removed2.csv")

# Read the whole file into a DataFrame
patents_df = pd.read_csv(file_path)

In [3]:
# CPC subclasses in the order that defines their integer ids (G06C -> 0 ... G06T -> 9)
cpc_subclasses = [
    'G06C', 'G06E', 'G06F', 'G06G', 'G06J',
    'G06K', 'G06M', 'G06N', 'G06Q', 'G06T',
]
class_codes = {subclass: code for code, subclass in enumerate(cpc_subclasses)}

# Swap the textual labels in the 'Class' column for their integer ids
patents_df = patents_df.replace({'Class': class_codes})

In [4]:
# Remove the leftover CSV index column and any incomplete rows, then renumber
# the rows 0..n-1. The chain is assigned back so the reset index is actually
# kept (a bare reset_index() call only returns a copy), and errors='ignore'
# lets the cell be re-run after 'Unnamed: 0' has already been dropped.
patents_df = (
    patents_df
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .dropna()
    .reset_index(drop=True)
)

# Show the cleaned frame as the cell output
patents_df

Unnamed: 0,Class,Claims
0,0.0,business machine apparatus constructed arrange...
1,0.0,tenkey calculating maching value mechanism re...
2,0.0,printing calculator ten digit key fourfunction...
3,0.0,cash register amount bank key includes differe...
4,0.0,machine calculating decimal number coded accor...
...,...,...
7536,9.0,information process apparatus comprisingan ext...
7537,9.0,comprisingreceiving server computer user cust...
7538,9.0,comprisingcreating bundle work unitsselecting...
7539,9.0,providing internet mobile end user customized...


In [5]:
# Model input is the claim text; the target is the encoded CPC class
X = patents_df['Claims']
y = patents_df['Class']

In [6]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=78,
)

In [7]:
# Settings passed to TfidfVectorizer in the next cell
ngram_range = (1, 2)   # use unigrams and bigrams
min_df = 10            # integer: minimum document count for a term to be kept
max_df = 1.0           # float: maximum document proportion (1.0 = no upper cut-off)
max_features = 2000    # cap on vocabulary size

In [8]:
# TF-IDF vectorizer driven by the settings defined in the previous cell.
# No lowercasing or stop-word filtering is applied here
# (presumably the CSV text is already cleaned, per its filename - confirm).
tfidf = TfidfVectorizer(
    encoding='utf-8',
    lowercase=False,
    stop_words=None,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    sublinear_tf=True,
    norm='l2',
)

# Fit the vocabulary on the training split only, then densify the matrix
labels_train = y_train
features_train = tfidf.fit_transform(X_train).toarray()
print(features_train.shape)

# Project the test split onto the vocabulary learned above
labels_test = y_test
features_test = tfidf.transform(X_test).toarray()
print(features_test.shape)

(5655, 2000)
(1886, 2000)


In [9]:
# Report, for every CPC class, the TF-IDF terms with the highest chi-squared
# score against a one-vs-rest indicator for that class.

# The fitted vocabulary does not change between classes, so resolve it once.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf, "get_feature_names_out"):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.asarray(tfidf.get_feature_names())

for class_label, category_id in sorted(class_codes.items()):
    # chi2 returns (statistics, p-values); only the statistics are used
    chi2_scores, _ = chi2(features_train, labels_train == category_id)

    # argsort is ascending, so the strongest terms end up last
    feature_names = vocabulary[np.argsort(chi2_scores)]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]

    print("# '{}' category:".format(class_label))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

# 'G06C' category:
  . Most correlated unigrams:
. disc
. gear
. slide
. shaft
. lever
  . Most correlated bigrams:
. mean includes
. mean comprises

# 'G06E' category:
  . Most correlated unigrams:
. lens
. fourier
. light
. beam
. optical
  . Most correlated bigrams:
. spatial light
. fourier transform

# 'G06F' category:
  . Most correlated unigrams:
. wearable
. protocol
. playback
. wireless
. audio
  . Most correlated bigrams:
. electronic device
. medium content

# 'G06G' category:
  . Most correlated unigrams:
. resistance
. resistor
. transistor
. voltage
. amplifier
  . Most correlated bigrams:
. mean connected
. current source

# 'G06J' category:
  . Most correlated unigrams:
. circuit
. digitaltoanalog
. digital
. converter
. analog
  . Most correlated bigrams:
. digital signal
. analog signal

# 'G06K' category:
  . Most correlated unigrams:
. character
. region
. feature
. pixel
. image
  . Most correlated bigrams:
. feature vector
. image based

# 'G06M' category:
  . Mo

In [10]:
# Cast both label series to integers before they are handed to the classifier
labels_train, labels_test = labels_train.astype('int'), labels_test.astype('int')

In [33]:
# XGBoost classifier with learning_rate=0.2; no other hyper-parameter is set here
model = XGBClassifier(learning_rate=0.2)

# Train on the dense TF-IDF matrix; leaving fit() as the last expression
# makes the notebook display the fitted estimator
model.fit(features_train, labels_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [34]:
# Predict a class for every row of the held-out TF-IDF test matrix
predictions = model.predict(features_test)

In [35]:
# Compare the predictions with the true test labels and report accuracy as a percentage
accuracy = accuracy_score(y_true=labels_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

Accuracy: 79.90%


In [36]:
# Per-class precision / recall / F1 on the test split
report = classification_report(y_true=labels_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       0.69      0.63      0.66        38
           1       0.88      0.92      0.90       189
           2       0.70      0.73      0.72       263
           3       0.71      0.66      0.69        83
           4       0.88      0.86      0.87       170
           5       0.84      0.85      0.85       245
           6       0.87      0.76      0.81       155
           7       0.83      0.86      0.84       254
           8       0.85      0.86      0.86       237
           9       0.66      0.65      0.66       252

    accuracy                           0.80      1886
   macro avg       0.79      0.78      0.79      1886
weighted avg       0.80      0.80      0.80      1886

