In [18]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [19]:
import re
import json
import nltk
import pprint
import numpy as np
import pandas as pd
import seaborn as sns
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from scipy.stats import mannwhitneyu
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize

from keras.preprocessing.text import text_to_word_sequence

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Fetching all the text-target files from the folder and merging them 
path_all_text_target = '/content/drive/MyDrive/dataset_patent/text_target'
files_all_text_target = [f for f in listdir(path_all_text_target) if isfile(join(path_all_text_target, f))]
print(files_all_text_target)

['pos_text_target.csv', 'neg_text_target.csv', 'neut_text_target.csv']


In [3]:
combined_all_text_target = pd.concat([pd.read_csv(path_all_text_target+'/'+f) for f in files_all_text_target ])

In [4]:
combined_all_text_target.head()

Unnamed: 0,text,target
0,The present invention can enhance the shaping ...,1
1,"According to the present invention, a slide fa...",1
2,"According to the present invention, it is poss...",1
3,In the diagnostic medical image system accordi...,1
4,A dental implant fixture according to the pres...,1


In [5]:
combined_all_text_target = shuffle(combined_all_text_target)

In [6]:
combined_all_text_target.head()

Unnamed: 0,text,target
4172,The present invention is advantageous in terms...,1
87023,"Currently, Public Safety Department of the Uni...",-1
86125,"Among MTC terminals, there has been an increas...",-1
75127,"Still, any conventional techniques have diffic...",-1
22540,To achieve these objects and other advantages ...,0


In [7]:
# Reseting index such that it starts from 0,1,2, ... so on 
combined_all_text_target.reset_index(inplace=True, drop=True)

In [8]:
combined_all_text_target.head(10)

Unnamed: 0,text,target
0,The present invention is advantageous in terms...,1
1,"Currently, Public Safety Department of the Uni...",-1
2,"Among MTC terminals, there has been an increas...",-1
3,"Still, any conventional techniques have diffic...",-1
4,To achieve these objects and other advantages ...,0
5,The present invention can provide a vehicle br...,1
6,In the hash function described in Non-patent l...,-1
7,It is an aspect of the present invention to pr...,-1
8,A conventional height-adjustment mechanism com...,-1
9,The above two conventional methods are process...,-1


In [9]:
# A copied file to play with the text-target dataset
df_analysis = combined_all_text_target.copy()

In [10]:
# Counting total words available in the text
df_analysis['total_words_in_text'] = df_analysis['text'].str.count(' ') + 1

In [11]:
df_analysis.head()

Unnamed: 0,text,target,total_words_in_text
0,The present invention is advantageous in terms...,1,24.0
1,"Currently, Public Safety Department of the Uni...",-1,158.0
2,"Among MTC terminals, there has been an increas...",-1,305.0
3,"Still, any conventional techniques have diffic...",-1,450.0
4,To achieve these objects and other advantages ...,0,434.0


In [14]:
# Calculating total numbers of the sample 
print("Size of the data: ", combined_all_text_target.shape[0])

Size of the data:  205635


In [15]:
# Checking counts of respective labels 
combined_all_text_target['target'].value_counts()

-1    89105
 0    63055
 1    53475
Name: target, dtype: int64

In [16]:
df_analysis['text']=df_analysis['text'].apply(str)

In [20]:
# Tokenization using NLTK  - Taking 5 minutes on colab 
df_analysis['tokenized_sents'] = df_analysis.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

In [22]:
# Tokenization using keras - Takind 1 minutes on colab
df_analysis['keras_tokenized_sents'] = df_analysis.apply(lambda row: text_to_word_sequence(row['text']), axis=1)

In [24]:
df_analysis.head()

Unnamed: 0,text,target,total_words_in_text,tokenized_sents,keras_tokenized_sents
0,The present invention is advantageous in terms...,1,24.0,"[The, present, invention, is, advantageous, in...","[the, present, invention, is, advantageous, in..."
1,"Currently, Public Safety Department of the Uni...",-1,158.0,"[Currently, ,, Public, Safety, Department, of,...","[currently, public, safety, department, of, th..."
2,"Among MTC terminals, there has been an increas...",-1,305.0,"[Among, MTC, terminals, ,, there, has, been, a...","[among, mtc, terminals, there, has, been, an, ..."
3,"Still, any conventional techniques have diffic...",-1,450.0,"[Still, ,, any, conventional, techniques, have...","[still, any, conventional, techniques, have, d..."
4,To achieve these objects and other advantages ...,0,434.0,"[To, achieve, these, objects, and, other, adva...","[to, achieve, these, objects, and, other, adva..."


In [25]:
# Total number of words in text - overall
print('Total number of words ' + str(df_analysis['total_words_in_text'].sum()))

Total number of words 59526838.0


Let's see how both tokenization works

In [33]:
# nltk tokenization 
print(df_analysis['tokenized_sents'][0])

['The', 'present', 'invention', 'is', 'advantageous', 'in', 'terms', 'of', 'reducing', 'battery', 'consumption', 'of', 'the', 'UE', 'by', 'applying', 'discontinuous', 'reception', 'in', 'the', 'inter-eNB', 'carrier', 'aggregation', 'mode', '.']


In [34]:
# Keras tokenization 
print(df_analysis['keras_tokenized_sents'][0])

['the', 'present', 'invention', 'is', 'advantageous', 'in', 'terms', 'of', 'reducing', 'battery', 'consumption', 'of', 'the', 'ue', 'by', 'applying', 'discontinuous', 'reception', 'in', 'the', 'inter', 'enb', 'carrier', 'aggregation', 'mode']


It seems like the tokenization perform better considering following factors 


1.   Execution time is better (As seen above in code)
2.   Converting all tokens in lower case
3.   It is not considering punctuation as new token  



In [37]:
df_analysis['total_words_k_token'] = df_analysis['keras_tokenized_sents'].apply(lambda x : len(x))

In [38]:
df_analysis.head()

Unnamed: 0,text,target,total_words_in_text,tokenized_sents,keras_tokenized_sents,total_words_k_token
0,The present invention is advantageous in terms...,1,24.0,"[The, present, invention, is, advantageous, in...","[the, present, invention, is, advantageous, in...",25
1,"Currently, Public Safety Department of the Uni...",-1,158.0,"[Currently, ,, Public, Safety, Department, of,...","[currently, public, safety, department, of, th...",161
2,"Among MTC terminals, there has been an increas...",-1,305.0,"[Among, MTC, terminals, ,, there, has, been, a...","[among, mtc, terminals, there, has, been, an, ...",313
3,"Still, any conventional techniques have diffic...",-1,450.0,"[Still, ,, any, conventional, techniques, have...","[still, any, conventional, techniques, have, d...",454
4,To achieve these objects and other advantages ...,0,434.0,"[To, achieve, these, objects, and, other, adva...","[to, achieve, these, objects, and, other, adva...",439


In [40]:
# Highest number of words in text 
print('Highest number of words in text is : ' + str(df_analysis['total_words_k_token'].max()))

# Lowest number of words in text 
print('Lowest number of words in text is : ' + str(df_analysis['total_words_k_token'].min()))

# Average number of words in text 
print('Average number of words in text is : ' + str(df_analysis['total_words_k_token'].mean()))

Highest number of words in text is : 7317
Lowest number of words in text is : 0
Average number of words in text is : 297.0090354268485


In [41]:
# Checking the rows having lowest number of words = 0
df_analysis[df_analysis['total_words_k_token']==0]

Unnamed: 0,text,target,total_words_in_text,tokenized_sents,keras_tokenized_sents,total_words_k_token
28899,-,1,1.0,[-],[],0


Depending on usage we would like to remove such rows 

In [39]:
# Saving a generated file on drive = Having size of 1.41GB
df_analysis.to_csv( "/content/drive/MyDrive/dataset_patent/generated_files/df_analysis.csv", index=False, encoding='utf-8-sig')