In [2]:
!pip install neattext

Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
     ---------------------------------------- 0.0/114.7 kB ? eta -:--:--
     --- ------------------------------------ 10.2/114.7 kB ? eta -:--:--
     ------------- ----------------------- 41.0/114.7 kB 393.8 kB/s eta 0:00:01
     -----------------------------------  112.6/114.7 kB 726.2 kB/s eta 0:00:01
     ------------------------------------ 114.7/114.7 kB 741.0 kB/s eta 0:00:00
Installing collected packages: neattext
Successfully installed neattext-0.1.3


In [2]:
from wordcloud import WordCloud
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
# !pip install neattext
from keras.models import load_model
import neattext.functions as nfx
import matplotlib.pyplot as plt
import plotly.express as plx
from sklearn.metrics import classification_report
import keras
from keras.layers import Embedding,Dense,LSTM,Bidirectional,GlobalMaxPooling1D,Input,Dropout
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from keras.models import Sequential
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer




In [3]:
from keras.utils import pad_sequences
from tqdm import tqdm
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings('ignore')

### Fetching the Merged Data

In [4]:
# Load the merged dataset from a CSV in the working directory; later cells read its "Text" and "label" columns.
df = pd.read_csv("merged_data.csv")

In [5]:
df.head(20)

Unnamed: 0.4,label,Text,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Unnamed: 0.3
0,1,Deep in the darkest part of your heart Every b...,,,,
1,1,"Pining for a much better quality of life, Open...",,,,
2,1,Everyone should care. No one should ignore a w...,,,,
3,1,h-healthy food e-eat a lot a-are you healthy l...,,,,
4,1,Adorable are women as mothers! Detestable when...,,,,
5,1,Aborigines lived here for forty thousand years...,,,,
6,1,"Doctors are a noble lot Often misunderstood, a...",,,,
7,1,Every flower that blooms withers Very true is ...,,,,
8,1,He gave forgiveness for all who repent In His ...,,,,
9,1,Kindles my heart with love enthuse my soul wit...,,,,


In [22]:
# Lower-case the document text; this overwrites the "Text" column of df.
df["Text"] = df["Text"].str.lower()
X = df["Text"]  # model input: document text
y = df["label"]  # target: class label
# Hold out 30% of the rows for testing; random_state is fixed so the split is repeatable.
# NOTE(review): no stratify argument is passed — confirm the class balance of both splits is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
X_train

79     6 devotional paths   to the divine  you  may  ...
340    www.samsung.comuser manual2 about this manual ...
18     i don’t desire to wear a sad face too long whe...
292    google vs. microsoft: tech titans lock horns o...
104    provided by   accounting financial statements ...
                             ...                        
188    lpl - production test collection  centre secto...
71     forests: our lifeline  12 one evening boojho e...
106    financial statements dr. derek farnsworth  |  ...
270    regulatory intervention spurs reflection in fi...
102    73   research papers   faculty of materials sc...
Name: Text, Length: 243, dtype: object

In [24]:
y_train

79     0
340    4
18     1
292    7
104    6
      ..
188    8
71     0
106    6
270    7
102    6
Name: label, Length: 243, dtype: int64

### Data Preprocessing

In [25]:
def clean_text(text):
    """Normalise every document in ``text`` and measure the result.

    Each item is lower-cased, then passed through
    ``nfx.remove_special_characters`` and ``nfx.remove_stopwords``, in that
    order. Short-word removal (``nfx.remove_shortwords``) is not applied.
    Progress is reported through ``tqdm``.

    Args:
        text: Iterable of strings to clean.

    Returns:
        A ``(cleaned_text, text_length)`` tuple: the cleaned strings, and the
        whitespace-separated token count of each cleaned string, both in
        input order.
    """
    cleaned_text = []
    for raw_doc in tqdm(text):
        without_specials = nfx.remove_special_characters(raw_doc.lower())
        cleaned_text.append(nfx.remove_stopwords(without_specials))
    # Lengths are taken after cleaning, so they count the tokens that survive it.
    text_length = [len(doc.split()) for doc in cleaned_text]
    return cleaned_text, text_length

In [26]:
# Clean both splits; each call returns the cleaned strings plus their per-document token counts.
# NOTE(review): none of these four results is referenced by a later visible cell — confirm whether they were meant to feed the tokenizer.
cleaned_train_text,train_text_length=clean_text(X_train)
cleaned_test_text,test_text_length=clean_text(X_test)

100%|██████████| 243/243 [00:01<00:00, 186.89it/s]
100%|██████████| 105/105 [00:00<00:00, 190.19it/s]


In [27]:
# Build the vocabulary from the training split only.
# NOTE(review): this fits on the raw X_train, not on cleaned_train_text returned by clean_text — confirm which was intended.
tokenizer=Tokenizer()
tokenizer.fit_on_texts(X_train)

In [28]:
# Word/count table built from the tokenizer's word_counts, sorted with the highest counts first.
word_freq=pd.DataFrame(tokenizer.word_counts.items(),columns=['word','count']).sort_values(by='count',ascending=False)

In [29]:
# Map each training document to a sequence of token ids, then bring every sequence to length 400.
# NOTE(review): pad_sequences is called with its default padding/truncating sides — confirm which end of long documents is kept.
train_text_seq=tokenizer.texts_to_sequences(X_train)
train_text_pad=pad_sequences(train_text_seq,maxlen=400)


# Same conversion for the test split, reusing the tokenizer fitted on the training data.
test_text_seq=tokenizer.texts_to_sequences(X_test)
test_text_pad=pad_sequences(test_text_seq,maxlen=400)

In [30]:
# Vocabulary size: number of entries in the tokenizer's word_index.
v=len(tokenizer.word_index)

In [31]:
v

33605

In [14]:
word_freq[10:300]

Unnamed: 0,word,count
108,or,4705
152,that,4697
58,by,4488
45,are,4275
245,with,4256
...,...,...
651,increase,368
1017,effect,367
3516,rationalised,364
758,products,363


##### Using GloVe Embeddings in .pkl format

In [15]:
# Load the pre-computed GloVe lookup from a local pickle file; later code calls .get(word) on it.
# NOTE(security): pickle.load can execute arbitrary code — only open a .pkl file that comes from a trusted source.
with open('glove.840B.300d.pkl', 'rb') as fp:
    glove_embedding = pickle.load(fp)

##### One-hot encoding

In [32]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the labels into 10-column vectors, matching the categorical_crossentropy loss used at compile time.
y_train_onehot = to_categorical(y_train, num_classes=10)
y_test_onehot = to_categorical(y_test, num_classes=10)

In [33]:
y_train_onehot

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

#### Embeddings Generation

In [34]:
# Build the embedding weight table: one 300-column row per tokenizer index.
v=len(tokenizer.word_index)
# v+1 rows; the extra row presumably leaves index 0 for the padding id — TODO confirm tokenizer indices start at 1.
embedding_matrix=np.zeros((v+1,300), dtype=float)
for word,idx in tokenizer.word_index.items():
    embedding_vector=glove_embedding.get(word)
    # Words missing from the GloVe lookup keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[idx]=embedding_vector

#### LSTM based Model Architecture Definition

In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, GlobalMaxPooling1D, Dense
from tensorflow.keras.optimizers import Adam  
import tensorflow.keras.backend as K

# Number of units in the final softmax layer (one per class).
num_classes = 10  

# Layers are stacked in the order they are added below.
model = Sequential()

# Each sample is a sequence of 400 integers, matching maxlen=400 used when padding.
model.add(Input(shape=(400,)))

# Embedding table initialised from embedding_matrix and excluded from training (trainable=False).
model.add(Embedding(v+1, 300, weights=[embedding_matrix], trainable=False))

# return_sequences=True keeps the per-timestep outputs so the pooling layer below has a sequence to reduce.
model.add(LSTM(20, return_sequences=True))

model.add(GlobalMaxPooling1D())

model.add(Dense(256, activation='relu'))

# Output layer: a probability distribution over the num_classes categories.
model.add(Dense(num_classes, activation='softmax')) 

# categorical_crossentropy expects the one-hot targets that are passed to model.fit.
model.compile(optimizer=Adam(learning_rate=0.005), loss='categorical_crossentropy', metrics=['accuracy'])  


In [36]:
# Training callbacks.
# NOTE(review): early_stop is created here, but the model.fit call in the next cell passes only reducelr — confirm whether early stopping was intended.
early_stop=EarlyStopping(patience=5)
reducelr=ReduceLROnPlateau(patience=3)

In [37]:
# Train for 10 epochs with batches of 24 samples; r holds the object returned by fit.
# NOTE(review): the test split is passed as validation_data, so the ReduceLROnPlateau callback reacts to test-set metrics — confirm this is acceptable.
r=model.fit(train_text_pad,np.array(y_train_onehot),validation_data=(test_text_pad,np.array(y_test_onehot)),
            epochs=10,batch_size=24,callbacks=[reducelr])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### 100% Training Accuracy and 98.10% Testing Accuracy on the Collected Dataset 

### Testing on Test PDFs

In [211]:
import os
import PyPDF2
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


def extract_text_from_pdf(pdf_path, max_pages=100):
    """Extract the text of a PDF's leading pages as one flattened string.

    Args:
        pdf_path: Path of the PDF file to read.
        max_pages: Upper bound on the number of pages read, counted from the
            start of the document. Defaults to 100, the limit that was
            previously hard-coded.

    Returns:
        The concatenated text of the pages read, with every newline replaced
        by a space and leading/trailing whitespace stripped.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Short documents are read in full; longer ones are cut off at max_pages.
        page_count = min(len(reader.pages), max_pages)
        # Pages must be read while the file is still open.
        # `or ''` guards against a page for which extract_text() yields None —
        # assumption about some PyPDF2 versions, TODO confirm.
        page_texts = [
            reader.pages[page_num].extract_text() or ''
            for page_num in range(page_count)
        ]
    # Join once instead of growing a string page by page.
    return ''.join(page_texts).replace("\n", " ").strip()


# Parallel lists: predicted_labels[i] and ground_truth_labels[i] refer to the same PDF.
predicted_labels = []
ground_truth_labels = []

# NOTE(review): folder_path is not assigned in any visible cell — it must already exist in the kernel; confirm where it is defined.
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):

        pdf_text = extract_text_from_pdf(os.path.join(folder_path, filename))
        # Same tokenizer and maxlen=400 padding as used for the training data.
        preprocessed_text = tokenizer.texts_to_sequences([pdf_text])
        padded_text=pad_sequences(preprocessed_text,maxlen=400)
        predictions = model.predict(padded_text)
        # The position of the highest score is taken as the predicted class.
        predicted_class_index = np.argmax(predictions)
        predicted_labels.append(predicted_class_index)

        # The part of the file name before the first "." is parsed as an integer; subtracting 1 gives the label.
        # presumably the files are named 1.pdf ... 10.pdf for classes 0-9 — verify against the test folder
        ground_truth_label = int(filename.split(".")[0]) - 1
        ground_truth_labels.append(ground_truth_label)

# Share of files whose prediction equals the label derived from the file name, as a percentage.
accuracy = np.mean(np.array(predicted_labels) == np.array(ground_truth_labels)) * 100
print(f"Accuracy: {accuracy:.2f}%")       

Accuracy: 100.00%


In [212]:
predicted_labels

[0, 9, 1, 2, 3, 4, 5, 6, 7, 8]

In [213]:
ground_truth_labels

[0, 9, 1, 2, 3, 4, 5, 6, 7, 8]

### 100% Accuracy on the given test files

In [None]:
# Save the model to 'my_model.h5' in the working directory.
model.save('my_model.h5')

In [39]:
# Rebind `model` to the copy loaded from 'my_model.h5'.
model = load_model('my_model.h5')

### LLM Server (In progress)

##### We are trying to fine-tune a local LLM and then run it in LM Studio

#### Not yet fully developed

In [27]:
from openai import OpenAI

# Point the OpenAI client at a server on localhost:1234 rather than the hosted API; the api_key value is a placeholder.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

def llm_output(prompts, llm_client=None):
    """Ask the chat-completions endpoint to categorise each document.

    One request is sent per prompt: a fixed system instruction followed by a
    user message that embeds the prompt text.

    Args:
        prompts: Iterable of document strings to classify.
        llm_client: Optional object exposing ``chat.completions.create``.
            When omitted, the notebook-level ``client`` is used.

    Returns:
        A list with one entry per prompt, in input order, each being the
        ``message`` of the first choice in the corresponding response.
    """
    # Look up the global only when no explicit client is supplied.
    active_client = client if llm_client is None else llm_client
    llm_list = []
    for prompt in prompts:
        completion = active_client.chat.completions.create(
            model="local-model",  # this field is currently unused
            messages=[
                {"role": "system", "content": "Answer in the range of 0 - 9"},
                {
                    "role": "user",
                    # The f-prefix must be on this string (it was previously on
                    # the "role" key), otherwise the literal text "{prompt}" is
                    # sent and the document never reaches the model.
                    "content": f"Tell in which category this document lies : {prompt}",
                },
            ],
            temperature=0.7,
        )
        llm_list.append(completion.choices[0].message)
    return llm_list

In [137]:
# Smoke-test the helper with a single prompt and print the returned message objects.
llm = llm_output(["Great product, everyone should buy it"])
print(llm)

[ChatCompletionMessage(content=' Fake\n', role='assistant', function_call=None, tool_calls=None)]
