In [1]:
### Importing the necessary modules ###
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
# Load the training set; the first CSV column is used as the row index.
total_data = pd.read_csv(
    'training_data.csv',
    index_col=0,
)

In [3]:
# Preview the first rows to sanity-check the loaded columns.
total_data.head()

Unnamed: 0,feature,actual_val,no_of_mistakes
0,==============\nreStructuredText\n==\n\n**reSt...,==============\nreStructuredText\n============...,105
1,==============\n reStructuredText\n========...,==============\nreStructuredText\n============...,113
2,==============\n reStructuredText\n========...,==============\nreStructuredText\n============...,108
3,==============\nreStructuredText\n=======\...,==============\nreStructuredText\n============...,113
4,==============\n reStructuredText\n =======...,==============\nreStructuredText\n============...,112


In [4]:
# Represent each document as a TF-IDF weighted bag-of-words vector (weights,
# not raw token counts). English stop words are removed, text is lower-cased,
# and min_df=1 keeps every remaining term.
feature_extraction = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    min_df=1,
)

# The vocabulary and IDF weights are learned from the `feature` column only;
# `actual_val` is then projected onto that same vocabulary, so both matrices
# share the same columns.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so test documents influence the vocabulary and IDF weights.
train_features = feature_extraction.fit_transform(total_data['feature'])
train_labels = feature_extraction.transform(total_data['actual_val'])


In [5]:
# Expand both vectorizer outputs into dense NumPy arrays for the downstream split.
train_arr_feature_dense, train_arr_label_dense = (
    vectorized.toarray() for vectorized in (train_features, train_labels)
)

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_arr_feature_dense,
    train_arr_label_dense,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Wrap each NumPy split in a TensorFlow tensor.
X_train_t, X_test_t, Y_train_t, Y_test_t = map(
    tf.convert_to_tensor,
    (X_train, X_test, Y_train, Y_test),
)

In [8]:
# Take the layer widths from the data instead of hard-coding the vocabulary
# size, so a different dataset or vectorizer setting does not break the model.
n_input_terms = X_train.shape[1]
n_output_terms = Y_train.shape[1]

# Fully connected regressor mapping a TF-IDF vector to a TF-IDF vector:
# two 350-unit ReLU layers (dropout after the first) and a linear output layer.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_input_terms,)),
    tf.keras.layers.Dense(units=350, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(units=350, activation='relu'),
    tf.keras.layers.Dense(units=n_output_terms, activation='linear'),
])

In [9]:
# Regression setup: minimise mean squared error with the Adam optimiser.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
)

In [10]:
# Use the validation loss that is already being computed: stop once it has not
# improved for `patience` epochs and roll back to the best weights seen, rather
# than always running the full 500 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=25,
    restore_best_weights=True,
)

# 20% of the training rows are held out by Keras for validation. The History
# object is kept so the loss curves can be inspected or plotted afterwards.
history = model.fit(
    X_train_t,
    Y_train_t,
    epochs=500,  # upper bound; early stopping normally ends training sooner
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - loss: 0.0033 - val_loss: 4.8967e-04
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 4.8308e-04 - val_loss: 7.3973e-05
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.5494e-04 - val_loss: 1.5785e-05
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 9.1027e-05 - val_loss: 8.5789e-06
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6.5028e-05 - val_loss: 1.3866e-05
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 6.0515e-05 - val_loss: 1.0955e-05
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5.5841e-05 - val_loss: 7.2784e-06
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5.0842e-05 - val_loss

<keras.src.callbacks.history.History at 0x202ed949eb0>

In [11]:
# Loss over all of X_train_t, which includes the rows Keras held out for
# validation during fitting.
loss_training = model.evaluate(x=X_train_t, y=Y_train_t, verbose=1)
print(f"Loss: {loss_training}")


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.3123e-08 
Loss: 3.287149041852899e-08


In [12]:
# Run the trained network on the held-out test features.
predictions = model.predict(x=X_test_t)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


In [13]:
# Loss on the held-out test split, for comparison with the training loss.
loss_test = model.evaluate(x=X_test_t, y=Y_test_t)
print(f"Loss:{loss_test}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.5298e-08  
Loss:3.632763778682602e-08


In [14]:
# Show the raw predicted vectors (one row per test document).
predictions

array([[-2.8311927e-04, -7.4925832e-05, -8.5402280e-06, ...,
         1.5643612e-04,  7.3278904e-02,  7.6197378e-02],
       [-2.5093742e-04, -1.4787540e-05, -1.3770536e-05, ...,
        -2.6412308e-06,  2.6393682e-04,  6.3374966e-02],
       [-2.5093742e-04, -1.4787540e-05, -1.3770536e-05, ...,
        -2.6412308e-06,  2.6393682e-04,  6.3374966e-02],
       ...,
       [-3.6093220e-04, -2.4267938e-04, -1.2497045e-04, ...,
         9.4063438e-02,  9.4263621e-02,  4.8806623e-02],
       [-2.5093649e-04, -1.4791265e-05, -1.3770536e-05, ...,
        -2.6449561e-06,  2.6393868e-04,  6.3374966e-02],
       [-2.0845421e-04,  6.9928713e-02,  6.9915958e-02, ...,
         1.0297676e-01,  1.2980960e-04,  2.6845830e-02]], dtype=float32)

In [15]:
# Recover, per test document, the vocabulary terms with a non-zero TF-IDF weight.
# Pass the NumPy array directly: scikit-learn expects array-like / sparse input,
# and X_test holds the same values the tensor X_test_t was built from.
X_test_text = feature_extraction.inverse_transform(X_test)

In [16]:
# inverse_transform returns every term whose entry is non-zero. The network's
# dense output has small non-zero noise in almost every column, so without a
# cut-off nearly the whole vocabulary would be reported for each document.
# NOTE(review): the threshold was picked from the printed predictions (noise
# around 1e-4, genuine weights around 1e-2 and above) - tune it for other data.
PREDICTION_WEIGHT_THRESHOLD = 1e-2
predictions_denoised = np.where(
    predictions > PREDICTION_WEIGHT_THRESHOLD,
    predictions,
    0.0,
)

# Despite the name, this holds the terms decoded from the model's predictions.
Y_test_text = feature_extraction.inverse_transform(predictions_denoised)

In [17]:
# Display the terms recovered from each test input document.
X_test_text

[array(['2024', '300px', '31', '400px', 'add', 'additional', 'align',
        'alt', 'august', 'author', 'center', 'code', 'com', 'conclusion',
        'create', 'data', 'date', 'def', 'definition', 'demonstrates',
        'different', 'directive', 'directives', 'document',
        'documentation', 'docutils', 'example', 'features', 'follows',
        'function', 'functions', 'header', 'heading', 'headings',
        'highlight', 'html', 'https', 'illustrates', 'image', 'images',
        'important', 'include', 'including', 'information', 'introduction',
        'io', 'item', 'jpg', 'key', 'level', 'levels', 'link', 'links',
        'list', 'lists', 'note', 'number', 'numbers', 'official',
        'ordered', 'org', 'output', 'overview', 'paragraph', 'param',
        'png', 'print', 'provide', 'provides', 'python', 'refer', 'remote',
        'rest', 'restructuredtext', 'result', 'return', 'right', 'row',
        'rst', 'second', 'section', 'sections', 'simple', 'sourceforge',
        'su