In [64]:
### Importing the necessary modules ###
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [65]:
# Read the training corpus; the CSV's first column is used as the DataFrame index.
total_data = pd.read_csv(filepath_or_buffer='training_data.csv', index_col=0)

In [66]:
# Preview the first rows to sanity-check the columns that were loaded.
total_data.head()

Unnamed: 0,feature,actual_val,no_of_mistakes
0,==============\nreStructuredText\n============...,==============\nreStructuredText\n============...,23
1,==============\nreStructuredText\n============...,==============\nreStructuredText\n============...,25
2,==============\nreStructuredText\n============...,==============\nreStructuredText\n============...,20
3,==============\nreStructuredText\n============...,==============\nreStructuredText\n============...,21
4,==============\nreStructuredText\n============...,==============\nreStructuredText\n============...,21


In [67]:
# A single TF-IDF vectoriser is shared by both text columns, so inputs and
# targets are expressed over the same vocabulary (learned from `feature` only).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Learn the vocabulary / IDF weights from the input text and vectorise it ...
train_features = feature_extraction.fit_transform(total_data['feature'])
# ... then project the target text onto that already-fitted vocabulary.
train_labels = feature_extraction.transform(total_data['actual_val'])

In [68]:
# Densify both sparse TF-IDF matrices into plain NumPy arrays.
train_arr_feature_dense, train_arr_label_dense = (
    sparse_matrix.toarray() for sparse_matrix in (train_features, train_labels)
)

In [69]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_arr_feature_dense,
    train_arr_label_dense,
    test_size=0.2,
    random_state=42,
)

In [70]:
# Wrap each NumPy split in a TensorFlow tensor (names carry a `_t` suffix).
X_train_t, X_test_t, Y_train_t, Y_test_t = map(
    tf.convert_to_tensor,
    (X_train, X_test, Y_train, Y_test),
)

In [71]:
# Seed Python, NumPy and the TensorFlow backend so that weight initialisation
# and dropout masks are repeatable on a fresh "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Layer widths are read from the data instead of a hard-coded 227: the TF-IDF
# vocabulary size changes whenever the corpus or the vectoriser settings
# change, and a stale literal would then fail with a shape mismatch in fit().
n_input_terms = X_train.shape[1]
n_output_terms = Y_train.shape[1]

# Fully connected regressor: TF-IDF vector in -> TF-IDF vector out.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_input_terms,)),
    tf.keras.layers.Dense(units=350, activation='relu'),
    tf.keras.layers.Dropout(0.3),  # regularisation between the two hidden layers
    tf.keras.layers.Dense(units=350, activation='relu'),
    # Linear output: one unbounded real value per vocabulary term.
    tf.keras.layers.Dense(units=n_output_terms, activation='linear')
])

In [72]:
# Regression objective: minimise mean squared error using the Adam optimiser.
model.compile(loss='mean_squared_error', optimizer='adam')

In [73]:
# Train for 500 epochs in mini-batches of 32; 20% of the training rows are
# held back by Keras to report `val_loss` after every epoch.
model.fit(
    x=X_train_t,
    y=Y_train_t,
    batch_size=32,
    epochs=500,
    validation_split=0.2,
)

Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 0.0030 - val_loss: 4.5449e-04
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.6239e-04 - val_loss: 1.0397e-04
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.5940e-04 - val_loss: 2.2392e-05
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 9.8864e-05 - val_loss: 1.0790e-05
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 6.8478e-05 - val_loss: 6.8939e-06
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5.9698e-05 - val_loss: 6.0422e-06
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5.2717e-05 - val_loss: 1.0625e-05
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 4.8711e-05 - val_loss

<keras.src.callbacks.history.History at 0x2c088dbd3a0>

In [74]:
# Mean squared error of the fitted model on the data it was trained on.
loss_training = model.evaluate(x=X_train_t, y=Y_train_t, verbose=1)
print(f"Loss: {loss_training}")


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.0029e-08 
Loss: 3.0432772746280534e-08


In [75]:
# Predict a TF-IDF vector for every held-out input document.
predictions = model.predict(x=X_test_t)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


In [76]:
# Mean squared error on the held-out test split.
loss_test = model.evaluate(x=X_test_t, y=Y_test_t)
print(f"Loss:{loss_test}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.0767e-08  
Loss:3.040983997948388e-08


In [77]:
# Display the raw predicted matrix (one row per test document).
predictions

array([[-1.4145672e-04,  5.3141266e-06,  3.9875507e-05, ...,
         3.2238849e-04,  7.3262706e-02,  7.6314703e-02],
       [-7.3960051e-05,  3.0361116e-07,  5.9876591e-05, ...,
         3.1131506e-04,  1.5707593e-04,  6.3452542e-02],
       [-7.3960051e-05,  3.0361116e-07,  5.9876591e-05, ...,
         3.1131506e-04,  1.5707593e-04,  6.3452542e-02],
       ...,
       [-2.5138259e-05,  4.5076013e-07, -4.4684857e-05, ...,
         9.3877316e-02,  9.3814164e-02,  4.8870899e-02],
       [-7.3961914e-05,  3.0733645e-07,  5.9876591e-05, ...,
         3.1131506e-04,  1.5707593e-04,  6.3452542e-02],
       [-9.2992559e-05,  7.0201322e-02,  7.0230670e-02, ...,
         1.0312909e-01,  1.7690007e-05,  2.6805408e-02]], dtype=float32)

In [78]:
# Recover, for each test document, the vocabulary terms whose TF-IDF weight
# is non-zero in the input vector.
X_test_text = feature_extraction.inverse_transform(X=X_test_t)

In [79]:
# inverse_transform() keeps every term whose entry is non-zero. The model's
# linear output layer practically never emits an exact 0.0 (the displayed
# `predictions` show residuals around 1e-4 where a term is absent), so decoding
# the raw predictions would return almost the whole vocabulary for every
# document. Zero out near-zero weights before decoding.
# NOTE(review): 1e-2 is a heuristic cut-off chosen from the displayed output
# (noise ~1e-4, genuine weights ~1e-2..1e-1) -- tune it for other corpora.
PREDICTION_MIN_WEIGHT = 1e-2
predictions_thresholded = np.where(predictions > PREDICTION_MIN_WEIGHT, predictions, 0.0)

# Despite the name, this holds the terms decoded from the model's predictions
# for the test inputs, not from the ground-truth `Y_test`.
Y_test_text = feature_extraction.inverse_transform(predictions_thresholded)

In [81]:
# Display the decoded term lists for the test inputs (one array per document).
X_test_text

[array(['2024', '300px', '31', '400px', 'add', 'additional', 'align',
        'alt', 'august', 'author', 'center', 'code', 'com', 'conclusion',
        'create', 'data', 'date', 'def', 'definition', 'demonstrates',
        'different', 'directive', 'directives', 'document',
        'documentation', 'docutils', 'example', 'features', 'follows',
        'function', 'functions', 'header', 'heading', 'headings',
        'highlight', 'html', 'https', 'illustrates', 'image', 'images',
        'important', 'include', 'including', 'information', 'introduction',
        'io', 'item', 'jpg', 'key', 'level', 'levels', 'link', 'links',
        'list', 'lists', 'note', 'number', 'numbers', 'official',
        'ordered', 'org', 'output', 'overview', 'paragraph', 'param',
        'png', 'print', 'provide', 'provides', 'python', 'refer', 'remote',
        'rest', 'restructuredtext', 'result', 'return', 'right', 'row',
        'rst', 'second', 'section', 'sections', 'simple', 'sourceforge',
        'su