In [1]:
### Importing the necessary modules ###
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
# Load the training corpus; the first CSV column becomes the row index.
total_data = pd.read_csv(
    'training_data.csv',
    index_col=0,
)

In [3]:
# Preview the first rows of the loaded frame.
total_data.head()

Unnamed: 0,feature,actual_val,no_of_mistakes
0,==============\nreStructuredText\n============...,==============\nreStructuredText\n============...,23
1,==============\nreStructuredText\n============...,==============\nreStructuredText\n============...,25
2,==============\nreStructuredText\n============...,==============\nreStructuredText\n============...,20
3,==============\nreStructuredText\n============...,==============\nreStructuredText\n============...,21
4,==============\nreStructuredText\n============...,==============\nreStructuredText\n============...,21


In [4]:
# Vectorise the text with TF-IDF weights (not raw token counts). English stop
# words are removed and the text is lower-cased before tokenisation.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# The vocabulary is learned from the `feature` column only; `actual_val` is then
# projected onto that same vocabulary so inputs and targets share one column layout.
# NOTE(review): tokens that occur only in `actual_val` are dropped by `transform`,
# and the vectoriser is fitted before the train/test split - confirm both are intended.
train_features = feature_extraction.fit_transform(total_data['feature'])
train_labels = feature_extraction.transform(total_data['actual_val'])


In [5]:
# Convert the vectoriser outputs to dense NumPy arrays for splitting and training.
train_arr_feature_dense, train_arr_label_dense = (
    train_features.toarray(),
    train_labels.toarray(),
)

In [6]:
# Hold out 20 % of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_arr_feature_dense,
    train_arr_label_dense,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Wrap each NumPy split in a TensorFlow tensor (same order as the split above).
X_train_t, X_test_t, Y_train_t, Y_test_t = (
    tf.convert_to_tensor(split)
    for split in (X_train, X_test, Y_train, Y_test)
)

In [8]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# dropout are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Layer widths follow the data instead of a hard-coded vocabulary size: the
# TF-IDF column count changes whenever the training corpus changes.
n_input_features = X_train.shape[1]
n_output_features = Y_train.shape[1]

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_input_features,)),
    tf.keras.layers.Dense(units=350, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(units=350, activation='relu'),
    # Linear output: the network regresses the TF-IDF weights of the target text.
    tf.keras.layers.Dense(units=n_output_features, activation='linear')
])

In [9]:
# Regression set-up: Adam optimiser minimising mean squared error.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
)

In [10]:
# Train for 500 epochs in mini-batches of 32, holding out 20 % of the training
# tensors for validation.
model.fit(
    x=X_train_t,
    y=Y_train_t,
    epochs=500,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 0.0032 - val_loss: 4.8702e-04
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 4.7909e-04 - val_loss: 1.0076e-04
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.6946e-04 - val_loss: 2.3650e-05
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8.7701e-05 - val_loss: 8.0570e-06
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 6.4864e-05 - val_loss: 5.8990e-06
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6.2098e-05 - val_loss: 7.7838e-06
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 5.0854e-05 - val_loss: 4.8788e-06
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.8479e-05 - val_loss

<keras.src.callbacks.history.History at 0x21ce2063740>

In [11]:
# Score the fitted model on the training tensors using the compiled MSE loss.
loss_training = model.evaluate(
    x=X_train_t,
    y=Y_train_t,
    verbose=1,
)
print(f"Loss: {loss_training}")


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2.2935e-09 
Loss: 2.3488928579951107e-09


In [12]:
# Run the trained network on the held-out inputs.
predictions = model.predict(x=X_test_t)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


In [13]:
# Score the fitted model on the held-out tensors using the compiled MSE loss.
loss_test = model.evaluate(
    x=X_test_t,
    y=Y_test_t,
)
print(f"Loss:{loss_test}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2.2815e-09  
Loss:2.273832899746253e-09


In [14]:
# Display the predicted vectors for the held-out set.
predictions

array([[ 1.6353559e-05,  3.2852404e-05,  4.1042455e-05, ...,
        -2.6199967e-05,  7.3026873e-02,  7.6237544e-02],
       [-4.6872068e-05, -3.4178607e-05,  4.6668574e-06, ...,
        -5.4873526e-06,  9.3332492e-05,  6.3482240e-02],
       [-4.6872068e-05, -3.4178607e-05,  4.6668574e-06, ...,
        -5.4873526e-06,  9.3332492e-05,  6.3482240e-02],
       ...,
       [-2.4571549e-05,  4.0941872e-05,  4.3938868e-05, ...,
         9.3717903e-02,  9.3677193e-02,  4.8930507e-02],
       [-4.6874862e-05, -3.4197234e-05,  4.6622008e-06, ...,
        -5.4873526e-06,  9.3333423e-05,  6.3482240e-02],
       [-2.3592729e-05,  6.9975704e-02,  7.0055895e-02, ...,
         1.0288331e-01,  3.3101067e-05,  2.6867863e-02]], dtype=float32)

In [15]:
# Recover, per held-out document, the vocabulary terms with a non-zero TF-IDF weight.
# The NumPy split is passed directly: scikit-learn works on NumPy/SciPy inputs, so
# this avoids relying on an implicit TensorFlow-tensor -> ndarray conversion
# (X_test_t was created from X_test and holds the same values).
X_test_text = feature_extraction.inverse_transform(X_test)

In [16]:
# Map each predicted TF-IDF vector back to vocabulary terms.
# `inverse_transform` reports every term whose weight is non-zero, and the
# linear output layer emits small positive/negative noise (around 1e-5) rather
# than exact zeros, so without a cut-off nearly the whole vocabulary would be
# returned for every document.
PREDICTION_THRESHOLD = 1e-3  # heuristic noise floor - tune against validation data
predictions_denoised = np.where(predictions > PREDICTION_THRESHOLD, predictions, 0.0)
Y_test_text = feature_extraction.inverse_transform(predictions_denoised)

In [17]:
# Display the vocabulary terms recovered from the held-out inputs.
X_test_text

[array(['2024', '300px', '31', '400px', 'add', 'additional', 'align',
        'alt', 'august', 'author', 'center', 'code', 'com', 'conclusion',
        'create', 'data', 'date', 'def', 'definition', 'demonstrates',
        'different', 'directive', 'directives', 'document',
        'documentation', 'docutils', 'example', 'features', 'follows',
        'function', 'functions', 'header', 'heading', 'headings',
        'highlight', 'html', 'https', 'illustrates', 'image', 'images',
        'important', 'include', 'including', 'information', 'introduction',
        'io', 'item', 'jpg', 'key', 'level', 'levels', 'link', 'links',
        'list', 'lists', 'note', 'number', 'numbers', 'official',
        'ordered', 'org', 'output', 'overview', 'paragraph', 'param',
        'png', 'print', 'provide', 'provides', 'python', 'refer', 'remote',
        'rest', 'restructuredtext', 'result', 'return', 'right', 'row',
        'rst', 'second', 'section', 'sections', 'simple', 'sourceforge',
        'su