In [1]:
import tensorflow as tf
import pandas as pd
import os
import tarfile
import urllib.request


In [2]:
# ##extracting data
# # Path to the .tar.gz file
# tar_file_path = r"C:\Users\sergi\Documents\Py\IMDB_ML\aclImdb_v1.tar.gz"
# extract_path = r"C:\Users\sergi\Documents\Py\IMDB_ML\aclImdb"

# # Extract the tar.gz file
# with tarfile.open(tar_file_path, "r:gz") as tar:
#     tar.extractall(path=extract_path)

# print("Extraction complete.")

In [3]:
##Reading text 
def read_text_files(directory):
    text_data = []
    for root, dirs, files in os.walk(directory):
        for file_name in files:
            if file_name.endswith(".txt"):  # Ensure we are reading only text files
                file_path = os.path.join(root, file_name)
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                    text_data.append(content)
    return text_data

In [4]:
train_pos_reviews_path = r"C:\Users\sergi\Documents\Py\IMDB_ML\aclImdb\aclImdb\train\pos"
train_neg_reviews_path = r"C:\Users\sergi\Documents\Py\IMDB_ML\aclImdb\aclImdb\train\neg"
test_pos_reviews_path = r"C:\Users\sergi\Documents\Py\IMDB_ML\aclImdb\aclImdb\test\pos"
test_neg_reviews_path = r"C:\Users\sergi\Documents\Py\IMDB_ML\aclImdb\aclImdb\test\neg"


In [5]:
Train_Positive=(read_text_files(train_pos_reviews_path))
Test_Positive=(read_text_files(test_pos_reviews_path))
Train_Negative=(read_text_files(train_neg_reviews_path))
Test_Negative=(read_text_files(test_neg_reviews_path))


In [6]:
import numpy as np

# Convert labels to one-hot encoding
def to_one_hot(labels, num_classes):
    return np.eye(num_classes)[labels]

# Number of classes
num_classes = 2

# Generate labels (one-hot encoded)
Train_pos = to_one_hot([0] * len(Train_Positive), num_classes)
Train_neg = to_one_hot([1] * len(Train_Negative), num_classes)
Test_pos = to_one_hot([0] * len(Test_Positive), num_classes)
Test_neg = to_one_hot([1] * len(Test_Negative), num_classes)

# Split data into training and validation sets
split_index_train = int(len(Train_Positive) / 2)
split_index_val = int(len(Test_Positive) / 2)

# Training data
in_Train_positive = Train_Positive[:split_index_train]
in_Train_negative = Train_Negative[:split_index_train]
Train_txt = in_Train_positive + in_Train_negative
Train_out = np.concatenate((Train_pos[:split_index_train], Train_neg[:split_index_train]))

# Validation data
in_val_positive = Test_Positive[:split_index_val]
in_val_negative = Test_Negative[:split_index_val]
Vali_txt = in_val_positive + in_val_negative
Vali_out = np.concatenate((Test_pos[:split_index_val], Test_neg[:split_index_val]))

In [7]:
##preprocessing using textvectorisation
text_vec_layer = tf.keras.layers.TextVectorization(
    output_mode="tf_idf",
    max_tokens=1000  # Reduce based on your memory capacity
)
# Adapting the layer to the training data
text_vec_layer.adapt(Train_txt)

# Vectorizing the data
Vectorised_Train = text_vec_layer(Train_txt)
Vectorised_Val = text_vec_layer(Vali_txt)


In [8]:


model = tf.keras.Sequential([
    tf.keras.layers.Dense(5000, activation='relu'),  
    tf.keras.layers.Dense(1000, activation='relu'),  
    tf.keras.layers.Dense(500, activation='relu'),  
    tf.keras.layers.Dense(num_classes, activation='softmax')  # Output layer with softmax activation with softmax
])


In [9]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  # Use binary_crossentropy because we are dealing with either good or bad classification
    metrics=['accuracy']
)

In [10]:
# Prepare datasets
train_dataset = tf.data.Dataset.from_tensor_slices((Vectorised_Train, Train_out)).batch(1000)
val_dataset = tf.data.Dataset.from_tensor_slices((Vectorised_Val, Vali_out)).batch(100)


In [11]:
model.fit(train_dataset, epochs=30, validation_data=val_dataset)  # Validation data is optional but useful


Epoch 1/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 353ms/step - accuracy: 0.5645 - loss: 81.3945 - val_accuracy: 0.5000 - val_loss: 9.5735
Epoch 2/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 290ms/step - accuracy: 0.0845 - loss: 11.1096 - val_accuracy: 0.5207 - val_loss: 0.7322
Epoch 3/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 259ms/step - accuracy: 0.2963 - loss: 0.9901 - val_accuracy: 0.5000 - val_loss: 0.7198
Epoch 4/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 254ms/step - accuracy: 0.1108 - loss: 0.8670 - val_accuracy: 0.5914 - val_loss: 0.6752
Epoch 5/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 283ms/step - accuracy: 0.2850 - loss: 0.7262 - val_accuracy: 0.5026 - val_loss: 0.6861
Epoch 6/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 284ms/step - accuracy: 0.1450 - loss: 0.8161 - val_accuracy: 0.5840 - val_loss: 0.6546
Epoch 7/30
[1m13/13[0m 

<keras.src.callbacks.history.History at 0x17d428f94d0>

In [13]:
model.save("C:\\Users\\sergi\\Documents\\Py\\IMDB_ML\\IMBD_reviews_vectorisation_v1.keras")


In [14]:
test_loss, test_accuracy = model.evaluate(val_dataset)
print(f'Categorical Cross entropy Loss: {test_loss}') ###Loss value on th last dataset
print(f'Test Accuracy: {test_accuracy}')

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8408 - loss: 0.4694
Categorical Cross entropy Loss: 0.5280663967132568
Test Accuracy: 0.852400004863739


In [15]:
##Transforming the softmax output into final output
def pos_net(pre):
    a=(pre.tolist())
    a=[round(i) for i in a]
    if round(a[0])>0.5: 
        return "pos"
    if round(a[1])>0.5: 
        return "neg"
    else:
        return "Not properly classified"

In [16]:
##Making prediction on only negative reviews
Vectorised_Test_neg = text_vec_layer(Test_Negative)
Predictions_Negative = model.predict(Vectorised_Test_neg)

Predictions_Negative_Output=[pos_net(Predictions_Negative[i]) for i in range(0,len(Predictions_Negative))]

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step


In [17]:
count_neg = Predictions_Negative_Output.count("neg")

# Calculate the total number of predictions
total_predictions_neg_set = len(Predictions_Negative_Output)

# Calculate the proportion
accuracy_neg = count_neg / total_predictions_neg_set

# Print the result using formatted strings
print(f"Accuracy for negative testing set: {accuracy_neg:.2f}")

Accuracy for negative testing set: 0.87


In [18]:
##Making prediction on only positive reviews
Vectorised_Test_pos = text_vec_layer(Test_Positive)
Predictions_Positive = model.predict(Vectorised_Test_pos)

Predictions_Positive_Output=[pos_net(Predictions_Positive[i]) for i in range(0,len(Predictions_Positive))]

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step


In [19]:
count_pos = Predictions_Positive_Output.count("pos")

# Calculate the total number of predictions
total_predictions_pos_set = len(Predictions_Positive_Output)

# Calculate the proportion
accuracy_pos = count_pos / total_predictions_pos_set

# Print the result using formatted strings
print(f"Accuracy for positive testing set: {accuracy_pos:.2f}")

Accuracy for positive testing set: 0.83


In [20]:
##Total testing set accuracy
Testing_Accuracy=(count_pos+count_neg)/(total_predictions_neg_set+total_predictions_pos_set)
print(f"Accuracy for overall testing set: {Testing_Accuracy:.2f}")

Accuracy for overall testing set: 0.85


##performed on the data obtained in; 
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
