In [22]:
import tarfile
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import os
from multiprocessing import Pool




In [23]:
# Path to the compressed dataset
tar_path = 'C:/Users/Shubham S pathak/Downloads/aclImdb_v1.tar.gz'

# Safe extraction function
def is_safe_path(base_path, target):
    return os.path.realpath(target).startswith(os.path.realpath(base_path))

def safe_extract(tar, path=".", members=None):
    for member in tar.getmembers():
        member_path = os.path.join(path, member.name)
        if not is_safe_path(path, member_path):
            raise Exception("Attempted Path Traversal in Tar File")
    tar.extractall(path, members) 

# Extracting the dataset
with tarfile.open(tar_path, 'r:gz') as tar:
    safe_extract(tar, path='C:/Users/Shubham S pathak/Downloads/')
    
print('Dataset extracted safely and successfully!')



  tar.extractall(path, members)


Dataset extracted safely and successfully!


In [15]:
import os

# Check if the main folder exists
data_dir = 'C:/Users/Shubham S pathak/Downloads/aclImdb_v1.tar.gz'
if os.path.exists(data_dir):
    print("Main folder found!")
    print("Subfolders and files:")
    for root, dirs, files in os.walk(data_dir):
        print(f"📁 {root}")
        for d in dirs:
            print(f"  📂 {d}")
        for f in files[:5]:  # Print first 5 files only
            print(f"    📄 {f}")
else:
    print("Main folder not found. Extraction may have failed.")


Main folder found!
Subfolders and files:


In [21]:
import os

def load_imdb_data(data_dir):
    texts, labels = [], []
    for label_type in ['pos', 'neg']:
        dir_name = os.path.join(data_dir, label_type)
        print(f'Checking directory: {dir_name}')  # Debugging line
        for fname in os.listdir(dir_name):
            print(f'Found file: {fname}')  # Debugging line
            if fname.endswith('.txt'):
                with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                    texts.append(f.read())
                    labels.append(1 if label_type == 'pos' else 0)
    return texts, labels

# Loading training and testing data
train_texts, train_labels = load_imdb_data('C:/Users/Shubham S pathak/Downloads/aclImdb/train')
test_texts, test_labels = load_imdb_data('C:/Users/Shubham S pathak/Downloads/aclImdb/test')

print(f'Loaded {len(train_texts)} training and {len(test_texts)} testing reviews.')



Checking directory: C:/Users/Shubham S pathak/Downloads/aclImdb/train\pos
Found file: 0_9.txt
Found file: 10000_8.txt
Found file: 10001_10.txt
Found file: 10002_7.txt
Found file: 10003_8.txt
Found file: 10004_8.txt
Found file: 10005_7.txt
Found file: 10006_7.txt
Found file: 10007_7.txt
Found file: 10008_7.txt
Found file: 10009_9.txt
Found file: 1000_8.txt
Found file: 10010_7.txt
Found file: 10011_9.txt
Found file: 10012_8.txt
Found file: 10013_7.txt
Found file: 10014_8.txt
Found file: 10015_8.txt
Found file: 10016_8.txt
Found file: 10017_9.txt
Found file: 10018_8.txt
Found file: 10019_8.txt
Found file: 1001_8.txt
Found file: 10020_8.txt
Found file: 10021_8.txt
Found file: 10022_7.txt
Found file: 10023_9.txt
Found file: 10024_9.txt
Found file: 10025_9.txt
Found file: 10026_7.txt
Found file: 10027_7.txt
Found file: 10028_10.txt
Found file: 10029_10.txt
Found file: 1002_7.txt
Found file: 10030_10.txt
Found file: 10031_10.txt
Found file: 10032_10.txt
Found file: 10033_10.txt
Found file: 10

In [17]:
# Initialize tokenizer
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)

# Convert texts to sequences and pad them
X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=200)

# Convert labels to numpy arrays
y_train = np.array(train_labels)
y_test = np.array(test_labels)

print(f'Training data shape: {X_train.shape}')
print(f'Testing data shape: {X_test.shape}')


Training data shape: (25000, 200)
Testing data shape: (25000, 200)


In [25]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(200,)),  # Explicitly defining input shape
    tf.keras.layers.Embedding(10000, 128),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())



None


In [26]:
history = model.fit(X_train, y_train, epochs=3, batch_size=64, validation_data=(X_test, y_test))


Epoch 1/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 274ms/step - accuracy: 0.7460 - loss: 0.4974 - val_accuracy: 0.8552 - val_loss: 0.3492
Epoch 2/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 282ms/step - accuracy: 0.8958 - loss: 0.2644 - val_accuracy: 0.8375 - val_loss: 0.3784
Epoch 3/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 316ms/step - accuracy: 0.9192 - loss: 0.2106 - val_accuracy: 0.8674 - val_loss: 0.3241


In [27]:
predictions = (model.predict(X_test) > 0.5).astype('int32')

# Accuracy and classification report
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')
print(classification_report(y_test, predictions))


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 61ms/step
Accuracy: 86.74%
              precision    recall  f1-score   support

           0       0.87      0.86      0.87     12500
           1       0.86      0.88      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



In [30]:
model.save('/mnt/data/imdb_sentiment_model.keras')
print('Model saved successfully in .keras format!')



Model saved successfully in .keras format!


In [31]:
def predict_sentiment(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_seq = pad_sequences(sequence, maxlen=200)
    prediction = model.predict(padded_seq)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    return sentiment

print(predict_sentiment("This movie was absolutely fantastic!"))
print(predict_sentiment("Worst movie ever, totally hated it."))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 560ms/step
Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 396ms/step
Negative
