In [1]:
import os

def load_data_from_folder(directory):
    """Load data and labels from given directory."""
    data = []
    labels = []

    # Positives
    pos_dir = os.path.join(directory, 'pos')
    for filename in os.listdir(pos_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(pos_dir, filename), 'r', encoding='utf-8') as file:
                data.append(file.read())
                labels.append(1)  # Positive label

    # Negatives
    neg_dir = os.path.join(directory, 'neg')
    for filename in os.listdir(neg_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(neg_dir, filename), 'r', encoding='utf-8') as file:
                data.append(file.read())
                labels.append(0)  # Negative label

    return data, labels

# Load training and testing data
train_data, train_labels = load_data_from_folder('/home/rgukt/Downloads/aclImdb/train')
test_data, test_labels = load_data_from_folder('/home/rgukt/Downloads/aclImdb/test')


In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load stopwords list only once
stop_words = set(stopwords.words('english'))

def preprocess_text(text, stop_words):
    # Lowercasing and non-alphabet removal
    text = re.sub("[^a-zA-Z]", " ", text).lower()
    words = text.split()
    
    # Stopwords and lemmatization
    lemmatizer = WordNetLemmatizer()
    cleaned_text = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return " ".join(cleaned_text)

# Apply preprocessing to text data
train_data = [preprocess_text(doc, stop_words) for doc in train_data]
test_data = [preprocess_text(doc, stop_words) for doc in test_data]


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(train_data)
X_test = vectorizer.transform(test_data)

y_train = train_labels
y_test = test_labels


In [4]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [5]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.85712
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.86     12500
           1       0.85      0.87      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



In [7]:
def load_unsupervised_data(directory):
    data = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                data.append(file.read())
    return data

unsupervised_data = load_unsupervised_data('/home/rgukt/Downloads/aclImdb/train/unsup')
unsupervised_data = [preprocess_text(text,stop_words) for text in unsupervised_data]
unsupervised_features = vectorizer.transform(unsupervised_data)
unsupervised_predictions = model.predict(unsupervised_features)


In [8]:
# Assuming model.predict_proba() returns probabilities for positive class

# Set threshold for classification
threshold = 0.5

# Make predictions on unsupervised data
unsupervised_probabilities = model.predict_proba(unsupervised_features)

# Classify reviews into "positive", "negative", and "neutral"
classified_reviews = []
for prob in unsupervised_probabilities:
    if prob[1] > threshold:
        classified_reviews.append("positive")
    elif prob[1] < 1 - threshold:
        classified_reviews.append("negative")
    else:
        classified_reviews.append("neutral")

# Print the number of reviews in each category
print("Positive Reviews:", classified_reviews.count("positive"))
print("Negative Reviews:", classified_reviews.count("negative"))
print("Neutral Reviews:", classified_reviews.count("neutral"))


Positive Reviews: 25304
Negative Reviews: 24696
Neutral Reviews: 0


In [12]:
def classify_review(review, stop_words):
    # Preprocess the review
    preprocessed_review = preprocess_text(review, stop_words)
    
    # Transform the preprocessed review into features
    review_features = vectorizer.transform([preprocessed_review])
    
    # Predict the probability of the review being positive
    probability_positive = model.predict_proba(review_features)[0][1]
    
    # Set threshold for classification
    threshold = 0.5
    
    # Classify the review
    if probability_positive > threshold:
        return "positive"
    elif probability_positive < 1 - threshold:
        return "negative"
    else:
        return "neutral"

# Test inputted reviews
reviews = [
    "This movie was amazing! I loved every minute of it.",
    "Terrible movie. Waste of time and money.",
    "The movie was okay. Not great, but not terrible either."
]

for review in reviews:
    classification = classify_review(review, stop_words)
    print(f"Review: {review}\nClassification: {classification}\n")


Review: This movie was amazing! I loved every minute of it.
Classification: positive

Review: Terrible movie. Waste of time and money.
Classification: negative

Review: The movie was okay. Not great, but not terrible either.
Classification: negative

