# Importing Necessary Libraries

In [1]:
# data processing and Data manipulation
import numpy as np # linear algebra
import pandas as pd # data processing
import re
import sklearn
from sklearn.model_selection import train_test_split
    
# Libraries and packages for NLP
import nltk
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import sys
import warnings
# Suppress every warning for the rest of the session, unless warning options were
# supplied explicitly (sys.warnoptions is then non-empty and those settings are kept).
# NOTE(review): this also hides warnings raised by the models trained below.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    

# Data preprocessing

In [2]:
# Step 1: Data Preprocessing
def clean_text(text):
    """Normalise a raw review string.

    Every character that is neither an ASCII letter nor whitespace is deleted
    (not replaced by a space), and the remainder is lower-cased.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, lower-cased string; whitespace is left untouched.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text).lower()

def tokenize_text(text):
    """Split text into tokens on runs of whitespace.

    Args:
        text: The string to tokenise.

    Returns:
        A list of the whitespace-separated pieces (empty list for blank input).
    """
    return text.split()

def load_data(data_dir, subset):
    """Read, clean and tokenise every review file of one dataset subset.

    Files are read from ``<data_dir>/<subset>/pos`` and
    ``<data_dir>/<subset>/neg`` as UTF-8 text, passed through ``clean_text``
    and then ``tokenize_text``.

    Args:
        data_dir: Root directory of the dataset.
        subset: Name of the sub-folder to read (the notebook uses 'train' and 'test').

    Returns:
        A ``(reviews, labels)`` tuple of parallel lists: ``reviews`` holds one
        token list per file and ``labels`` holds 1 for files under ``pos`` and
        0 for files under ``neg``. All ``pos`` entries come before all ``neg``
        entries; within each class the files are taken in sorted filename order.

    Raises:
        OSError: If a class folder or a review file cannot be read.
    """
    reviews = []
    labels = []
    for label in ['pos', 'neg']:
        label_dir = os.path.join(data_dir, subset, label)
        label_value = 1 if label == 'pos' else 0
        # os.listdir returns entries in arbitrary order; sorting makes the
        # review order reproducible across runs and platforms.
        for filename in sorted(os.listdir(label_dir)):
            with open(os.path.join(label_dir, filename), 'r', encoding='utf-8') as file:
                review = file.read()
            # The file is already closed here; only the text is processed further.
            reviews.append(tokenize_text(clean_text(review)))
            labels.append(label_value)
    return reviews, labels

# Root folder of the IMDB review dataset; load_data reads <root>/<subset>/pos and <root>/<subset>/neg beneath it.
data_dir = r'F:\Datasets\Sentimental data\IMDB_reviews'  # Update this to your dataset directory
# Tokenised reviews plus 0/1 labels (pos first, then neg) for each subset.
train_reviews, train_labels = load_data(data_dir, 'train')  
test_reviews, test_labels = load_data(data_dir, 'test')


# BoW feature extraction 

In [3]:
from scipy.sparse import lil_matrix

# Load BoW features and convert to a sparse matrix
def load_sparse_feat_file(file_path, num_features):
    """Load a bag-of-words ``.feat`` file into a sparse count matrix.

    Each line of the file is ``<label> <index>:<count> <index>:<count> ...``.
    The leading label field is skipped; every ``index:count`` pair sets one
    cell of the row that corresponds to the line.

    Args:
        file_path: Path of the ``.feat`` file (read as UTF-8).
        num_features: Number of columns of the resulting matrix.

    Returns:
        A ``scipy.sparse.lil_matrix`` of shape ``(number_of_lines, num_features)``
        with integer dtype. Lines without any pair produce an all-zero row.

    Raises:
        ValueError: If a pair is not of the form ``int:int``.
        IndexError: If an index is outside ``[0, num_features)``.
    """
    # First pass: count the lines to size the matrix. The file is opened in a
    # context manager (so the handle is closed) and with the same encoding as
    # the parsing pass below.
    with open(file_path, 'r', encoding='utf-8') as feat_file:
        num_samples = sum(1 for _ in feat_file)

    X = lil_matrix((num_samples, num_features), dtype=int)

    # Second pass: fill in the counts row by row.
    with open(file_path, 'r', encoding='utf-8') as feat_file:
        for row_idx, line in enumerate(feat_file):
            # split() without an argument tolerates repeated or trailing
            # whitespace, which split(' ') would turn into empty tokens.
            features = line.split()
            for feature in features[1:]:  # Skip the label
                index, count = feature.split(':')
                X[row_idx, int(index)] = int(count)

    return X

# Load the vocabulary file (one entry per line); its length is used as the
# number of columns of the BoW matrices below.
vocab = []
with open('F:/Datasets/Sentimental data/IMDB_reviews/imdb.vocab', 'r', encoding='utf-8') as vocab_file:
    vocab = vocab_file.read().splitlines()
num_features = len(vocab)

# Load the BoW features of the labeled train and test sets as sparse matrices
# (the label field of each line is skipped by load_sparse_feat_file).
X_train_bow = load_sparse_feat_file('F:/Datasets/Sentimental data/IMDB_reviews/train/labeledBow.feat', num_features)
X_test_bow = load_sparse_feat_file('F:/Datasets/Sentimental data/IMDB_reviews/test/labeledBow.feat', num_features)


In [4]:
# Labels come from load_data (1 = pos, 0 = neg; all pos entries first, then all neg).
# NOTE(review): this assumes the rows of labeledBow.feat follow the same pos-then-neg
# ordering - confirm, or read the labels from the .feat files themselves.
y_train = train_labels  # labels for X_train_bow
y_test = test_labels    # labels for X_test_bow

In [5]:
# Fit a logistic-regression baseline on the sparse BoW training matrix
# (fit returns the estimator itself, so construction and training are chained)
classifier = LogisticRegression(max_iter=1000).fit(X_train_bow, y_train)

# Prediction on the held-out test matrix
y_pred = classifier.predict(X_test_bow)

# Evaluation: overall accuracy plus per-class precision / recall / F1
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    target_names=['Negative', 'Positive'],
)

print("Accuracy:", accuracy)
print(report)

Accuracy: 0.86772
              precision    recall  f1-score   support

    Negative       0.86      0.87      0.87     12500
    Positive       0.87      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



In [6]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


# Random Forest Classifier with default hyper-parameters.
# No random_state is passed, so the score can differ slightly between runs.
rf_classifier = RandomForestClassifier().fit(X_train_bow, y_train)
rf_predictions = rf_classifier.predict(X_test_bow)

# Evaluation (y_train / y_test are the same objects as train_labels / test_labels)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)

report = classification_report(
    y_test,
    rf_predictions,
    target_names=['Negative', 'Positive'],
)
print(report)

Random Forest Accuracy: 0.84292
              precision    recall  f1-score   support

    Negative       0.84      0.84      0.84     12500
    Positive       0.84      0.84      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



In [7]:

# Gradient Boosting Classifier with default hyper-parameters.
# No random_state is passed, so the score can differ slightly between runs.
gb_classifier = GradientBoostingClassifier().fit(X_train_bow, y_train)
gb_predictions = gb_classifier.predict(X_test_bow)

# Evaluation (y_train / y_test are the same objects as train_labels / test_labels)
gb_accuracy = accuracy_score(y_test, gb_predictions)
report = classification_report(
    y_test,
    gb_predictions,
    target_names=['Negative', 'Positive'],
)
print(report)
print("Gradient Boosting Accuracy:", gb_accuracy)

              precision    recall  f1-score   support

    Negative       0.85      0.76      0.80     12500
    Positive       0.79      0.86      0.82     12500

    accuracy                           0.81     25000
   macro avg       0.82      0.81      0.81     25000
weighted avg       0.82      0.81      0.81     25000

Gradient Boosting Accuracy: 0.8132


In [None]:
import os
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def load_unsup_feat_file(file_path, vocab_size):
    """Load a ``.feat`` file into a dense list of count vectors.

    Each line is split on single spaces; the first field is skipped as the
    label and every following ``index:count`` field sets one position of the
    row.

    Args:
        file_path: Path of the ``.feat`` file (read as UTF-8).
        vocab_size: Length of every returned row.

    Returns:
        A list with one ``vocab_size``-long list of ints per line of the file.
        Memory use grows with ``number_of_lines * vocab_size`` because the
        rows are dense Python lists.
    """
    rows = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            dense_row = [0] * vocab_size
            fields = raw_line.strip().split(' ')
            # fields[0] is the label and is skipped
            for pair in fields[1:]:
                position, value = pair.split(':')
                dense_row[int(position)] = int(value)
            rows.append(dense_row)
    return rows

# np.vstack cannot stack scipy sparse matrices, so the sparse-aware vstack is used.
from scipy.sparse import vstack as sparse_vstack

# Load all three feature files through load_sparse_feat_file (defined in the
# BoW feature extraction cell). The dense list-of-lists loader would have to
# hold number_of_reviews * len(vocab) Python ints in memory.
X_unsup_bow = load_sparse_feat_file(r'F:\Datasets\Sentimental data\IMDB_reviews\train\unsupBow.feat', len(vocab))

# Load labeled BoW features and labels for train and test data (same loader and
# files as in the earlier cells).
X_train_bow = load_sparse_feat_file(r'F:/Datasets/Sentimental data/IMDB_reviews/train/labeledBow.feat', len(vocab))
X_test_bow = load_sparse_feat_file(r'F:/Datasets/Sentimental data/IMDB_reviews/test/labeledBow.feat', len(vocab))
y_train = train_labels
y_test = test_labels

# Combine labeled and unsupervised BoW features (rows: labeled first, then unlabeled).
X_train_combined = sparse_vstack([X_train_bow, X_unsup_bow], format='csr')
# NOTE(review): every unlabeled review is assigned class 0 (negative), exactly as in
# the original experiment. This biases the classifier towards the negative class;
# confirm this is intended or switch to a proper semi-supervised scheme.
# shape[0] is used because len() is not defined for sparse matrices.
y_train_combined = np.concatenate((y_train, np.zeros(X_unsup_bow.shape[0])))

# Train SVM classifier on combined data
classifier = SVC()
classifier.fit(X_train_combined, y_train_combined)

# Predict and evaluate on test data
y_pred = classifier.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
