In [5]:
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from scipy.special import expit

In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Load and preprocess the data
def load_and_preprocess_data(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                data.append((preprocess_text(parts[1]), parts[0]))
    return data

def convert_labels_to_numeric(labels):
    label_mapping = {'Jane Austen': 0, 'Arthur Conan Doyle': 1, 'Fyodor Dostoyevsky': 2}
    return np.array([label_mapping[label] for label in labels])

In [7]:
class LogisticRegression:
    def __init__(self, lr=0.01, num_epochs=100, batch_size=32):
        self.lr = lr
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        return expit(z)

    def fit_transform(self, X, y):
        vectorizer = TfidfVectorizer()
        X_tfidf = vectorizer.fit_transform(X)
        X_tfidf = csr_matrix(X_tfidf)

        # Initialize weights and bias
        self.weights = np.zeros(X_tfidf.shape[1])
        self.bias = 0

        num_batches = X_tfidf.shape[0] // self.batch_size
        for epoch in range(self.num_epochs):
            for batch, i in enumerate(range(0, X_tfidf.shape[0], self.batch_size)):
                X_batch = X_tfidf[i:i + self.batch_size]
                y_batch = y[i:i + self.batch_size]

                # Forward pass
                z = X_batch.dot(self.weights) + self.bias
                A = self.sigmoid(z)

                # Compute gradients
                dz = A - y_batch
                dw = X_batch.T.dot(dz) / len(y_batch)
                db = np.mean(dz)

                # Update parameters
                self.weights -= self.lr * dw
                self.bias -= self.lr * db

            # Print cost every 10 epochs
            if epoch % 10 == 0:
                z = X_tfidf.dot(self.weights) + self.bias
                A = self.sigmoid(z)
                cost = -np.mean(y * np.log(A) + (1 - y) * np.log(1 - A))
                print(f'Epoch {epoch}: Cost {cost}')

        return X_tfidf, self.weights, vectorizer



    def predict(self, X):
        z = X.dot(self.weights) + self.bias
        A = self.sigmoid(z)
        return (A > 0.5).astype(int)


In [8]:
# Load and preprocess the data
file_path = "/Users/naveenverma/Desktop/NewStart/Dataset/a1-data/books.txt"
data = load_and_preprocess_data(file_path)

# Split data into features (X) and labels (y)
X = np.array([entry[0] for entry in data])
y = np.array([entry[1] for entry in data])

# Convert labels to numeric values
y_numeric = convert_labels_to_numeric(y)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_numeric, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression()
X_train_transformed, weights, vectorizer = model.fit_transform(X_train, y_train)

# Make predictions on the test set
X_test_transformed = vectorizer.transform(X_test)
y_pred = model.predict(X_test_transformed)

# Calculate accuracy
accuracy = np.mean(y_pred == y_test)
print("Accuracy:", accuracy)

Epoch 0: Cost 0.5721263146352233
Epoch 10: Cost 0.4275728093598143
Epoch 20: Cost 0.31533983403155486
Epoch 30: Cost 0.21849129052007152
Epoch 40: Cost 0.13270590994789422
Epoch 50: Cost 0.05497435879848055
Epoch 60: Cost -0.01675101594448709
Epoch 70: Cost -0.08387724203280193
Epoch 80: Cost -0.1473934031849272
Epoch 90: Cost -0.20801324556459969
Accuracy: 0.3172978505629478
