## DS5601 Assignment 1: Text Classification
__Name__ : Vaibhav Nagrale \\
__RollNo__ : 112001046

In [22]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import log, dot, exp, shape
import copy
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

### Data Preprocessing

In [23]:
data = pd.read_csv('/content/drive/MyDrive/insta/data/cleaned_data_150k.csv')

def remove_tags(string):
    result = re.sub('','',string)          #remove HTML tags
    result = re.sub('https://.*','',result)   #remove URLs
    result = re.sub(r'[^a-zA-Z0-9\s]', ' ',result)    #remove non-alphanumeric characters
    result = result.lower()
    return result

data['text']=data['text'].apply(lambda cw : remove_tags(cw))

nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st
data['text'] = data.text.apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Split

In [24]:
reviews = data['text']
labels = data['target']

# The dataset is then split into 80% train and 20% test parts using train_test_split from sklearn.model_selection.
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)

vec = CountVectorizer()
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

### (a) Naive Bayes approach

In [25]:
class MyNaiveBayes:
    def train(self, X_train, y_train):
        self.unique_classes = np.unique(y_train)
        self.class_counts = dict(zip(*np.unique(y_train, return_counts=True)))
        self.vocab_size = X_train.shape[1]
        self.word_counts_by_class = {label: np.zeros(self.vocab_size) for label in self.unique_classes}
        self.total_words_by_class = {label: 0 for label in self.unique_classes}

        for i in range(X_train.shape[0]):
            class_label = y_train.iloc[i]
            self.word_counts_by_class[class_label] += X_train[i].toarray()[0]
            self.total_words_by_class[class_label] += X_train[i].sum()

    def predict(self, X_test):
        predictions = []
        for i in range(X_test.shape[0]):
            class_probs = {}
            for class_label in self.unique_classes:
                prior_prob = np.log(self.class_counts[class_label] / sum(self.class_counts.values()))
                likelihood = np.log((self.word_counts_by_class[class_label] + 1) /
                                    (self.total_words_by_class[class_label] + self.vocab_size)).dot(X_test[i].toarray()[0])
                class_probs[class_label] = prior_prob + likelihood
            predictions.append(max(class_probs, key=class_probs.get))
        return predictions

# Using the MyNaiveBayes class:
nb_model = MyNaiveBayes()
nb_model.train(X_train_vec, y_train)

pred = nb_model.predict(X_test_vec)

In [31]:
# Calculate accuracy and print classification report
print("My Naive Bayes")
accuracy = accuracy_score(y_test, pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

report = classification_report(y_test, pred)
print("Classification Report:")
print(report)

# Sklearn model accuracy and classification report
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_vec, y_train)
naive_bayes_pred = naive_bayes_classifier.predict(X_test_vec)
# Calculate accuracy and print classification report
print("Sklearn Naive Bayes")
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)
print("Accuracy: {:.2f}%".format(naive_bayes_accuracy * 100))

naive_bayes_report = classification_report(y_test, naive_bayes_pred)
print("Classification Report:")
print(naive_bayes_report)

My Naive Bayes
Accuracy: 84.75%
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.91      0.84      9896
           1       0.89      0.76      0.82      9949
           2       0.89      0.87      0.88     10155

    accuracy                           0.85     30000
   macro avg       0.85      0.85      0.85     30000
weighted avg       0.85      0.85      0.85     30000

Sklearn Naive Bayes
Accuracy: 82.48%
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.84      0.86      9896
           1       0.78      0.77      0.78      9949
           2       0.82      0.86      0.84     10155

    accuracy                           0.82     30000
   macro avg       0.83      0.82      0.82     30000
weighted avg       0.83      0.82      0.82     30000



### Naive Bayes observation
My NB and Sklearn NB both have almost same accurracy \\
My NB takes time to learn but Sklearn NB is fast to learn.

### (b) Logistic regression-based approach

In [30]:
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

class MyLogisticRegression:
    def __init__(self, num_classes, lr=0.01, num_iter=5000, lambda_reg=0.01):
        self.num_classes = num_classes
        self.lr = lr
        self.num_iter = num_iter
        self.lambda_reg = lambda_reg
        self.weights = None

    @staticmethod
    def softmax(scores):
        e_scores = np.exp(scores - np.max(scores))
        return e_scores / e_scores.sum(axis=1, keepdims=True)

    @staticmethod
    def cross_entropy(pred, targets):
        return -np.sum(targets * np.log(pred + 1e-5)) / pred.shape[0]

    def train(self, X, y):
        m, n = X.shape
        self.weights = np.zeros((n, self.num_classes))
        targ = np.eye(self.num_classes)[y]

        for _ in range(self.num_iter):
            scores = X.dot(self.weights)
            predictions = self.softmax(scores)
            err = predictions - targ
            grad = (X.T.dot(err) + self.lambda_reg * self.weights) / m
            self.weights -= self.lr * grad
            if _ % 500 == 0:
                print(f"{_}/{self.num_iter}: Loss {self.cross_entropy(predictions, targ)}")

    def predict(self, X):
        scores = X.dot(self.weights)
        pred = self.softmax(scores)
        return np.argmax(pred, axis=1)

if __name__ == "__main__":

    tfidf_vec = TfidfVectorizer(max_features=5000)
    X_train_transform = tfidf_vec.fit_transform(X_train)
    X_test_transform = tfidf_vec.transform(X_test)

    X_train_intercept = hstack((csr_matrix(np.ones((X_train_transform.shape[0], 1)), dtype=float), X_train_transform))
    X_test_intercept = hstack((csr_matrix(np.ones((X_test_transform.shape[0], 1)), dtype=float), X_test_transform))

    # Using the MyLogisticRegression class:
    num_classes = len(np.unique(y_train))
    model = MyLogisticRegression(num_classes=num_classes, lr=0.01, num_iter=5000)
    model.train(X_train_intercept, y_train)
    pred = model.predict(X_test_intercept)

    # My model accuracy and classification report
    print("My Logistic Regression")
    accuracy = accuracy_score(y_test, pred)
    print("Accuracy: {:.2f}%".format(accuracy * 100))

    report = classification_report(y_test, pred)
    print("Classification Report:")
    print(report)

    # Sklearn model accuracy and classification report
    sk_lr = LogisticRegression(max_iter=1000)
    sk_lr.fit(X_train_transform, y_train)
    sklearn_pred = sk_lr.predict(X_test_transform)

    # Calculate accuracy and print classification report
    print("Sklearn Logistic Regression")
    lr_accurracy = accuracy_score(y_test, sklearn_pred)
    print("Accuracy: {:.2f}%".format(lr_accurracy * 100))

    lr_report = classification_report(y_test, sklearn_pred)
    print("Classification Report:")
    print(lr_report)

0/5000: Loss 1.0985822891181014
500/5000: Loss 1.0905094545940157
1000/5000: Loss 1.0825767129176151
1500/5000: Loss 1.07477718586217
2000/5000: Loss 1.0671085536985672
2500/5000: Loss 1.0595686377652354
3000/5000: Loss 1.0521552562703556
3500/5000: Loss 1.044866222706988
4000/5000: Loss 1.0376993491728757
4500/5000: Loss 1.0306524497570049
My Logistic Regression
Accuracy: 84.75%
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.91      0.84      9896
           1       0.89      0.76      0.82      9949
           2       0.89      0.87      0.88     10155

    accuracy                           0.85     30000
   macro avg       0.85      0.85      0.85     30000
weighted avg       0.85      0.85      0.85     30000

Sklearn Logistic Regression
Accuracy: 93.52%
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      9896
           1       0.93      0.91   

### Logistic Regression observation
My LR and Sklearn LR both have almost same accurracy \\
My LR takes time to learn but Sklearn LR is fast to learn.