In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# 1 - Basic Data Preprocessing

In [2]:
train_data = "../data/reddit_train.csv"
train = pd.read_csv(train_data)
train.head()

Unnamed: 0,id,comments,subreddits
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey
1,1,Ah yes way could have been :( remember when he...,nba
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends
3,3,He wouldn't have been a bad signing if we woul...,soccer
4,4,Easy. You use the piss and dry technique. Let ...,funny


In [3]:
# First time running do this
# nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

def lemmatize(comment):
    comment = comment.split()
    comment = [stemmer.lemmatize(word) for word in comment]
    comment = ' '.join(comment)
    return comment


def preprocess(df):

    #-------------------------------------------------------------
    # Text preprocessing for the 'comments' column
    #-------------------------------------------------------------
    # Lowercase
    df['comments'] = df['comments'].apply(lambda x: " ".join(x.lower() for x in x.split()))
    # Remove all the special characters
    df['comments'] = df['comments'].apply(lambda x: re.sub(r'\W', ' ', x))
    # Remove all single characters
    df['comments'] = df['comments'].apply(lambda x: re.sub(r'\s+[a-zA-Z]\s+', ' ', x))
    # Remove single characters from the start
    df['comments'] = df['comments'].apply(lambda x: re.sub(r'\^[a-zA-Z]\s+', ' ', x))
    # Substituting multiple spaces with single space
    df['comments'] = df['comments'].apply(lambda x: re.sub(r'\s+', ' ', x, flags=re.I))
    # Lemmatization
    df['comments'] = df['comments'].apply(lemmatize)
    
    #-------------------------------------------------------------
    # Create a numerical class out of each possible subreddit
    #-------------------------------------------------------------
    df.subreddits = pd.Categorical(df.subreddits)
    df['y'] = df.subreddits.cat.codes
    
    return df


train = preprocess(train)
train.head()

Unnamed: 0,id,comments,subreddits,y
0,0,honestly buffalo is the correct answer remembe...,hockey,11
1,1,ah yes way could have been remember when he wa...,nba,14
2,2,http youtu be 6xxbbr8isz0 40m49s if you didn f...,leagueoflegends,12
3,3,he wouldn have been bad signing if we wouldn h...,soccer,16
4,4,easy you use the piss and dry technique let fe...,funny,9


# 2 - Converting Text to Numbers

## 2.1 Input X

**max_features**
We set the max_features parameter to 1000, which means that we want to use 1000 most occurring words as features for training our classifier.

**min_df** 
This corresponds to the minimum number of documents that should contain this feature. So we only include those words that occur in at least 5 documents. 

**max_df** 
Here 0.7 means that we should include only those words that occur in a maximum of 70% of all the documents. Words that occur in almost every document are usually not suitable for classification because they do not provide any unique information about the document.

The fit_transform function of the CountVectorizer class converts text documents into corresponding numeric features.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    max_features=2000, 
    min_df=5, max_df=0.7, 
    stop_words=stopwords.words('english'), 
    binary=True)

X_train = vectorizer.fit_transform(list(train['comments'])).toarray()

In [5]:
X_train.shape

(70000, 2000)

## 2.2 Output y

In [6]:
y_train = train['y'].to_numpy()
y_train.shape

(70000,)

# 3 - Train/Test Split

Here we split the training data because 80/20 because the test file does not contain the true categories.

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = X_train
y = y_train

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 3 - Bernouilli Naive Bayes (Sklearn)

In [8]:
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True)
clf.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

### Sklearn Naive Bayes performance on test set

In [9]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = clf.predict(X_test)

print(accuracy_score(y_test, y_pred))

0.3668571428571429


# Home made Bernouilli Naive Bayes

In [10]:
class MultiClassBernouilliNB():
    
    def __init__(self, alpha): 
        self.alpha = alpha
        self.X     = None
        self.y     = None
        
        
    def _compute_marginal_probabilities(self):
        """ 
        Marginal probabilities for each class
        """
        self.marginals = np.empty(len(self.classes))
        for k in self.classes:
            Y_k = self.data[self.data[:,-1] == k]
            self.marginals[k] = float(Y_k.shape[0])/float(self.data.shape[0])
        
        
    def _compute_priors(self):
        """
        Compute the priors matrix P(Xj|Yk)
        """
        self.priors = np.empty((self.n_features,len(self.classes),))
        
        for k in self.classes:
            Y_k = self.data[self.data[:,-1] == k]
            
            for j in range(self.n_features):    
                # number of times xj=1 and y=k
                Xj1_Yk   = Y_k[Y_k[:,j] == 1.0]
                
                # Add Laplace smoothing (parameter self.alpha)
                numerator   = self.alpha + Xj1_Yk.shape[0]
                denominator = self.alpha*len(self.classes) + Y_k.shape[0]
                theta_jk    = float(numerator)/float(denominator)
                
                self.priors[j,k] = theta_jk
        
    
    def fit(self, X, y):
        """
        For each class in y, compute the marginal probabilities P(Y=k)
        For each feature Xj, compute the contional pronability P(Xj|Y)
        """
        # Train set X and y
        self.X = X
        self.y = y
        self.data = np.c_[self.X, self.y]
        
        # number of classes to predict
        self.classes = set(self.y)
        # number of features 
        self.n_features = X.shape[1]
        
        # compute the marginal P(Y=k) and prior probabilities P(Xj|Y)
        self._compute_marginal_probabilities()
        self._compute_priors()
        
        # log both the marginal and prior probabilities for numerical stability
        self.log_marginals   = np.log(self.marginals)
        self.log_priors      = np.log(self.priors)
        self.log_1minusprior = np.log(1.0 - self.priors)
        
        
    def predict(self, X_test):
        """
        Apply equation from lecture 10, slide 6 (matrix version),
        to pick the most likely class
        """
        self.X_test = X_test
        
        # Predictions proportional to likelihood for class 0 and class 1
        self.likelihoods = np.empty((X_test.shape[0],len(self.classes),))
        self.pred_class  = np.empty((X_test.shape[0],1,))
        
        for i in range(self.X_test.shape[0]):
            # individual x to classify
            x = X_test[i,:]
            
            for k in self.classes:
                pred_yk  = self.log_marginals[k]
                pred_yk += np.dot(x, self.log_priors[:,k])
                pred_yk += np.dot((1.0-x), self.log_1minusprior[:,k])
                
                self.likelihoods[i,k] = pred_yk
                
            # predicted class corresponds to the maximum value of pred_yk for this instance
            self.pred_class[i,0] = np.where(self.likelihoods[i,:] == np.amax(self.likelihoods[i,:]))[0][0]
        return self.pred_class
        
        

In [11]:
clf = MultiClassBernouilliNB(alpha=1.0)
clf.fit(X_train, y_train)

In [12]:
y_pred = clf.predict(X_test)

print(accuracy_score(y_test, y_pred))

0.36742857142857144
