In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# 1 - Basic Data Preprocessing

In [2]:
train_data = "../data/reddit_train.csv"
train = pd.read_csv(train_data)
train.head()

Unnamed: 0,id,comments,subreddits
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey
1,1,Ah yes way could have been :( remember when he...,nba
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends
3,3,He wouldn't have been a bad signing if we woul...,soccer
4,4,Easy. You use the piss and dry technique. Let ...,funny


In [3]:
# First time running do this
# nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

def lemmatize(comment):
    comment = comment.split()
    comment = [stemmer.lemmatize(word) for word in comment]
    comment = ' '.join(comment)
    return comment


def preprocess(df):

    #-------------------------------------------------------------
    # Text preprocessing for the 'comments' column
    #-------------------------------------------------------------
    # Remove URLs
    df['comments'] = df['comments'].str.replace('http\S+|www.\S+', '', case=False)
    # Lowercase
    df['comments'] = df['comments'].apply(lambda x: " ".join(x.lower() for x in x.split()))
    # Remove all the special characters
    df['comments'] = df['comments'].apply(lambda x: re.sub(r'\W', ' ', x))
    # Remove all single characters
    df['comments'] = df['comments'].apply(lambda x: re.sub(r'\s+[a-zA-Z]\s+', ' ', x))
    # Removing punctuation
    df['comments'] = df['comments'].str.replace('[^\w\s]','')
    # Remove single characters from the start
    df['comments'] = df['comments'].apply(lambda x: re.sub(r'\^[a-zA-Z]\s+', ' ', x))
    # Substituting multiple spaces with single space
    df['comments'] = df['comments'].apply(lambda x: re.sub(r'\s+', ' ', x, flags=re.I))
    # Removing stop words
    stop = stopwords.words('english')
    train['comments'] = train['comments'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    # Lemmatization
    df['comments'] = df['comments'].apply(lemmatize)
    
    # Remove most frequent and least frequent words
    freq           = pd.Series(' '.join(df['comments']).split()).value_counts()
    words_del      = list(freq[:100].index)
    df['comments'] = df['comments'].apply(lambda x: " ".join(x for x in x.split() if x not in words_del))
    
    print(freq)
    
    words_del      = list(freq[-100:].index)
    df['comments'] = df['comments'].apply(lambda x: " ".join(x for x in x.split() if x not in words_del))
    
    #-------------------------------------------------------------
    # Create a numerical class out of each possible subreddit
    #-------------------------------------------------------------
    df.subreddits = pd.Categorical(df.subreddits)
    df['y'] = df.subreddits.cat.codes
    
    return df


train = preprocess(train)
train.head()

like             14111
one              10341
would             9692
get               9617
people            9436
                 ...  
idealogically        1
winratio             1
flocked              1
686                  1
threatplates         1
Length: 60550, dtype: int64


Unnamed: 0,id,comments,subreddits,y
0,0,honestly buffalo correct answer remember somew...,hockey,11
1,1,ah yes remember drafted gonna nope kawhi thomp...,nba,14
2,2,already ordinary eye constant eye contact,leagueoflegends,12
3,3,signing paid 18m euro price acceptable,soccer,16
4,4,easy piss dry technique let drop let dry rinse...,funny,9


# 2 - Model Pipeline

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    max_features=3000, 
    min_df=5, max_df=0.7, 
    stop_words=stopwords.words('english'), 
    binary=True
)

X = vectorizer.fit_transform(list(train['comments'])).toarray()
y = train['y'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

print(X_train.shape)
print(X_test.shape)

(59500, 3000)
(10500, 3000)


In [5]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


from sklearn.pipeline import Pipeline
text_clf = Pipeline([
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

text_clf.fit(X_train, y_train) 

np.mean(y_test==text_clf.predict(X_test))

MemoryError: Unable to allocate array with shape (59500, 3000) and data type float64