## Load Data

In [1]:
# Load libraries
import jsonlines
import pandas as pd

In [2]:
%%time
# Read the JSON Lines file into a list of dictionaries.
# type=dict / skip_invalid=True are passed straight to jsonlines so that
# lines it considers invalid are skipped instead of raising.
with jsonlines.open('categorized-comments.jsonl') as reader:
    data = list(reader.iter(type=dict, skip_invalid=True))

Wall time: 2.94 s


In [3]:
# Convert data to DataFrame
cat_comments_df = pd.DataFrame(data)
cat_comments_df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [4]:
# List the distinct values of the 'cat' column, one per line
categories = cat_comments_df.cat.unique()
# A single print call: the separator starts each category on its own " - " line
print("The categories are:", *categories, sep="\n - ")

The categories are:
 - sports
 - science_and_technology
 - video_games


## Preprocess Text

In [5]:
df = cat_comments_df.copy()

In [6]:
# Load libraries
import re
import sys
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

# from sklearn.feature_extraction.text import CountVectorizer

In [7]:
df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [8]:
%%time
# Convert to lowercase
df.txt = df['txt'].str.lower()

Wall time: 368 ms


In [9]:
df.head()

Unnamed: 0,cat,txt
0,sports,barely better than gabbert? he was significant...
1,sports,fuck the ducks and the angels! but welcome to ...
2,sports,should have drafted more wrs.\n\n- matt millen...
3,sports,[done](https://i.imgur.com/2yz90pm.jpg)
4,sports,no!! noo!!!!!


In [10]:
%%time
# Remove URLs
df.txt = df.txt.apply(lambda text: re.sub(r'http\S+', '', text))

Wall time: 736 ms


In [11]:
df.head()

Unnamed: 0,cat,txt
0,sports,barely better than gabbert? he was significant...
1,sports,fuck the ducks and the angels! but welcome to ...
2,sports,should have drafted more wrs.\n\n- matt millen...
3,sports,[done](
4,sports,no!! noo!!!!!


In [12]:
%%time
# Remove punctuation from strings in text
# Translation table for str.translate(): every code point whose Unicode
# general category starts with 'P' (punctuation) is mapped to None, which
# makes translate() delete it.
# The range ends at sys.maxunicode + 1 so that the final code point is
# examined as well (range() excludes its end value).
punctuation = dict.fromkeys(
    code_point
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)

df.txt = [string.translate(punctuation) for string in df.txt]

Wall time: 1.83 s


In [13]:
df.head()

Unnamed: 0,cat,txt
0,sports,barely better than gabbert he was significantl...
1,sports,fuck the ducks and the angels but welcome to a...
2,sports,should have drafted more wrs\n\n matt millen p...
3,sports,done
4,sports,no noo


In [14]:
%%time
# Remove newlines
df.txt = [string.replace("\n", " ") for string in df.txt]

Wall time: 279 ms


In [15]:
df.head()

Unnamed: 0,cat,txt
0,sports,barely better than gabbert he was significantl...
1,sports,fuck the ducks and the angels but welcome to a...
2,sports,should have drafted more wrs matt millen pro...
3,sports,done
4,sports,no noo


In [16]:
# English stopword list provided by NLTK's stopwords corpus
stop_words = stopwords.words('english')

# Counter is a dict subclass, so the `word not in stopwords_dict` check in
# removeStopwords is a hash lookup instead of a scan of the stop_words list.
# Only membership is tested in the code shown; the counts themselves are not read.
stopwords_dict = Counter(stop_words)

def removeStopwords(string):
    """Split `string` on whitespace and return the words not found in `stopwords_dict`."""
    kept_words = []
    for token in string.split():
        if token in stopwords_dict:
            continue
        kept_words.append(token)
    return kept_words

In [17]:
%%time
# Replace each comment string with its list of non-stopword tokens
df.txt = df.txt.apply(removeStopwords)

Wall time: 5.12 s


In [None]:
df.head()

In [18]:
%%time
# Apply PorterStemmer
from functools import lru_cache

porter = PorterStemmer()

# The same words occur over and over across comments, so memoise the stemmer:
# each distinct word is stemmed once and later occurrences reuse the cached stem.
# The resulting stems are the same as calling porter.stem on every token.
cached_stem = lru_cache(maxsize=None)(porter.stem)

df.txt = df.txt.apply(lambda words: [cached_stem(word) for word in words])

Wall time: 2min 51s


In [19]:
df.head()

Unnamed: 0,cat,txt
0,sports,"[bare, better, gabbert, significantli, better,..."
1,sports,"[fuck, duck, angel, welcom, new, niner, fan]"
2,sports,"[draft, wr, matt, millen, probabl]"
3,sports,[done]
4,sports,[noo]


In [20]:
# Set up function to remove stopwords from strings
# and return a list of the remaining words

In [21]:
# test1 = df.copy()
# test2 = df.copy()
# test3 = df.copy()

Method 1: Tokenize words

In [22]:
# ## MUCH FASTER AFTER REMOVING TOKENIZATION:
# from nltk.tokenize import word_tokenize
# def removeSW1(string):
#     return [word for word in string.split() if word not in stop_words]

In [23]:
# from nltk.tokenize import word_tokenize
# def removeSW1(string):
#     tokenized_words = word_tokenize(string)
#     return [word for word in tokenized_words if word not in stop_words]

In [24]:
# %%time
# test1.txt = test1.txt.apply(lambda string: removeSW1(string))

FASTEST - Method 2: Dictionary of Stopwords

In [25]:
# from collections import Counter
# stopwords_dict = Counter(stop_words)
# def removeSW2(string):
#     return [word for word in string.split() if word not in stopwords_dict]

In [26]:
# %%time
# test2.txt = test2.txt.apply(lambda string: removeSW2(string))

Method 3: Regex

In [27]:
# import re
# pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
# def removeSW3(string):
#     return (pattern.sub('', string)).split()

In [28]:
# %%time
# test3.txt = test3.txt.apply(lambda string: removeSW3(string))

In [29]:
# ## RETURNS A LIST WITH ONE STRING OF WORDS ##

# from collections import Counter

# stopwords_dict = Counter(stop_words)

# def removeSW2(string):
#     return [' '.join([word for word in string.split() if word not in stopwords_dict])]

In [30]:
# ## RETURNS A LIST WITH ONE STRING OF WORDS ##

# pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')

# def removeSW3(string):
#     return [pattern.sub('', string)]

# Nice 

## Load Data

In [1]:
# Load libraries
import jsonlines
import pandas as pd

In [2]:
%%time
# Read the JSON Lines file into a list of dictionaries.
# type=dict / skip_invalid=True are passed straight to jsonlines so that
# lines it considers invalid are skipped instead of raising.
with jsonlines.open('categorized-comments.jsonl') as reader:
    data = list(reader.iter(type=dict, skip_invalid=True))

Wall time: 2.57 s


In [3]:
# Convert data to DataFrame
cat_comments_df = pd.DataFrame(data)
cat_comments_df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [4]:
# List the distinct values of the 'cat' column, one per line
categories = cat_comments_df.cat.unique()
# A single print call: the separator starts each category on its own " - " line
print("The categories are:", *categories, sep="\n - ")

The categories are:
 - sports
 - science_and_technology
 - video_games


## Preprocess Text

In [5]:
# Load libraries
import re
import sys
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

# from sklearn.feature_extraction.text import CountVectorizer

In [6]:
df = cat_comments_df.copy()

In [7]:
df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [8]:
# Translation table for str.translate(): every code point whose Unicode
# general category starts with 'P' (punctuation) is mapped to None, which
# makes translate() delete it.
# The range ends at sys.maxunicode + 1 so that the final code point is
# examined as well (range() excludes its end value).
punctuation_dict = dict.fromkeys(
    code_point
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)

# English stopword list provided by NLTK's stopwords corpus
stop_words = stopwords.words('english')
# Counter is a dict subclass, so the `word not in stopwords_dict` check in
# cleanText is a hash lookup instead of a scan of the stop_words list.
# Only membership is tested in the code shown; the counts themselves are not read.
stopwords_dict = Counter(stop_words)

def cleanText(string):
    '''Normalise one raw comment and return its remaining words as a list.

    Steps, in order: lowercase the text, delete every run of non-whitespace
    characters that starts with "http", delete the characters mapped in
    `punctuation_dict`, turn newlines into spaces, split on whitespace and
    drop the words found in `stopwords_dict`.
    '''
    lowered = string.lower()
    without_urls = re.sub(r'http\S+', '', lowered)
    without_punct = without_urls.translate(punctuation_dict)

    # Newlines become plain spaces before the whitespace split
    flattened = without_punct.replace("\n", " ")

    tokens = flattened.split()
    return [token for token in tokens if token not in stopwords_dict]

In [9]:
%%time
# Replace each raw comment in the 'txt' column with its cleaned list of words
df.txt = df.txt.apply(cleanText)

Wall time: 7.79 s


In [10]:
df.head()

Unnamed: 0,cat,txt
0,sports,"[barely, better, gabbert, significantly, bette..."
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]"
2,sports,"[drafted, wrs, matt, millen, probably]"
3,sports,[done]
4,sports,[noo]


In [11]:
%%time
# Apply PorterStemmer
from functools import lru_cache

porter = PorterStemmer()

# The same words occur over and over across comments, so memoise the stemmer:
# each distinct word is stemmed once and later occurrences reuse the cached stem.
# The resulting stems are the same as calling porter.stem on every token.
cached_stem = lru_cache(maxsize=None)(porter.stem)

# NOTE(review): the recorded df.head() output after this cell shows stems in
# `txt` and no `txt_stems` column, and later cells read `df.txt`. The saved
# outputs look stale relative to this code -- confirm which column the
# downstream cells are meant to use.
df['txt_stems'] = df.txt.apply(lambda words: [cached_stem(word) for word in words])

Wall time: 3min 4s


In [12]:
df.head()

Unnamed: 0,cat,txt
0,sports,"[bare, better, gabbert, significantli, better,..."
1,sports,"[fuck, duck, angel, welcom, new, niner, fan]"
2,sports,"[draft, wr, matt, millen, probabl]"
3,sports,[done]
4,sports,[noo]


## Prepare Text for Model-Building

In [97]:
# Load libraries
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Convert to a word-count vector:

In [19]:
count = CountVectorizer()

In [15]:
# Re-join each comment's token list into one space-separated string;
# text_data is what the vectorizers are fitted on.
string = " "
text_data = [string.join(tokens) for tokens in df.txt]

In [20]:
%%time
# Word-count vector as a sparse matrix
sparseWCV = count.fit_transform(text_data)
sparseWCV

<606475x153186 sparse matrix of type '<class 'numpy.int64'>'
	with 7708544 stored elements in Compressed Sparse Row format>

In [78]:
sparseWCV.size

7708544

### Convert to a part-of-speech tag vector:

In [31]:
nltk.pos_tag(df.txt.iloc[1])[:5]

[('fuck', 'JJ'),
 ('duck', 'NN'),
 ('angel', 'NN'),
 ('welcom', 'VBD'),
 ('new', 'JJ')]

In [88]:
testDF = df.copy()

In [89]:
oneHotMulti = MultiLabelBinarizer()

In [90]:
%%time
# Tag all comments in one batch. NLTK documents pos_tag_sents() as the
# efficient way to tag more than one sentence, compared with calling
# pos_tag() once per comment.
# pos_tag_sents() returns one list of (word, tag) pairs per comment; only the
# tags are kept, giving one list of POS tags per comment.
taggedWords = [
    [tag for _word, tag in taggedComment]
    for taggedComment in nltk.pos_tag_sents(testDF.txt.tolist())
]

Wall time: 12min 6s


In [91]:
posMatrix = oneHotMulti.fit_transform(taggedWords)
posMatrix

Wall time: 1.48 s


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [92]:
posMatrix.shape

(606475, 39)

In [103]:
len(oneHotMulti.classes_)

39

### Convert to a TF-IDF vector:

In [98]:
tfidf = TfidfVectorizer()

In [99]:
%%time
# tfidf vector as a sparse matrix:
sparseTfidf = tfidf.fit_transform(text_data)
sparseTfidf

Wall time: 9.54 s


<606475x153186 sparse matrix of type '<class 'numpy.float64'>'
	with 7708544 stored elements in Compressed Sparse Row format>

In [102]:
# Number of features (distinct terms) learned by the fitted vectorizer.
# vocabulary_ maps each term to its column index, so its length is the feature
# count; unlike get_feature_names(), which was deprecated in scikit-learn 1.0
# and later removed, it is available on both old and new versions.
len(tfidf.vocabulary_)

153186

## Fit a Neural Network Classifier Using scikit-learn