# Scott Breitbach
## 20-March-2021
## DSC550, Week 2

In [1]:
import pandas as pd
import unicodedata
import sys
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import pos_tag
from nltk import word_tokenize
import nltk

## 1) Read the *controversial-comments.jsonl* file and pre-process the text. 

In [2]:
# Load the JSON Lines file (one JSON record per line) into a DataFrame.
# The path uses a forward slash: the original "\c" is not a valid string
# escape, and a backslash-separated path only resolves on Windows.
allCommentsDF = pd.read_json("controversial-comments/controversial-comments.jsonl", lines=True)
allCommentsDF

Unnamed: 0,con,txt
0,0,Well it's great that he did something about th...
1,0,You are right Mr. President.
2,0,You have given no input apart from saying I am...
3,0,I get the frustration but the reason they want...
4,0,I am far from an expert on TPP and I would ten...
...,...,...
949995,0,I genuinely can't understand how anyone can su...
949996,0,"As a reminder, this subreddit [is for civil di..."
949997,0,K. Don't explain why or anything.
949998,0,[deleted]


In [3]:
# Work on a 100-row random sample of the full data set.
# A fixed random_state makes the sample, and every output derived from it,
# reproducible across kernel restarts.
commentsDF = allCommentsDF.sample(100, random_state=42)

#### A) Convert all text to lowercase letters.

In [4]:
# Lowercase every comment with the vectorized string accessor.
commentsDF['txt'] = commentsDF.txt.str.lower()

In [5]:
# Spot-check: the txt column should now be all lowercase.
commentsDF.head()

Unnamed: 0,con,txt
516372,0,[deleted]
584282,0,"as long as you're not in the ""obstruct her eve..."
558074,0,"it was mostly because of the guy's question, n..."
305702,0,that's what obama and the dems are working on....
605221,0,yup. my mother sadly thinks this is just an ob...


#### B) Remove all punctuation from the text.

In [6]:
# Translation table for str.translate(): every Unicode code point whose
# general category starts with "P" (punctuation) maps to None, so it is deleted.
punctuation = {
    code_point: None
    for code_point in range(sys.maxunicode)
    if unicodedata.category(chr(code_point))[0] == 'P'
}

In [7]:
# Delete punctuation from each comment using the translation table.
commentsDF['txt'] = commentsDF['txt'].map(lambda comment: comment.translate(punctuation))

In [8]:
# Spot-check: punctuation should be gone from the txt column.
commentsDF.head()

Unnamed: 0,con,txt
516372,0,deleted
584282,0,as long as youre not in the obstruct her every...
558074,0,it was mostly because of the guys question not...
305702,0,thats what obama and the dems are working on ...
605221,0,yup my mother sadly thinks this is just an obv...


#### C) Remove stop words.

In [9]:
# One-time setup: uncomment and run if the NLTK stop-word data is not installed.
# NOTE(review): word_tokenize and pos_tag used below presumably need their own
# NLTK data packages as well (e.g. 'punkt', 'averaged_perceptron_tagger') — confirm.
# import nltk
# nltk.download('stopwords')

In [10]:
# NLTK's English stop-word list, plus a frozenset copy used for lookups so each
# token is checked in constant time instead of scanning the whole list.
stop_words = stopwords.words('english')
_stop_word_set = frozenset(stop_words)

def removeStopWords(string):
    """Tokenize ``string`` and drop English stop words.

    Parameters
    ----------
    string : str
        Text to tokenize with ``word_tokenize``.

    Returns
    -------
    list of str
        The tokens of ``string``, in order, excluding any token that appears
        in ``stop_words``. The comparison is exact (case-sensitive).
    """
    return [word for word in word_tokenize(string) if word not in _stop_word_set]

In [11]:
# Replace each comment string with its list of non-stop-word tokens.
commentsDF['txt'] = commentsDF['txt'].apply(removeStopWords)

In [12]:
# Spot-check: txt now holds token lists with stop words removed.
commentsDF.head()

Unnamed: 0,con,txt
516372,0,[deleted]
584282,0,"[long, youre, obstruct, every, move, camp, tha..."
558074,0,"[mostly, guys, question, answers]"
305702,0,"[thats, obama, dems, working, gop, much]"
605221,0,"[yup, mother, sadly, thinks, obvious, biblical..."


#### D) Apply NLTK's PorterStemmer.

In [13]:
# Single stemmer instance, reused for every token in the stemming step.
porter = PorterStemmer()

In [14]:
# Stem every token of every comment with the Porter stemmer.
commentsDF['txt'] = commentsDF['txt'].apply(
    lambda tokens: [porter.stem(token) for token in tokens]
)

In [15]:
# Spot-check: tokens in txt should now be stemmed.
commentsDF.head()

Unnamed: 0,con,txt
516372,0,[delet]
584282,0,"[long, your, obstruct, everi, move, camp, that..."
558074,0,"[mostli, guy, question, answer]"
305702,0,"[that, obama, dem, work, gop, much]"
605221,0,"[yup, mother, sadli, think, obviou, biblic, tr..."


## 2) Get text into a usable form for model-building.

#### A) Convert each text entry into a word-count vector.
See sections 5.3 & 6.8 in the *Machine Learning with Python Cookbook*

In [16]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
count = CountVectorizer()

In [17]:
# First attempt (disabled): fit the vectorizer separately on each row's token list.
# commentsDF['WCV'] = commentsDF.txt.apply(lambda x: count.fit_transform(x).toarray())

Converting to a word-count vector raised an error whenever the 'txt' field contained an empty list. The function below returns an empty list when that error occurs.

In [18]:
# Superseded attempt, kept for reference: fit the vectorizer per comment and
# fall back to an empty list on failure.
# NOTE(review): the bare `except:` would hide every error, not only the
# empty-input one; catch the specific exception if this is ever revived.
# def wordCountVector(wordList):
#     try:
#         array = count.fit_transform(wordList).toarray()
#         return array
#     except:
#         return []

In [19]:
# Disabled together with the per-comment wordCountVector() helper: this would
# store one word-count array per row in a WCV column.
# commentsDF['WCV'] = commentsDF.txt.apply(lambda x: wordCountVector(x))
# commentsDF.head()

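Fitting the vectorizer one comment at a time gives every row its own vocabulary; the counts need to be in one large matrix with a shared vocabulary, so let's try it again: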

In [27]:
# Re-join each comment's tokens into one space-separated string, so that each
# comment is passed to CountVectorizer as a single document.
text_data = [" ".join(tokens) for tokens in commentsDF['txt']]

# Fit on the whole corpus at once so every row shares a single vocabulary.
counts = count.fit_transform(text_data)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one on older installations.
if hasattr(count, "get_feature_names_out"):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()

# One row per comment (positional 0..n-1 index), one column per vocabulary term.
wordCountVector = pd.DataFrame(counts.toarray(), columns=feature_names)

In [28]:
# Display the word-count matrix: one row per sampled comment, one column per term.
wordCountVector

Unnamed: 0,12,1970,1999,2004,2016,40,50,85,absolut,accus,...,worth,would,wrong,ye,yeah,year,young,your,youthen,yup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### B) Convert each text entry into a part-of-speech tag vector.
See section 6.7 in the *Machine Learning with Python Cookbook*

In [21]:
# Peek at the (token, tag) pairs for the first sampled comment.
pos_tag(commentsDF['txt'].iloc[0])[:5]

[('delet', 'NN')]

In [22]:
# Keep only the tag from each (token, tag) pair, one tag list per comment.
commentsDF['PoS'] = commentsDF['txt'].apply(
    lambda tokens: [tag for _, tag in pos_tag(tokens)]
)

In [23]:
# Preview the frame with the new PoS column.
# NOTE(review): the saved output also shows a WCV column, which is only created
# by the commented-out per-row word-count cell; a fresh Restart & Run All will
# not produce that column.
commentsDF.head()

Unnamed: 0,con,txt,WCV,PoS
516372,0,[delet],[[1]],[NN]
584282,0,"[long, your, obstruct, everi, move, camp, that...","[[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, ...","[RB, PRP$, NN, JJ, NN, NN, IN, VBZ, JJ, NN, NN]"
558074,0,"[mostli, guy, question, answer]","[[0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1], [1,...","[NN, NN, NN, NN]"
305702,0,"[that, obama, dem, work, gop, much]","[[0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0], [1, 0...","[DT, VBZ, JJ, NN, RB, JJ]"
605221,0,"[yup, mother, sadli, think, obviou, biblic, tr...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[NN, NN, NN, VBP, IN, JJ, NN, PRP, VBD, JJ, VB..."


#### C) Convert each entry into a term frequency-inverse document frequency (**tfidf**) vector.
See section 6.9 in the *Machine Learning with Python Cookbook*

## **Follow-Up Question**

### For the three techniques in problem 2 above, give an example where each would be useful.

#### A) Word-count vector

#### B) Part-of-speech vector

#### C) tfidf vector