# Scott Breitbach
## 20-March-2021
## DSC550, Week 2

In [1]:
import pandas as pd
import unicodedata
import sys
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

## 1) Read the *controversial-comments.jsonl* file and pre-process the text. 

In [2]:
# Load the JSON Lines file (one record per line) into a DataFrame.
# A forward slash is used as the separator: the original backslash formed the
# invalid escape sequence "\c" and only resolved as a path on Windows.
allCommentsDF = pd.read_json("controversial-comments/controversial-comments.jsonl", lines=True)
# Work with a random sample of 100 rows for the rest of the notebook.
commentsDF = allCommentsDF.sample(100)

In [3]:
# Preview the first three rows of the sample.
commentsDF.head(3)

Unnamed: 0,con,txt
835221,0,Not sure if your username is meant to be apt h...
510669,1,Because there's nothing of substance in the leak.
524678,1,[deleted]


In [4]:
# Preview the last three rows of the sample.
commentsDF.tail(3)

Unnamed: 0,con,txt
946419,0,"Remember how, after a tragedy like a shooting,..."
627110,0,"Oh, that's great news. Obama is the hero we ne..."
815359,0,I really fucking hope your right...


#### A) Convert all text to lowercase letters.

In [5]:
# Lowercase every comment in the 'txt' column.
lowered = commentsDF['txt'].str.lower()
commentsDF['txt'] = lowered

In [6]:
# Inspect the first rows after lowercasing.
commentsDF.head()

Unnamed: 0,con,txt
835221,0,not sure if your username is meant to be apt h...
510669,1,because there's nothing of substance in the leak.
524678,1,[deleted]
338313,0,"as a reminder, this subreddit [is for civil di..."
893191,0,even the terms they are using are outdated as ...


#### B) Remove all punctuation from the text.

In [7]:
# Translation table for str.translate: each code point whose Unicode general
# category starts with 'P' (punctuation) maps to None, which deletes it.
punctuation = {
    codepoint: None
    for codepoint in range(sys.maxunicode)
    if unicodedata.category(chr(codepoint)).startswith('P')
}

In [9]:
# Delete every punctuation character from each comment via the translation table.
commentsDF['txt'] = [comment.translate(punctuation) for comment in commentsDF['txt']]

In [10]:
# Inspect the first rows after punctuation removal.
commentsDF.head()

Unnamed: 0,con,txt
835221,0,not sure if your username is meant to be apt h...
510669,1,because theres nothing of substance in the leak
524678,1,deleted
338313,0,as a reminder this subreddit is for civil disc...
893191,0,even the terms they are using are outdated as ...


#### C) Remove stop words.

In [None]:
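# One-time setup: uncomment to download the NLTK stop-word corpus.
# word_tokenize also needs the 'punkt' tokenizer data
# ('punkt_tab' on newer NLTK releases).
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')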

In [13]:
stop_words = stopwords.words('english')
# Frozen-set copy used for lookups: membership is O(1) instead of scanning the
# whole stop-word list once per token.
_stop_word_set = frozenset(stop_words)


def removeStopWords(string):
    """Tokenize *string* and drop English stop words.

    Args:
        string: Text to split into tokens with NLTK's ``word_tokenize``.

    Returns:
        list: The tokens, in their original order, that are not in NLTK's
        English stop-word list.
    """
    tokenized_words = word_tokenize(string)
    return [word for word in tokenized_words if word not in _stop_word_set]

In [14]:
# Replace each comment with its list of non-stop-word tokens.
commentsDF['txt'] = commentsDF['txt'].apply(removeStopWords)

In [15]:
# Inspect the first rows after stop-word removal (entries are now token lists).
commentsDF.head()

Unnamed: 0,con,txt
835221,0,"[sure, username, meant, apt, dont, think, kind..."
510669,1,"[theres, nothing, substance, leak]"
524678,1,[deleted]
338313,0,"[reminder, subreddit, civil, discussionhttpsww..."
893191,0,"[even, terms, using, outdated, hell, latin, pe..."


#### D) Apply NLTK's PorterStemmer.

In [16]:
# Single PorterStemmer instance, reused for every token in the stemming step.
porter = PorterStemmer()

In [17]:
# Stem every token in each comment's token list with the Porter stemmer.
commentsDF['txt'] = commentsDF['txt'].apply(
    lambda tokens: [porter.stem(token) for token in tokens]
)

In [18]:
# Inspect the first rows after stemming.
commentsDF.head()

Unnamed: 0,con,txt
835221,0,"[sure, usernam, meant, apt, dont, think, kind,..."
510669,1,"[there, noth, substanc, leak]"
524678,1,[delet]
338313,0,"[remind, subreddit, civil, discussionhttpswwwr..."
893191,0,"[even, term, use, outdat, hell, latin, person,..."


## 2) Get text into a usable form for model-building.

#### A) Convert each text entry into a word-count vector.
See sections 5.3 & 6.8 in the *Machine Learning with Python Cookbook*

#### B) Convert each text entry into a part-of-speech tag vector.
See section 6.7 in the *Machine Learning with Python Cookbook*

#### C) Convert each entry into a term frequency-inverse document frequency (**tfidf**) vector.
See section 6.9 in the *Machine Learning with Python Cookbook*

## **Follow-Up Question**

### For the three techniques in problem 2 above, give an example where each would be useful.

#### A) Word-count vector

#### B) Part-of-speech vector

#### C) tfidf vector