# Scott Breitbach
## 20-March-2021
## DSC550, Week 2

In [1]:
import pandas as pd
import unicodedata
import sys
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import pos_tag
from nltk import word_tokenize
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

## 1) Read the *controversial-comments.jsonl* file and pre-process the text. 

In [2]:
# Load the JSON-lines file (one JSON record per line) into a DataFrame.
# The path uses a forward slash: the previous "\c" was an invalid escape
# sequence that only resolved as a path separator on Windows.
allCommentsDF = pd.read_json("controversial-comments/controversial-comments.jsonl", lines=True)
allCommentsDF

Unnamed: 0,con,txt
0,0,Well it's great that he did something about th...
1,0,You are right Mr. President.
2,0,You have given no input apart from saying I am...
3,0,I get the frustration but the reason they want...
4,0,I am far from an expert on TPP and I would ten...
...,...,...
949995,0,I genuinely can't understand how anyone can su...
949996,0,"As a reminder, this subreddit [is for civil di..."
949997,0,K. Don't explain why or anything.
949998,0,[deleted]


Grab a sample of the set to work with:

In [3]:
# Work on a 100-row random sample; the fixed random_state makes the same rows
# (and therefore the same downstream outputs) come back on every re-run.
commentsDF = allCommentsDF.sample(n=100, random_state=42)

#### A) Convert all text to lowercase letters.

In [4]:
# Lower-case every comment so later token comparisons ignore capitalisation.
commentsDF['txt'] = commentsDF.txt.str.lower()

In [5]:
commentsDF.head()

Unnamed: 0,con,txt
256176,0,[deleted]
904398,0,no. trump also apparently had a 2020 reelectio...
484075,0,[removed]
844124,0,well since i know what has been going on here ...
499079,0,which truth is that? obama is a kenyan infiltr...


#### B) Remove all punctuation from the text.

In [6]:
# Translation table for str.translate: every code point whose Unicode general
# category starts with 'P' (punctuation) maps to None, i.e. gets deleted.
punctuation = {
    code_point: None
    for code_point in range(sys.maxunicode)
    if unicodedata.category(chr(code_point))[0] == 'P'
}

In [7]:
# Delete every punctuation character from each comment via the translation table.
commentsDF['txt'] = [comment.translate(punctuation) for comment in commentsDF['txt']]

In [8]:
commentsDF.head()

Unnamed: 0,con,txt
256176,0,deleted
904398,0,no trump also apparently had a 2020 reelection...
484075,0,removed
844124,0,well since i know what has been going on here ...
499079,0,which truth is that obama is a kenyan infiltra...


#### C) Remove stop words.

In [9]:
# import nltk
# nltk.download('stopwords')

In [10]:
stop_words = stopwords.words('english')
# Frozen-set snapshot of the list so each membership test in the filter below
# is a hash lookup instead of a linear scan of the list for every token.
# (Later edits to `stop_words` are not reflected here; rebuild it if needed.)
_stop_word_set = frozenset(stop_words)

def removeStopWords(string):
    """Tokenize ``string`` and drop the English stop words.

    Parameters
    ----------
    string : str
        Text to filter. Matching is an exact, case-sensitive comparison
        against the NLTK English stop-word list.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` that are not stop words,
        in their original order. An empty list if nothing remains.

    Notes
    -----
    Needs the NLTK ``stopwords`` corpus and the tokenizer data used by
    ``word_tokenize`` to be downloaded.
    NOTE(review): this notebook strips punctuation before calling this, so
    contractions such as "dont" presumably no longer match list entries like
    "don't" -- confirm whether stop-word removal should run first.
    """
    return [word for word in word_tokenize(string) if word not in _stop_word_set]

In [11]:
# Replace each comment string with its list of non-stop-word tokens.
commentsDF['txt'] = commentsDF['txt'].apply(removeStopWords)

In [12]:
commentsDF.head()

Unnamed: 0,con,txt
256176,0,[deleted]
904398,0,"[trump, also, apparently, 2020, reelection, ca..."
484075,0,[removed]
844124,0,"[well, since, know, going, yea, live, near, or..."
499079,0,"[truth, obama, kenyan, infiltrator, ted, cruzs..."


#### D) Apply NLTK's PorterStemmer.

In [13]:
porter = PorterStemmer()

In [14]:
# Reduce every token in every comment to its Porter stem.
commentsDF['txt'] = commentsDF['txt'].apply(
    lambda tokens: [porter.stem(token) for token in tokens]
)

In [15]:
commentsDF.head()

Unnamed: 0,con,txt
256176,0,[delet]
904398,0,"[trump, also, appar, 2020, reelect, campaign, ..."
484075,0,[remov]
844124,0,"[well, sinc, know, go, yea, live, near, orovil..."
499079,0,"[truth, obama, kenyan, infiltr, ted, cruz, dad..."


## 2) Get text into a usable form for model-building.

#### A) Convert each text entry into a word-count vector.
See sections 5.3 & 6.8 in the *Machine Learning with Python Cookbook*

In [16]:
count = CountVectorizer()

In [17]:
# commentsDF['WCV'] = commentsDF.txt.apply(lambda x: count.fit_transform(x).toarray())

Converting to word-count vector threw an error when the 'txt' field contained an empty list. This function should return an empty list when coming across this error.

In [18]:
# def wordCountVector(wordList):
#     try:
#         array = count.fit_transform(wordList).toarray()
#         return array
#     except:
#         return []

In [19]:
# commentsDF['WCV'] = commentsDF.txt.apply(lambda x: wordCountVector(x))
# commentsDF.head()

Apparently this needs to be in one large matrix so let's try it again:

In [20]:
# Re-join each comment's token list into a single space-separated string, so
# text_data is a list with one string per comment (the vectorizers below take
# an iterable of strings, not lists of tokens).
text_data = [" ".join(tokens) for tokens in commentsDF.txt]

In [21]:
# # Word-count vector as a DataFrame
# # (scikit-learn >= 1.0 renamed get_feature_names() to get_feature_names_out())
# wordCountVector = pd.DataFrame(count.fit_transform(text_data).toarray(), columns=count.get_feature_names_out())
# wordCountVector

In [22]:
# Word-count vector as a sparse matrix
sparseWCV = count.fit_transform(text_data)
sparseWCV

<100x770 sparse matrix of type '<class 'numpy.int64'>'
	with 1420 stored elements in Compressed Sparse Row format>

#### B) Convert each text entry into a part-of-speech tag vector.
See section 6.7 in the *Machine Learning with Python Cookbook*

In [23]:
nltk.pos_tag(commentsDF.txt.iloc[0])[:5]

[('delet', 'NN')]

In [24]:
# Tag each comment's tokens and keep only the tag sequence (words are dropped).
commentsDF['PoS'] = commentsDF['txt'].apply(
    lambda tokens: [pos for _, pos in nltk.pos_tag(tokens)]
)

In [25]:
commentsDF.head()

Unnamed: 0,con,txt,PoS
256176,0,[delet],[NN]
904398,0,"[trump, also, appar, 2020, reelect, campaign, ...","[NN, RB, JJ, CD, NN, NN, VBD, PRP, JJ, NN, NN,..."
484075,0,[remov],[NN]
844124,0,"[well, sinc, know, go, yea, live, near, orovil...","[RB, RB, VBP, VB, JJ, JJ, IN, JJ, NN, NN, VB, ..."
499079,0,"[truth, obama, kenyan, infiltr, ted, cruz, dad...","[NN, MD, VB, NN, VBN, JJ, NN, NN, NN, NN, VBP,..."


#### C) Convert each entry into a term frequency-inverse document frequency (**tfidf**) vector.
See section 6.9 in the *Machine Learning with Python Cookbook*

In [26]:
tfidf = TfidfVectorizer()

In [27]:
# # tfidf vector as a DataFrame
# # (scikit-learn >= 1.0 renamed get_feature_names() to get_feature_names_out())
# tfidfVector = pd.DataFrame(tfidf.fit_transform(text_data).toarray(), columns=tfidf.get_feature_names_out())
# tfidfVector

In [28]:
# tfidf vector as a sparse matrix:
sparseTfidf = tfidf.fit_transform(text_data)
sparseTfidf

<100x770 sparse matrix of type '<class 'numpy.float64'>'
	with 1420 stored elements in Compressed Sparse Row format>

## **Follow-Up Question**

### For the three techniques in problem 2 above, give an example where each would be useful.

#### A) Word-count vector

#### B) Part-of-speech vector

#### C) tfidf vector