# Week 2 : Assignment

Name : Ragunath Gunasekaran

In [61]:
import pandas as pd
import unicodedata
import sys
import numpy as np

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

## Reading the controversial-comments.jsonl file into a DataFrame

In [62]:
# the controversial comments availabe in the Data Files/controversial-comments/controversial-comments.jsonl
Con_com_df = pd.read_json('Data Files/controversial-comments/controversial-comments.jsonl',lines=True)
print (Con_com_df)

        con                                                txt
0         0  Well it's great that he did something about th...
1         0                       You are right Mr. President.
2         0  You have given no input apart from saying I am...
3         0  I get the frustration but the reason they want...
4         0  I am far from an expert on TPP and I would ten...
...     ...                                                ...
949995    0  I genuinely can't understand how anyone can su...
949996    0  As a reminder, this subreddit [is for civil di...
949997    0                  K. Don't explain why or anything.
949998    0                                          [deleted]
949999    0  Ya, sociopaths are known for celebrating their...

[950000 rows x 2 columns]


## Convert all text to lowercase letters

In [63]:
# converting into the lowercase
Con_com_df["txt"]=Con_com_df["txt"].apply(lambda x: x.lower())
Con_com_df["txt"]

0         well it's great that he did something about th...
1                              you are right mr. president.
2         you have given no input apart from saying i am...
3         i get the frustration but the reason they want...
4         i am far from an expert on tpp and i would ten...
                                ...                        
949995    i genuinely can't understand how anyone can su...
949996    as a reminder, this subreddit [is for civil di...
949997                    k. don't explain why or anything.
949998                                            [deleted]
949999    ya, sociopaths are known for celebrating their...
Name: txt, Length: 950000, dtype: object

## Remove all punctuation from the text

In [64]:
punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))
# removing puncutation
Con_com_df["txt"]= [string.translate(punctuation) for string in Con_com_df["txt"]]

In [65]:
Con_com_df

Unnamed: 0,con,txt
0,0,well its great that he did something about tho...
1,0,you are right mr president
2,0,you have given no input apart from saying i am...
3,0,i get the frustration but the reason they want...
4,0,i am far from an expert on tpp and i would ten...
...,...,...
949995,0,i genuinely cant understand how anyone can sup...
949996,0,as a reminder this subreddit is for civil disc...
949997,0,k dont explain why or anything
949998,0,deleted


## Remove stop words

In [None]:
Con_com_df["txt"] = Con_com_df["txt"].str.lower().str.split()

stop = stopwords.words('english')
# Removing stop words
Con_com_df["txt"].apply(lambda x: [item for item in x if item not in stop])

## NLTK’s PorterStemmer

In [42]:
# PorterStemmer
ps = PorterStemmer()

# Applying NLTK’s PorterStemmer
Con_com_df["txt"] = Con_com_df["txt"].apply(lambda word_list: [ ps.stem(word) for word in word_list])

##  Convert each text entry into a word-count vector

In [49]:
# Calling Vectorizer from NLTK
count = CountVectorizer()
# using count fit transform
bagofwords=count.fit_transform(Con_com_df["txt"])

In [50]:
bagofwords

<950000x189015 sparse matrix of type '<class 'numpy.int64'>'
	with 25712788 stored elements in Compressed Sparse Row format>

##  Convert each text entry into a part-of-speech tag vector 

In [None]:
words = Con_com_df["txt"].values.flatten()
# declaring array
part_of_speechTag = []

# tag each word and each text
for word in words:
    word_tag = nltk.pos_tag(word_tokenize(word))
    part_of_speechTag.append([tag for word, tag in word_tag])

# use one hot encoding to convert the tags into features
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(part_of_speechTag)

## Convert each entry into a term frequency-inverse document frequency (tfidf) vector

In [51]:
tfidf = TfidfVectorizer()
# Term Frequ-inverse document vector
feature_matrix = tfidf.fit_transform(Con_com_df["txt"])
feature_matrix

<950000x189015 sparse matrix of type '<class 'numpy.float64'>'
	with 25712788 stored elements in Compressed Sparse Row format>

In [53]:
tfidf.vocabulary_

{'well': 182092,
 'it': 102788,
 'great': 89579,
 'that': 166398,
 'he': 92739,
 'did': 68366,
 'something': 158044,
 'about': 33061,
 'those': 168190,
 'beliefs': 44300,
 'while': 182815,
 'was': 181174,
 'in': 99201,
 'office': 127833,
 'doubt': 71522,
 'trump': 171398,
 'would': 185019,
 'fight': 81645,
 'the': 166466,
 'un': 174443,
 'for': 83337,
 'so': 157521,
 'really': 143211,
 'happy': 92015,
 'obama': 127029,
 'could': 60449,
 'oh': 127966,
 'wait': 180715,
 'you': 187125,
 'are': 39035,
 'right': 147471,
 'mr': 121021,
 'president': 138085,
 'have': 92517,
 'given': 87789,
 'no': 125118,
 'input': 100764,
 'apart': 38251,
 'from': 84647,
 'saying': 151029,
 'am': 36461,
 'wrong': 185295,
 'argument': 39122,
 'clearly': 55764,
 'get': 87216,
 'frustration': 84778,
 'but': 50202,
 'reason': 143326,
 'they': 167826,
 'want': 180954,
 'them': 167411,
 'to': 169240,
 'do': 70353,
 'way': 181494,
 'is': 102435,
 'because': 43888,
 'its': 102903,
 'foundation': 83830,
 'more': 1204

## Reference

Machine Learning with Python Cookbook
by Chris Albon
Released March 2018
Publisher(s): O'Reilly Media, Inc.
ISBN: 9781491989388