# Data Preprocessing


In [None]:
import re
import string
import json
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from nltk import ToktokTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import wordnet
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared NLP helper objects, created once and reused by the cells below.
lemmatizer = WordNetLemmatizer()  # used by capture_lemmatization()
tokenizer = ToktokTokenizer()  # used by generate_tokens()
vectorizer = TfidfVectorizer()  # fitted on the cleaned article texts in the last cell

In [None]:
# Fetch the NLTK resources the preprocessing helpers rely on.
nltk.download('stopwords')  # corpus read into stopword_list
nltk.download('wordnet')  # lexical database consulted by WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # tagger model behind nltk.pos_tag

**Loading Data**


In [None]:
# English stop words from the NLTK corpus; generate_tokens() drops every
# token that appears in this list.
stopword_list = nltk.corpus.stopwords.words('english')


def remove_punctuations(text):
    """Return *text* with punctuation and other symbols removed.

    The input is trimmed of surrounding whitespace first. ASCII punctuation
    (including the underscore) is then deleted, followed by any remaining
    character that is neither a word character nor whitespace.
    """
    # Mapping each punctuation character to None makes translate() delete it.
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    without_ascii_punct = text.strip().translate(punctuation_table)
    # Second pass catches non-ASCII symbols such as typographic quotes.
    return re.sub(r'[^\w\s]', '', without_ascii_punct)


def generate_tokens(text):
    """Lower-case *text*, tokenize it and drop English stop words.

    Args:
        text: String to tokenize.

    Returns:
        A list of lower-cased tokens in their original order, without any
        token that appears in the module-level ``stopword_list``.
    """
    # Build the lookup set once per call so each membership test is a hash
    # lookup instead of a scan over the whole stop-word list.
    stop_words = set(stopword_list)
    tokens = tokenizer.tokenize(text.lower())
    return [token for token in tokens if token not in stop_words]


def get_wordnet_pos(word):
    """Return the WordNet POS constant that matches *word*'s POS tag.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected: J -> adjective, V -> verb,
    R -> adverb. Every other tag, N included, is treated as a noun.
    """
    _, pos_label = nltk.pos_tag([word])[0]
    initial = pos_label[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and all unmapped tags share the noun fallback.
    return wordnet.NOUN


def capture_lemmatization(tokens):
    """Lemmatize each token, using its own part of speech as the hint.

    Returns a new list with one lemma per input token, in the same order.
    """
    lemmas = []
    for word in tokens:
        pos = get_wordnet_pos(word)
        lemmas.append(lemmatizer.lemmatize(word, pos))
    return lemmas


def decontact(phrase):
    """Expand English contractions in *phrase* and drop standalone numbers.

    Both the straight apostrophe (') and the typographic one (U+2019) are
    recognised, and the irregular forms "won't" / "can't" keep the case of
    their first letter ("Can't" -> "Can not").

    Args:
        phrase: Text to process.

    Returns:
        The text with contractions written out and every whole-number token
        removed (the whitespace around a removed number is left in place).

    Note:
        "'s" is always expanded to " is", so possessives are expanded too.
    """
    # Character class matching either apostrophe variant.
    apos = "['\u2019]"

    # Irregular contractions must run before the generic "n't" rule, which
    # would otherwise turn them into "wo not" / "ca not".
    phrase = re.sub("([Ww])on" + apos + "t", r"\1ill not", phrase)
    phrase = re.sub("([Cc])an" + apos + "t", r"\1an not", phrase)

    # Remove tokens that consist of digits only.
    phrase = re.sub(r"\b\d+\b", "", phrase)

    # Generic suffix rules, applied in this fixed order.
    suffix_rules = (
        ("n" + apos + "t", " not"),
        (apos + "re", " are"),
        (apos + "s", " is"),
        (apos + "d", " would"),
        (apos + "ll", " will"),
        (apos + "t", " not"),
        (apos + "ve", " have"),
        (apos + "m", " am"),
    )
    for pattern, expansion in suffix_rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase


In [None]:

# Read the scraped NDTV articles; the context manager closes the file even
# when parsing raises.
with open('./articles/ndtv.json', encoding="utf8") as ndtvf:
    data = json.load(ndtvf)

# One cleaned, space-joined string per article, in input order.
article_vec = []

for article in data:
    # NOTE(review): assumes article['content'] is a list of strings - confirm
    content = ' '.join(article['content'])
    # Expand contractions BEFORE stripping punctuation: remove_punctuations()
    # deletes apostrophes, after which decontact() can no longer recognise
    # forms such as "don't" and they survive as noise tokens like "dont".
    content = decontact(content)
    content = remove_punctuations(content)
    tokens = generate_tokens(content)
    tokens = capture_lemmatization(tokens)
    modified_text = ' '.join(tokens)
    article_vec.append(modified_text)
    print(modified_text)

# Learn the vocabulary / IDF weights and build the document-term matrix in
# one pass over the same corpus.
tfidf_matrix = vectorizer.fit_transform(article_vec)

print(tfidf_matrix.toarray())
print(vectorizer.vocabulary_)
