In [1]:
import nltk
import pandas as pd
import numpy as np
import re 
import string
from nltk.stem import WordNetLemmatizer
from contractions import CONTRACTION_MAP
from nltk.stem import WordNetLemmatizer
from pattern.en import tag
from nltk.corpus import wordnet as wn
from pattern.en import tag
import gensim
# English stopword list from NLTK's stopwords corpus; consulted by remove_stopwords.
stopword_list = nltk.corpus.stopwords.words('english')
# Shared WordNet lemmatizer instance; used by lemmatize_text.
wnl = WordNetLemmatizer()

In [2]:
# Load the dataset from "election.csv" (relative path) and preview the first rows.
data = pd.read_csv("election.csv")
data.head()

Unnamed: 0,content,Task 1 Irony
0,As the Bharatiya Janata Party (BJP) looks set ...,0.0
1,During the 1992 United States presidential el...,0.0
2,"The BJP has a vision of India: one nation, on...",0.0
3,The Supreme Court on May 22 granted protection...,0.0
4,Many see the election as a referendum on Mr ...,0.0


In [3]:
# function to tokenize text and remove whitespace
def tokenize_text(text):
    """Split ``text`` into word tokens with surrounding whitespace removed.

    The input is coerced with ``str()`` before being passed to
    ``nltk.word_tokenize``. Returns a list of stripped token strings.
    """
    stripped_tokens = []
    for raw_token in nltk.word_tokenize(str(text)):
        stripped_tokens.append(raw_token.strip())
    return stripped_tokens

In [4]:
# tokenize_text(data["content"])

In [5]:
# function for expanding contractions
def expand_contractions(text,contraction_mapping):
    """Replace contractions in ``text`` with their expanded forms.

    Parameters
    ----------
    text : object
        Input text; coerced with ``str()`` before processing.
    contraction_mapping : dict
        Maps a contraction (e.g. ``"can't"``) to its expansion
        (e.g. ``"cannot"``).

    Returns
    -------
    str
        The text with known contractions expanded. Matching is
        case-insensitive and the first character of each match is kept as it
        was written in the input. Every apostrophe left after expansion is
        removed.
    """
    expanded_text = str(text)

    # Empty keys would make the pattern match the empty string everywhere.
    keys = [key for key in contraction_mapping if key]
    if keys:
        # Case-folded fallback table: the pattern matches case-insensitively,
        # so a match such as "i'm" must still resolve when the key is "I'm".
        folded_mapping = {}
        for key in keys:
            folded_mapping.setdefault(key.lower(), contraction_mapping[key])

        # Longest alternatives first so "can't've" is not pre-empted by
        # "can't"; keys are escaped so they are matched literally.
        keys.sort(key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            # Lookup order: exact key, lowercase key, then case-folded key.
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower())
                                    or folded_mapping.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text untouched.
                return match
            # Keep the capitalisation the author used for the first letter.
            return match[0] + expanded_contraction[1:]

        expanded_text = contractions_pattern.sub(expand_match, expanded_text)

    # Drop any apostrophes that remain (possessives, unknown contractions).
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [6]:
# expand_contractions(data["content"],CONTRACTION_MAP)

In [7]:
# functions for POS tagging and word lemmatization
def pos_tag_text(text):
    """POS-tag ``text`` and return lowercased ``(word, wordnet_pos)`` pairs.

    ``text`` is coerced with ``str()`` and tagged with ``pattern.en.tag``.
    Each tag is mapped to a WordNet POS constant by its first letter
    (J -> adjective, V -> verb, N -> noun, R -> adverb). Any other tag maps
    to ``None``.
    """

    def penn_to_wn_tags(pos_tag):
        # Guard clauses, checked in the same order: J, V, N, R.
        if pos_tag.startswith('J'):
            return wn.ADJ
        if pos_tag.startswith('V'):
            return wn.VERB
        if pos_tag.startswith('N'):
            return wn.NOUN
        if pos_tag.startswith('R'):
            return wn.ADV
        return None

    word_pos_pairs = []
    for word, pos_tag in tag(str(text)):
        word_pos_pairs.append((word.lower(), penn_to_wn_tags(pos_tag)))
    return word_pos_pairs

def lemmatize_text(text):
    """Lemmatize every word of ``text`` and return the result as one string.

    Words are POS-tagged with ``pos_tag_text``. A word with a WordNet POS is
    lemmatized with the shared ``wnl`` lemmatizer; a word without one is kept
    as returned by the tagger (already lowercased).

    Returns
    -------
    str
        The lemmatized tokens separated by single spaces.
    """
    lemmatized_tokens = [
        wnl.lemmatize(word, pos_tag) if pos_tag else word
        for word, pos_tag in pos_tag_text(text)
    ]
    # Join with a space: an empty separator would fuse all tokens into one
    # unbroken string that later tokenization could not split again.
    return ' '.join(lemmatized_tokens)

In [8]:
# lemmatize_text(data['content'])

In [9]:
# function to remove special symbols and characters
def remove_special_characters(text):
    """Strip punctuation characters from every token of ``text``.

    The text is tokenized with ``tokenize_text``, each character in
    ``string.punctuation`` is removed from every token, and tokens that
    become empty are dropped.

    Returns
    -------
    str
        The surviving tokens separated by single spaces.
    """
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    cleaned_tokens = (pattern.sub('', token) for token in tokenize_text(str(text)))
    # Join with a space so word boundaries survive; tokens that were pure
    # punctuation are now empty and are skipped.
    return ' '.join(token for token in cleaned_tokens if token)

In [10]:
# remove_special_characters(data["content"])

In [11]:
# function to remove stopwords
def remove_stopwords(text):
    """Remove tokens of ``text`` that appear in the module-level ``stopword_list``.

    The comparison is exact (case-sensitive), as in the membership test
    against ``stopword_list``.

    Returns
    -------
    str
        The remaining tokens separated by single spaces.
    """
    # Set membership is O(1) per token instead of scanning the list each time.
    stopwords = set(stopword_list)
    filtered_tokens = [token for token in tokenize_text(str(text))
                       if token not in stopwords]
    # Join with a space so the remaining words stay separated.
    return ' '.join(filtered_tokens)

In [12]:
# remove_stopwords(data["content"])

In [13]:
# text normalization pipeline
def normalize_corpus(corpus,tokenize=False):
    """Run the full normalization pipeline over every document in ``corpus``.

    Each document goes through, in order: contraction expansion,
    lemmatization, special-character removal and stopword removal.

    Parameters
    ----------
    corpus : iterable of str, or str
        Documents to normalize (for example a pandas Series of texts). A
        single string is treated as a one-document corpus.
    tokenize : bool, default False
        If True, each normalized document is returned as a list of tokens
        instead of a string.

    Returns
    -------
    list
        One entry per input document: a normalized string, or a token list
        when ``tokenize`` is True.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(corpus, str):
        corpus = [corpus]

    normalized_corpus = []
    for text in corpus:
        text = expand_contractions(text,CONTRACTION_MAP)
        text = lemmatize_text(text)
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        # Exactly one entry per document, so results stay aligned with input.
        normalized_corpus.append(tokenize_text(text) if tokenize else text)
    # Return only after every document has been processed.
    return normalized_corpus

In [14]:
# normalize_corpus(data['content'])

# feature extraction

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
def bow_extractor(corpus, ngram_range=(1,1)):
    """Fit a bag-of-words ``CountVectorizer`` on ``corpus``.

    Parameters
    ----------
    corpus : iterable
        Documents passed to ``CountVectorizer.fit_transform``.
    ngram_range : tuple, default (1, 1)
        Forwarded to ``CountVectorizer``.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, feature_matrix)``.
    """
    count_vectorizer = CountVectorizer(ngram_range=ngram_range, min_df=1)
    bow_matrix = count_vectorizer.fit_transform(corpus)
    return count_vectorizer, bow_matrix

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

def tfidf_transformer(bow_matrix):
    """Fit a ``TfidfTransformer`` on ``bow_matrix`` and transform it.

    The transformer is configured with L2 normalization, smoothed IDF and
    IDF weighting enabled.

    Returns
    -------
    tuple
        ``(fitted_transformer, tfidf_matrix)``.
    """
    settings = dict(norm='l2', smooth_idf=True, use_idf=True)
    fitted_transformer = TfidfTransformer(**settings)
    weighted_matrix = fitted_transformer.fit_transform(bow_matrix)
    return fitted_transformer, weighted_matrix
    

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_extractor(corpus, ngram_range=(1,1)):
    """Fit a ``TfidfVectorizer`` on ``corpus`` and return it with its features.

    The vectorizer uses ``min_df=1``, L2 normalization, smoothed IDF and IDF
    weighting; ``ngram_range`` is forwarded unchanged.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, feature_matrix)``.
    """
    options = {
        'min_df': 1,
        'norm': 'l2',
        'smooth_idf': True,
        'use_idf': True,
        'ngram_range': ngram_range,
    }
    tfidf_vectorizer = TfidfVectorizer(**options)
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    return tfidf_vectorizer, tfidf_matrix
    

In [18]:
import numpy as np    
    
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the ``model`` vectors of the words found in ``vocabulary``.

    Parameters
    ----------
    words : iterable
        Tokens of one document.
    model : mapping-like
        Indexed as ``model[word]`` to obtain a word's vector.
    vocabulary : container
        Words for which ``model`` is consulted; other words are skipped.
    num_features : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``num_features``; all zeros when no word of
        ``words`` is in ``vocabulary``.
    """
    running_sum = np.zeros((num_features,), dtype="float64")
    matched = 0.

    for token in words:
        if token not in vocabulary:
            continue
        matched += 1.
        running_sum = np.add(running_sum, model[token])

    # Only divide when at least one word contributed a vector.
    return np.divide(running_sum, matched) if matched else running_sum
    
   
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word vector per tokenized document in ``corpus``.

    The vocabulary is taken from ``model.index2word`` and each document is
    reduced with ``average_word_vectors``.

    Returns
    -------
    numpy.ndarray
        Array built from the per-document vectors, in corpus order.
    """
    # NOTE(review): `index2word` looks like the pre-4.0 gensim attribute
    # name; confirm against the installed gensim version.
    vocabulary = set(model.index2word)
    doc_vectors = []
    for tokenized_sentence in corpus:
        doc_vectors.append(
            average_word_vectors(tokenized_sentence, model, vocabulary, num_features))
    return np.array(doc_vectors)
    
    
def tfidf_wtd_avg_word_vectors(words, tfidf_vector, tfidf_vocabulary, model, num_features):
    """TF-IDF-weighted average of the ``model`` vectors for one document.

    Parameters
    ----------
    words : sequence
        Tokens of the document (iterated more than once).
    tfidf_vector : 2-D indexable
        Indexed as ``tfidf_vector[0, column]`` to read a term's weight.
    tfidf_vocabulary : dict
        Maps a word to its column index in ``tfidf_vector``.
    model : object
        Provides ``model.index2word`` (known words) and ``model[word]``
        (word vector).
    num_features : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``num_features``: the weighted sum of word
        vectors divided by the total weight, or the undivided sum when the
        total weight is zero.
    """
    word_tfidf_map = {}
    for word in words:
        column = tfidf_vocabulary.get(word)
        # Compare against None explicitly: column 0 is a valid index but is
        # falsy, so a truthiness test would zero out that term's weight.
        word_tfidf_map[word] = tfidf_vector[0, column] if column is not None else 0

    feature_vector = np.zeros((num_features,),dtype="float64")
    vocabulary = set(model.index2word)
    wts = 0.
    for word in words:
        if word in vocabulary:
            weight = word_tfidf_map[word]
            feature_vector = np.add(feature_vector, weight * model[word])
            wts = wts + weight
    # Avoid dividing by zero when no word carried any weight.
    if wts:
        feature_vector = np.divide(feature_vector, wts)

    return feature_vector
    
def tfidf_weighted_averaged_word_vectorizer(corpus, tfidf_vectors, 
                                   tfidf_vocabulary, model, num_features):
    """Build one TF-IDF-weighted averaged word vector per document.

    ``corpus`` and ``tfidf_vectors`` are paired positionally with ``zip``.
    Each pair is reduced with ``tfidf_wtd_avg_word_vectors``.

    Returns
    -------
    numpy.ndarray
        Array built from the per-document vectors, in corpus order.
    """
    # Materialize the pairs first, then reduce each one.
    paired_docs = list(zip(corpus, tfidf_vectors))
    doc_vectors = []
    for tokenized_sentence, doc_tfidf in paired_docs:
        doc_vectors.append(
            tfidf_wtd_avg_word_vectors(tokenized_sentence, doc_tfidf,
                                       tfidf_vocabulary, model, num_features))
    return np.array(doc_vectors)