# Extracting General Features from Text

In [1]:
import pandas as pd
from string import punctuation
import nltk

nltk.download('tagsets')
from nltk.data import load

nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk import word_tokenize
from collections import Counter

[nltk_data] Downloading package tagsets to /home/muzaffar/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/muzaffar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [31]:
def get_tagsets():
    """Return the tag names found in NLTK's `upenn_tagset` help file.

    The pickled resource is read through ``nltk.data.load``; only its keys
    are kept and returned as a plain list.
    """
    upenn_tags = load('help/tagsets/upenn_tagset.pickle')
    return [*upenn_tags.keys()]
 
# Build the tag list once; it is passed to get_pos_occurrence_freq later on
# to define the columns of the feature frame.
tag_list = get_tagsets()
 
# Show the tags that were loaded.
print(tag_list)

['WDT', 'NNS', 'UH', 'PRP$', ':', 'EX', 'JJ', 'JJS', 'NNPS', 'TO', 'RBS', '.', 'JJR', 'WRB', 'SYM', 'RBR', ',', 'VBD', 'WP', ')', 'FW', 'PDT', 'VBG', 'VBZ', 'IN', "''", 'CC', '$', 'WP$', '``', 'VBN', 'POS', 'VB', 'RP', 'PRP', 'NN', 'CD', 'RB', 'LS', 'VBP', 'DT', 'NNP', '--', 'MD', '(']


In [86]:
def get_pos_occurrence_freq(data, tag_list):
    """Count how often each part-of-speech tag occurs in every text row.

    Args:
        data: DataFrame with a ``text`` column holding one text per row.
        tag_list: Tag names that must appear as columns of the result,
            in this order.

    Returns:
        A new DataFrame with one row per entry of ``data['text']`` (indexed
        0..n-1). Columns are ``tag_list`` followed by any tag produced by
        the tagger that is not in ``tag_list``, in order of first
        appearance. Tags that do not occur in a row are filled with 0.
    """
    # Collect one {tag: count} mapping per text first and build the frame in
    # a single step. Growing a DataFrame row by row with DataFrame.append
    # recopies the frame on every call and the method no longer exists in
    # pandas >= 2.0.
    rows = [
        # str() mirrors the other feature functions so that non-string
        # cells (e.g. NaN) do not crash the tokenizer.
        dict(Counter(tag for _, tag in pos_tag(word_tokenize(str(text_line)))))
        for text_line in data['text']
    ]
    feature_df = pd.DataFrame(rows)

    # Keep the requested tags first, then any unexpected tags the tagger
    # emitted, so no count is silently dropped.
    known_tags = set(tag_list)
    extra_tags = [tag for tag in feature_df.columns if tag not in known_tags]
    feature_df = feature_df.reindex(columns=list(tag_list) + extra_tags)

    return feature_df.fillna(0)

In [87]:
# Load the dataset; header=0 takes the first CSV row as the column names.
# The feature functions in this notebook read its `text` column.
data = pd.read_csv('../../data_ch2/data.csv', header=0)

In [90]:
# Start the feature frame with the per-row POS tag counts and preview it.
feature_df = get_pos_occurrence_freq(data, tag_list)
feature_df.head()

Unnamed: 0,WDT,NNS,UH,PRP$,:,EX,JJ,JJS,NNPS,TO,...,NN,CD,RB,LS,VBP,DT,NNP,--,MD,(
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
def add_punctuation_count(feature_df, data):
    """Add a `num_of_unique_punctuations` column to `feature_df`.

    For every entry of ``data['text']`` the set of its characters is
    intersected with ``string.punctuation``; the size of that intersection
    (the number of *distinct* punctuation characters) is stored.

    Args:
        feature_df: DataFrame that receives the new column (modified in place).
        data: DataFrame with a ``text`` column.

    Returns:
        The same `feature_df` object.
    """
    # Build the punctuation set once instead of once per row.
    punctuation_chars = frozenset(punctuation)

    def count_unique_punctuation(text):
        # str() keeps non-string cells (NaN, numbers) from raising
        # TypeError, matching how the other feature functions coerce input.
        return len(punctuation_chars.intersection(str(text)))

    feature_df['num_of_unique_punctuations'] = data['text'].apply(
        count_unique_punctuation)
    return feature_df
 
# Add the distinct-punctuation count to the feature frame.
feature_df = add_punctuation_count(feature_df, data)
 
# Preview the new column.
feature_df['num_of_unique_punctuations'].head()

0    0.0
1    0.0
2    1.0
3    1.0
4    0.0
Name: num_of_unique_punctuations, dtype: float64

In [7]:
def get_capitalized_word_count(feature_df, data):
    """Add a `number_of_capital_words` column to `feature_df`.

    Every entry of ``data['text']`` is converted to ``str`` and tokenized
    with ``word_tokenize``; the tokens whose first character is uppercase
    are counted (a token that repeats is counted each time).

    Returns the same `feature_df` object, modified in place.
    """
    def count_capitalized(text):
        tokens = word_tokenize(str(text))
        return sum(1 for token in tokens if token[0].isupper())

    capital_counts = data['text'].apply(count_capitalized)
    feature_df['number_of_capital_words'] = capital_counts
    return feature_df
 
# Add the capitalized-token count to the feature frame.
feature_df = get_capitalized_word_count(feature_df, data)
 
# Preview the new column.
feature_df['number_of_capital_words'].head()

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: number_of_capital_words, dtype: float64

In [8]:
def get_small_word_count(feature_df, data):
    """Add a `number_of_small_words` column to `feature_df`.

    Every entry of ``data['text']`` is converted to ``str`` and tokenized
    with ``word_tokenize``; the tokens whose first character is lowercase
    are counted (a token that repeats is counted each time).

    Returns the same `feature_df` object, modified in place.
    """
    def count_lowercase_initial(text):
        tokens = word_tokenize(str(text))
        return sum(1 for token in tokens if token[0].islower())

    small_counts = data['text'].apply(count_lowercase_initial)
    feature_df['number_of_small_words'] = small_counts
    return feature_df

In [9]:
# Add the lowercase-initial token count to the feature frame and preview it.
feature_df = get_small_word_count(feature_df, data)
feature_df['number_of_small_words'].head()

0    4.0
1    3.0
2    7.0
3    3.0
4    2.0
Name: number_of_small_words, dtype: float64

In [10]:
def get_number_of_alphabets(feature_df, data):
    """Add a `number_of_alphabets` column to `feature_df`.

    Every entry of ``data['text']`` is converted to ``str`` and the
    characters for which ``str.isalpha`` is true are counted.

    Returns the same `feature_df` object, modified in place.
    """
    def count_letters(text):
        return sum(1 for char in str(text) if char.isalpha())

    letter_counts = data['text'].apply(count_letters)
    feature_df['number_of_alphabets'] = letter_counts
    return feature_df
# Add the alphabetic-character count to the feature frame and preview it.
feature_df = get_number_of_alphabets(feature_df, data)
feature_df['number_of_alphabets'].head()


0    19.0
1    18.0
2    28.0
3    14.0
4    13.0
Name: number_of_alphabets, dtype: float64

In [11]:
def get_number_of_digit_count(feature_df, data):
    """Add a `number_of_digits` column to `feature_df`.

    Every entry of ``data['text']`` is converted to ``str`` and the
    characters for which ``str.isdigit`` is true are counted.

    Returns the same `feature_df` object, modified in place.
    """
    def count_digits(text):
        return sum(1 for char in str(text) if char.isdigit())

    digit_counts = data['text'].apply(count_digits)
    feature_df['number_of_digits'] = digit_counts
    return feature_df
# Add the digit-character count to the feature frame and preview it.
feature_df = get_number_of_digit_count(feature_df, data)
feature_df['number_of_digits'].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: number_of_digits, dtype: float64

In [12]:
def get_number_of_words(feature_df, data):
    """Add a `number_of_words` column to `feature_df`.

    Every entry of ``data['text']`` is converted to ``str`` and tokenized
    with ``word_tokenize``; the number of resulting tokens is stored.

    Returns the same `feature_df` object, modified in place.
    """
    def count_tokens(text):
        return len(word_tokenize(str(text)))

    token_counts = data['text'].apply(count_tokens)
    feature_df['number_of_words'] = token_counts
    return feature_df

In [13]:
# Add the token count to the feature frame and preview it.
feature_df = get_number_of_words(feature_df, data)
feature_df['number_of_words'].head()

0    5.0
1    4.0
2    9.0
3    5.0
4    3.0
Name: number_of_words, dtype: float64

In [14]:
def get_number_of_whitespaces(feature_df, data):
    """Add a `number_of_white_spaces` column to `feature_df`.

    Every entry of ``data['text']`` is converted to ``str`` and the
    characters for which ``str.isspace`` is true are counted.

    Returns the same `feature_df` object, modified in place.
    """
    def count_whitespace(text):
        return sum(1 for char in str(text) if char.isspace())

    whitespace_counts = data['text'].apply(count_whitespace)
    feature_df['number_of_white_spaces'] = whitespace_counts
    return feature_df
 
# Add the whitespace-character count to the feature frame and preview it.
feature_df = get_number_of_whitespaces(feature_df, data)
feature_df['number_of_white_spaces'].head()

0    4.0
1    3.0
2    7.0
3    3.0
4    2.0
Name: number_of_white_spaces, dtype: float64

In [15]:
# Preview the feature frame after all feature columns have been added.
feature_df.head()

Unnamed: 0,0,NNS,PRP,VBP,WP,PRP$,.,DT,IN,NN,...,VBN,MD,TO,num_of_unique_punctuations,number_of_capital_words,number_of_small_words,number_of_alphabets,number_of_digits,number_of_words,number_of_white_spaces
0,WDT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,4.0,19.0,0.0,5.0,4.0
1,NNS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,3.0,18.0,0.0,4.0,3.0
2,UH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,7.0,28.0,0.0,9.0,7.0
3,PRP$,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,3.0,14.0,0.0,5.0,3.0
4,:,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,2.0,13.0,0.0,3.0,2.0
