In this exercise, we will extract various general features from documents. The dataset used here consists of random statements. Our objective is to compute, for each statement, the frequency of general features such as part-of-speech tags, punctuation marks, capitalized and lowercase words, letters, digits, words, and whitespace characters.

# Extracting General Features from Text

In [1]:
import pandas as pd
from string import punctuation
import nltk

nltk.download('tagsets')
nltk.download('punkt')
from nltk.data import load

nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk import word_tokenize
from collections import Counter

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
def get_tagsets():
    """Return the tag names stored in NLTK's 'upenn_tagset' help resource.

    Returns:
        A list holding the keys of the mapping loaded from
        'help/tagsets/upenn_tagset.pickle'.
    """
    upenn_tags = load('help/tagsets/upenn_tagset.pickle')
    return [tag for tag in upenn_tags.keys()]
 
# Fetch the tag names once and print them for inspection.
tag_list = get_tagsets()
 
print(tag_list)

['LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--', 'VBP', 'NN', 'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(', ')', '.', ',', '``', '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN', 'FW', 'RP', 'JJR', 'JJS', 'PDT', 'MD', 'VB', 'WRB', 'NNP', 'EX', 'NNS', 'SYM', 'CC', 'CD', 'POS']


In [4]:
# Count how often each POS tag occurs in every sentence.
def get_pos_occurrence_freq(data, tag_list):
    """Build a per-sentence table of part-of-speech tag counts.

    Args:
        data: DataFrame with a ``text`` column holding one sentence per row.
        tag_list: Tag names that must appear as columns, in this order.

    Returns:
        DataFrame with one row per sentence (input order, indexed from 0)
        and one float column per tag. Tags emitted by the tagger that are
        not in ``tag_list`` become extra columns after the requested ones,
        in order of first appearance. Tags absent from a sentence are 0.
    """
    # One {tag: count} dict per sentence.
    rows = [
        dict(Counter(tag for _, tag in pos_tag(word_tokenize(text_line))))
        for text_line in data['text']
    ]

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0,
    # and appending inside the loop copied the whole frame on every row.
    feature_df = pd.DataFrame(rows)

    # Requested tags first, then any tag the tagger produced beyond them.
    requested = set(tag_list)
    extra_tags = [col for col in feature_df.columns if col not in requested]
    feature_df = feature_df.reindex(columns=list(tag_list) + extra_tags)

    # Missing tags become 0; a uniform float dtype keeps every count column
    # the same type regardless of which tags happened to be missing.
    return feature_df.fillna(0).astype(float)

# Columns of the POS feature table.
tag_list = get_tagsets()

# Read the statements (first CSV row is the header) and count POS tags
# per statement; the functions here read the `text` column of `data`.
data = pd.read_csv('data.csv', header=0)
feature_df = get_pos_occurrence_freq(data, tag_list)
feature_df.head()

Unnamed: 0,LS,TO,VBN,'',WP,UH,VBG,JJ,VBZ,--,...,MD,VB,WRB,NNP,EX,NNS,SYM,CC,CD,POS
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
def add_punctuation_count(feature_df, data):
    """Add the number of distinct punctuation characters found in each text.

    A character counts as punctuation when it is in ``string.punctuation``.
    Repeated occurrences of the same character are counted once.

    Args:
        feature_df: DataFrame that receives the new column; modified in place.
        data: DataFrame with a ``text`` column.

    Returns:
        ``feature_df`` with the extra column ``num_of_unique_punctuations``.
    """
    # Build the reference set once instead of once per row.
    punctuation_chars = set(punctuation)

    # str() keeps non-string cells (e.g. NaN from an empty CSV field) from
    # raising TypeError, matching how the other feature helpers in this file
    # treat their input.
    feature_df['num_of_unique_punctuations'] = data['text'].apply(
        lambda x: len(punctuation_chars.intersection(str(x))))
    return feature_df
 
# Add the unique-punctuation column and preview its first rows.
feature_df = add_punctuation_count(feature_df, data)
 
feature_df['num_of_unique_punctuations'].head()

0    0
1    0
2    1
3    1
4    0
Name: num_of_unique_punctuations, dtype: int64

In [6]:
def get_capitalized_word_count(feature_df, data):
    """Add the number of tokens in each text that start with an uppercase letter.

    Every occurrence is counted (duplicates included); only the first
    character of each token is inspected.

    Args:
        feature_df: DataFrame that receives the new column; modified in place.
        data: DataFrame with a ``text`` column.

    Returns:
        ``feature_df`` with the extra column ``number_of_capital_words``.
    """
    def count_capitalized(text):
        tokens = word_tokenize(str(text))
        return sum(1 for token in tokens if token[0].isupper())

    feature_df['number_of_capital_words'] = data['text'].apply(count_capitalized)
    return feature_df
 
# Add the capitalized-word column and preview its first rows.
feature_df = get_capitalized_word_count(feature_df, data)
 
feature_df['number_of_capital_words'].head()

0    1
1    1
2    1
3    1
4    1
Name: number_of_capital_words, dtype: int64

In [7]:
def get_small_word_count(feature_df, data):
    """Add the number of tokens in each text that start with a lowercase letter.

    Every occurrence is counted (duplicates included); only the first
    character of each token is inspected.

    Args:
        feature_df: DataFrame that receives the new column; modified in place.
        data: DataFrame with a ``text`` column.

    Returns:
        ``feature_df`` with the extra column ``number_of_small_words``.
    """
    def count_lowercase_initial(text):
        tokens = word_tokenize(str(text))
        return sum(1 for token in tokens if token[0].islower())

    feature_df['number_of_small_words'] = data['text'].apply(count_lowercase_initial)
    return feature_df

In [8]:
# Add the lowercase-word column and preview its first rows.
feature_df = get_small_word_count(feature_df, data)
feature_df['number_of_small_words'].head()

0    4
1    3
2    7
3    3
4    2
Name: number_of_small_words, dtype: int64

In [9]:
def get_number_of_alphabets(feature_df, data):
    """Add the number of alphabetic characters in each text.

    Each cell is converted with ``str()`` first, and a character counts when
    ``str.isalpha`` is true for it.

    Args:
        feature_df: DataFrame that receives the new column; modified in place.
        data: DataFrame with a ``text`` column.

    Returns:
        ``feature_df`` with the extra column ``number_of_alphabets``.
    """
    def count_letters(text):
        return sum(1 for symbol in str(text) if symbol.isalpha())

    letter_counts = data['text'].apply(count_letters)
    feature_df['number_of_alphabets'] = letter_counts
    return feature_df
# Add the letter-count column and preview its first rows.
feature_df = get_number_of_alphabets(feature_df, data)
feature_df['number_of_alphabets'].head()


0    19
1    18
2    28
3    14
4    13
Name: number_of_alphabets, dtype: int64

In [10]:
def get_number_of_digit_count(feature_df, data):
    """Add the number of digit characters in each text.

    Each cell is converted with ``str()`` first, and a character counts when
    ``str.isdigit`` is true for it.

    Args:
        feature_df: DataFrame that receives the new column; modified in place.
        data: DataFrame with a ``text`` column.

    Returns:
        ``feature_df`` with the extra column ``number_of_digits``.
    """
    def count_digits(text):
        return sum(1 for symbol in str(text) if symbol.isdigit())

    digit_counts = data['text'].apply(count_digits)
    feature_df['number_of_digits'] = digit_counts
    return feature_df
# Add the digit-count column and preview its first rows.
feature_df = get_number_of_digit_count(feature_df, data)
feature_df['number_of_digits'].head()

0    0
1    0
2    0
3    0
4    0
Name: number_of_digits, dtype: int64

In [11]:
def get_number_of_words(feature_df, data):
    """Add the number of tokens ``word_tokenize`` yields for each text.

    Args:
        feature_df: DataFrame that receives the new column; modified in place.
        data: DataFrame with a ``text`` column.

    Returns:
        ``feature_df`` with the extra column ``number_of_words``.
    """
    def count_tokens(text):
        return len(word_tokenize(str(text)))

    token_counts = data['text'].apply(count_tokens)
    feature_df['number_of_words'] = token_counts
    return feature_df

# Add the token-count column and preview its first rows.
feature_df = get_number_of_words(feature_df, data)
feature_df['number_of_words'].head()

0    5
1    4
2    9
3    5
4    3
Name: number_of_words, dtype: int64

In [12]:
def get_number_of_whitespaces(feature_df, data):
    """Add the number of whitespace characters in each text.

    Each cell is converted with ``str()`` first, and a character counts when
    ``str.isspace`` is true for it (spaces, tabs, newlines, ...).

    Args:
        feature_df: DataFrame that receives the new column; modified in place.
        data: DataFrame with a ``text`` column.

    Returns:
        ``feature_df`` with the extra column ``number_of_white_spaces``.
    """
    def count_whitespace(text):
        return sum(1 for symbol in str(text) if symbol.isspace())

    whitespace_counts = data['text'].apply(count_whitespace)
    feature_df['number_of_white_spaces'] = whitespace_counts
    return feature_df
 
# Add the whitespace-count column and preview its first rows.
feature_df = get_number_of_whitespaces(feature_df, data)
feature_df['number_of_white_spaces'].head()

0    4
1    3
2    7
3    3
4    2
Name: number_of_white_spaces, dtype: int64

In [13]:
# Preview the feature table with all added columns.
feature_df.head()

Unnamed: 0,LS,TO,VBN,'',WP,UH,VBG,JJ,VBZ,--,...,CC,CD,POS,num_of_unique_punctuations,number_of_capital_words,number_of_small_words,number_of_alphabets,number_of_digits,number_of_words,number_of_white_spaces
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,1,4,19,0,5,4
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,1,3,18,0,4,3
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1,1,7,28,0,9,7
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1,1,3,14,0,5,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0,1,2,13,0,3,2
