In [2]:
import pandas as pd
from string import punctuation
import nltk
nltk.download('tagsets')
from nltk.data import load
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk import word_tokenize
from collections import Counter

[nltk_data] Downloading package tagsets to /home/nicolas/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/nicolas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
def get_tagsets():
    """
    Return the tag names found in NLTK's 'upenn_tagset' help resource.

    The resource is loaded as a mapping keyed by tag name; only the keys
    are returned, as a list, in the mapping's own iteration order.
    """
    upenn_tags = load('help/tagsets/upenn_tagset.pickle')
    return [tag_name for tag_name in upenn_tags.keys()]

# Fetch the list of known POS tag names and print it for inspection.
tag_list = get_tagsets()

print(tag_list)

['LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--', 'VBP', 'NN', 'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(', ')', '.', ',', '``', '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN', 'FW', 'RP', 'JJR', 'JJS', 'PDT', 'MD', 'VB', 'WRB', 'NNP', 'EX', 'NNS', 'SYM', 'CC', 'CD', 'POS']


In [6]:
"""
This method will count the occurence of pos
tags in each sentence.
"""
def get_pos_occurence_freq(data, tag_list, tagger=None):
    """
    Count how often each part-of-speech tag occurs in every text row.

    Parameters
    ----------
    data : object exposing a ``text`` attribute (e.g. a DataFrame column)
        Iterated once; each element is one piece of text to tag.
    tag_list : list
        Tag names that must always be present as columns, in this order.
    tagger : callable, optional
        Maps one text to a sequence of ``(token, tag)`` pairs. When omitted,
        ``pos_tag(word_tokenize(text))`` from NLTK is used.

    Returns
    -------
    pandas.DataFrame
        One row per text (index 0..n-1) and one column per tag. Tags produced
        by the tagger but missing from ``tag_list`` are appended as extra
        columns in order of first appearance. Counts are stored as floats and
        tags absent from a text are 0.
    """
    if tagger is None:
        # Default pipeline: NLTK tokenizer followed by the NLTK POS tagger.
        def _default_tagger(text):
            return pos_tag(word_tokenize(text))
        tagger = _default_tagger

    # Build every row first and create the frame once. Growing a DataFrame
    # row by row is quadratic, and DataFrame.append no longer exists in
    # pandas >= 2.0.
    records = [
        dict(Counter(tag for _, tag in tagger(text_line)))
        for text_line in data.text
    ]

    # Keep the requested columns first, then any unexpected tags the tagger
    # emitted, so no count is silently dropped.
    known_tags = set(tag_list)
    extra_tags = list(dict.fromkeys(
        tag for record in records for tag in record if tag not in known_tags
    ))
    columns = list(tag_list) + extra_tags

    feature_df = pd.DataFrame(records, columns=columns)
    # Missing tags show up as NaN; turn them into zero counts with one dtype.
    return feature_df.fillna(0).astype(float)
# Same call as in the earlier cell; repeated here before building features.
tag_list = get_tagsets()

# Load the dataset; get_pos_occurence_freq reads its `text` column.
data = pd.read_csv('data/data.csv', header=0)
# Build the POS-tag count table, one row per row of `data`.
feature_df = get_pos_occurence_freq(data, tag_list)
feature_df.head()

Unnamed: 0,LS,TO,VBN,'',WP,UH,VBG,JJ,VBZ,--,...,MD,VB,WRB,NNP,EX,NNS,SYM,CC,CD,POS
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
def add_punctuation_count(feature_df, data):
    """
    Add a 'num_of_unique_punctuations' column to ``feature_df``.

    For every value in ``data['text']`` the column holds the number of
    *distinct* characters from ``string.punctuation`` that occur in it
    (repeated characters are counted once).

    Values are converted with ``str()`` first, so a non-string cell such as
    NaN yields a count instead of raising ``TypeError``.

    ``feature_df`` is modified in place and also returned.
    """
    # Build the lookup set once instead of once per row.
    punctuation_chars = frozenset(punctuation)

    feature_df['num_of_unique_punctuations'] = data['text'].apply(
        lambda text: len(punctuation_chars.intersection(str(text)))
    )
    return feature_df

# Add the punctuation feature; the function assigns the new column on the frame it receives.
feature_df = add_punctuation_count(feature_df, data)
feature_df['num_of_unique_punctuations'].head()

0    0
1    0
2    1
3    1
4    0
Name: num_of_unique_punctuations, dtype: int64

In [9]:
def get_capitalized_word_count(feature_df, data):
    """
    Add a 'number_of_capital_words' column to ``feature_df``.

    Each value of ``data['text']`` is converted to a string and tokenized;
    the column stores how many tokens start with an uppercase character.
    Every matching token is counted, including repeats.

    ``feature_df`` is modified in place and also returned.
    """
    def _count_capitalized(text):
        # Number of tokens whose first character is uppercase.
        tokens = word_tokenize(str(text))
        return sum(1 for token in tokens if token[0].isupper())

    feature_df['number_of_capital_words'] = data['text'].apply(_count_capitalized)
    return feature_df

# Add the capitalized-token count column, then preview it.
feature_df = get_capitalized_word_count(feature_df, data)

feature_df['number_of_capital_words'].head()

0    1
1    1
2    1
3    1
4    1
Name: number_of_capital_words, dtype: int64

In [None]:
def get_small_word_count(feature_df, data):
    """
    The bellow code will tokenize text in every row and
    create a set of only small words, then find the length of
    this set of only small words, then find the length of
    this set and add it to the column, 'number_of_small_words'
    of dataframe
    """
    feature_df['number_of_small_words'] = data['text'].apply(lambda x: len([word for word in\
                                                                        word_tokenize(str(x)) if word[0].islower()]))