# Activity 1 : Extracting General Features from texts

Extract the following features from the documents in the text file ‘data.csv’: <br>
i) number of occurrences of each part of speech, <br>
ii) number of punctuation marks, <br> 
iii) number of capital and small letter words, <br>
iv) number of alphabetic characters, <br>
v) number of digits, <br>
vi) number of words, <br>
vii) number of white spaces for each sentence

(Note: Each line is to be treated as a separate document and words starting with uppercase characters are called capital words)

In [8]:
import pandas as pd
from string import punctuation
import nltk
nltk.download('tagsets')
from nltk.data import load
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk import word_tokenize
from collections import Counter

[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/sohom.ghosh/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sohom.ghosh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [6]:
# Load the Penn Treebank tag set shipped with NLTK; its keys are the POS tag names
# used as feature columns further down.
tagdict = load('help/tagsets/upenn_tagset.pickle')
# Iterating the mapping yields its keys, i.e. the tag names.
list(tagdict)

['LS',
 'TO',
 'VBN',
 "''",
 'WP',
 'UH',
 'VBG',
 'JJ',
 'VBZ',
 '--',
 'VBP',
 'NN',
 'DT',
 'PRP',
 ':',
 'WP$',
 'NNPS',
 'PRP$',
 'WDT',
 '(',
 ')',
 '.',
 ',',
 '``',
 '$',
 'RB',
 'RBR',
 'RBS',
 'VBD',
 'IN',
 'FW',
 'RP',
 'JJR',
 'JJS',
 'PDT',
 'MD',
 'VB',
 'WRB',
 'NNP',
 'EX',
 'NNS',
 'SYM',
 'CC',
 'CD',
 'POS']

In [9]:
#i) number of occurrences of each part of speech
# Each row of data['text'] is treated as one document; the result is one row per
# document and one column per Penn Treebank tag.
data = pd.read_csv('data_ch2/data.csv', header = 0)

# Fix the tag order once so every document contributes its counts in the same order.
pos_tags = list(tagdict.keys())
pos_di = {pos: [] for pos in pos_tags}

for doc in data['text']:
    # str() keeps a non-string cell (e.g. NaN from an empty row) from crashing the
    # tokenizer, matching how the later feature cells coerce their input.
    di = Counter(tag for _, tag in pos_tag(word_tokenize(str(doc))))
    for pos in pos_tags:
        # Counter returns 0 for tags that do not occur in this document.
        pos_di[pos].append(di[pos])

feature_df = pd.DataFrame(pos_di)

In [10]:
# Preview the per-document POS-tag counts for the first few documents.
feature_df.head()

Unnamed: 0,LS,TO,VBN,'',WP,UH,VBG,JJ,VBZ,--,...,MD,VB,WRB,NNP,EX,NNS,SYM,CC,CD,POS
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#ii) number of punctuation marks
# Counts DISTINCT punctuation characters per document (e.g. "a!!b?" -> 2),
# not the total number of occurrences.
punctuation_chars = frozenset(punctuation)  # built once instead of once per row
# str() keeps a non-string cell (e.g. NaN from an empty row) from raising TypeError,
# matching how the other feature cells coerce their input.
feature_df['num_of_unique_punctuations'] = data['text']\
.apply(lambda x : len(punctuation_chars.intersection(str(x))))

In [12]:
# Preview the distinct-punctuation count for the first few documents.
feature_df['num_of_unique_punctuations'].head()

0    0
1    0
2    1
3    1
4    0
Name: num_of_unique_punctuations, dtype: int64

In [13]:
#iii) number of capital and small letter words
def _count_capital_words(text):
    """Return how many tokens of `text` begin with an uppercase character."""
    tokens = word_tokenize(str(text))
    return sum(1 for token in tokens if token[0].isupper())

feature_df['number_of_capital_words'] = data['text'].apply(_count_capital_words)
feature_df['number_of_capital_words'].head()

0    1
1    1
2    1
3    1
4    1
Name: number_of_capital_words, dtype: int64

In [14]:
def _count_small_words(text):
    """Return how many tokens of `text` begin with a lowercase character."""
    tokens = word_tokenize(str(text))
    return sum(1 for token in tokens if token[0].islower())

feature_df['number_of_small_words'] = data['text'].apply(_count_small_words)
feature_df['number_of_small_words'].head()

0    4
1    3
2    7
3    3
4    2
Name: number_of_small_words, dtype: int64

In [15]:
#iv) number of alphabets
# Count the characters of each document for which str.isalpha() is true.
feature_df['number_of_alphabets'] = data['text'].apply(
    lambda text: sum(1 for char in str(text) if char.isalpha())
)
feature_df['number_of_alphabets'].head()

0    19
1    18
2    28
3    14
4    13
Name: number_of_alphabets, dtype: int64

In [16]:
#v) number of digits
# Count the characters of each document for which str.isdigit() is true.
feature_df['number_of_digits'] = data['text'].apply(
    lambda text: sum(1 for char in str(text) if char.isdigit())
)
feature_df['number_of_digits'].head()

0    0
1    0
2    0
3    0
4    0
Name: number_of_digits, dtype: int64

In [17]:
#vi) number of words
def _count_tokens(text):
    """Return the number of tokens word_tokenize produces for `text`."""
    # NOTE: punctuation marks come back as separate tokens, so they are included here.
    tokens = word_tokenize(str(text))
    return len(tokens)

feature_df['number_of_words'] = data['text'].apply(_count_tokens)
feature_df['number_of_words'].head()

0    5
1    4
2    9
3    5
4    3
Name: number_of_words, dtype: int64

In [18]:
#vii) number of white spaces for each sentence
# Only the plain space character ' ' is counted; tabs and newlines are not.
feature_df['number_of_white_spaces'] = data['text'].apply(
    lambda text: str(text).count(' ')
)
feature_df['number_of_white_spaces'].head()

0    4
1    3
2    7
3    3
4    2
Name: number_of_white_spaces, dtype: int64

In [19]:
# Preview the full feature table: POS-tag counts plus the text-statistics columns.
feature_df.head()

Unnamed: 0,LS,TO,VBN,'',WP,UH,VBG,JJ,VBZ,--,...,CC,CD,POS,num_of_unique_punctuations,number_of_capital_words,number_of_small_words,number_of_alphabets,number_of_digits,number_of_words,number_of_white_spaces
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,4,19,0,5,4
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,3,18,0,4,3
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,7,28,0,9,7
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,1,1,3,14,0,5,3
4,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,1,2,13,0,3,2
