In [1]:
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import wordcloud
%matplotlib inline

In [3]:
# Read the tweets / log-returns table; the CSV's first column becomes the row index.
df = pd.read_csv(
    filepath_or_buffer='../data/tweets&logreturns.csv',
    index_col=0,
)
# Preview the first five rows.
df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,created_date,created_time,passedClosing,Date,^GSPC,VGT
0,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[],[],2020-09-29,22:51:43,1,2020-09-30,0.00822,0.007412
1,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[],[],2020-09-29,22:37:44,1,2020-09-30,0.00822,0.007412
2,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,['#MakeAmericaGreatAgain'],[],2020-09-29,22:30:39,1,2020-09-30,0.00822,0.007412
3,Twitter for iPhone,“EPA: One Trillion Trees Initiative will build...,2020-09-29 22:26:18,10408,36582,False,1311130217348345856,[],[],2020-09-29,22:26:18,1,2020-09-30,0.00822,0.007412
4,Twitter for iPhone,Joe Biden could not name a single Law Enforcem...,2020-09-29 22:15:30,4988,20767,False,1311127502954196993,[],[],2020-09-29,22:15:30,1,2020-09-30,0.00822,0.007412


## Extract meta information from text data

The purpose of feature engineering is to extract more information from the text data and use the extracted information as features. Such text/NLP based features could include:
<br>
* Word Count – total number of words in the documents
* Character Count – total number of characters in the documents
* Average Word Density – average length of the words used in the documents
* Punctuation Frequency – total number of punctuation marks / total number of words in the documents
* Upper Case Frequency – total number of upper case words / total number of words in the documents
<br>

Since the length of a review varies, frequencies are used instead of absolute counts for the Punctuation and Upper Case metrics.

Since there are two textual inputs, `review_headline` and `review_body`, there will be two sets of metrics generated, one for each textual input.

In [20]:
# define a function to generate the new features
def add_text_features(df, col, suffix):
    """Return a copy of ``df`` with text meta-features derived from ``df[col]``.

    Five columns are added, each named ``<feature><suffix>``:

    * ``word_count``       - number of whitespace-separated tokens
    * ``char_count``       - number of characters (whitespace included)
    * ``word_density``     - ``char_count / word_count``
    * ``punctuation_freq`` - characters found in ``string.punctuation``,
      divided by ``word_count``
    * ``upper_case_freq``  - tokens for which ``str.isupper()`` is true,
      divided by ``word_count``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is left unmodified.
    col : str
        Name of the text column to analyse.
    suffix : str
        Appended to every new column name, so several text columns can be
        processed without their feature columns colliding.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the five feature columns added.

    Notes
    -----
    Missing values in ``df[col]`` are treated as empty strings. Rows that
    contain no words get ``NaN`` for the three ratio features instead of the
    ``inf`` a division by zero would produce.
    """
    # Work on a copy so re-running the calling cell is idempotent and the
    # caller's original frame is never altered.
    df_new = df.copy()

    # Normalise the input once: missing text -> '', everything else -> str.
    text = df_new[col].fillna('').astype(str)
    punctuation = frozenset(string.punctuation)

    word_count = text.str.split().str.len()
    char_count = text.str.len()
    # Denominator for the ratio features: NaN where a row has no words.
    words = word_count.where(word_count > 0)

    df_new['word_count' + suffix] = word_count
    df_new['char_count' + suffix] = char_count
    df_new['word_density' + suffix] = char_count / words
    df_new['punctuation_freq' + suffix] = (
        text.apply(lambda s: sum(ch in punctuation for ch in s)) / words
    )
    df_new['upper_case_freq' + suffix] = (
        text.apply(lambda s: sum(w.isupper() for w in s.split())) / words
    )
    return df_new

In [21]:
# Derive the headline features first, then the body features, for the training
# frame and afterwards for the prediction frame.
train_df = (
    train_df
    .pipe(add_text_features, 'review_headline', '_headline')
    .pipe(add_text_features, 'review_body', '_body')
)
pred_df = (
    pred_df
    .pipe(add_text_features, 'review_headline', '_headline')
    .pipe(add_text_features, 'review_body', '_body')
)

In [22]:
# Show the first two rows to check that the new feature columns are present.
train_df.head(n=2)

Unnamed: 0,review_no,label,product_id,product_title,star_rating,helpful_votes,total_votes,review_headline,review_body,review_date,...,word_count_headline,char_count_headline,word_density_headline,punctuation_freq_headline,upper_case_freq_headline,word_count_body,char_count_body,word_density_body,punctuation_freq_body,upper_case_freq_body
0,24406,1,B00N3D7N6K,The Spells,5,0,0,Quite A Ride and Adventure!,I am not exactly sure how to start this review...,2014-10-20,...,5,27,5.4,0.2,0.2,212,1118,5.273585,0.15566,0.042453
1,44262,0,B00PW3B4LO,Mpow Knight Pro Wireless Bluetooth 4.0 Headset...,5,3,3,Smarter and Clearer Than My Smartphone,"Aside from its good performance, this Bluetoot...",2015-08-04,...,6,38,6.333333,0.0,0.0,241,1428,5.925311,0.145228,0.041494


In [None]:
# Distribution of the review-body word count, one of the engineered features.
# The earlier placeholder passed an empty string instead of data; `distplot` is
# also deprecated, so `histplot` (seaborn >= 0.11) is used here.
# NOTE(review): `word_count_body` is a guess at the intended feature - swap in
# whichever engineered column should be inspected.
fig, ax = plt.subplots()
sns.histplot(data=train_df, x='word_count_body', kde=True, ax=ax)
ax.set(
    title='Distribution of review body word count',
    xlabel='Words per review body',
    ylabel='Number of reviews',
);