In [1]:
#Standard
import time
import numpy as np
import pandas as pd

# file manipulation
import os
import json

#text manipulation
import nltk
import re as regex

# Tokenize and Lemmatize text
from nltk.tokenize import TreebankWordTokenizer, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from nltk.corpus import wordnet as wn
from nltk import pos_tag

In [2]:
## Helper Functions
# Load dictionary with contractions
with open("data/contractions.json") as f:
    dic = json.load(f)
    
# Replace contractions with full words
def replace_contraction(text, dic = dic):
    for i, j in dic.items():
        text = text.replace(i, j)
    return text

def remove_urls(tweets):
    regexp = regex.compile(r"http.?://[^\s]+[\s]?")
    replace_by = ' '
    tweets.loc[:, "text"].replace(regexp, replace_by, inplace=True)
    return tweets

def remove_user(tweets):
    regexp = regex.compile(r'@[^\s]+[\s]?')
    replace_by = ' '
    tweets.loc[:, "text"].replace(regexp, replace_by, inplace=True)
    return tweets

def remove_special(tweets):
    regexp = regex.compile(u'[^A-Za-z0-9]+')
    replace_by = ' '
    tweets.loc[:, "text"].replace(regexp, replace_by, inplace=True)
    return tweets

def remove_numbers(tweets):
    regexp = regex.compile(r'\s?[0-9]+\.?[0-9]*')
    replace_by = ''
    tweets.loc[:, "text"].replace(regexp, replace_by, inplace=True)
    return tweets

def remove_spaces(tweets):
    regexp = regex.compile(r'\s\s+')
    replace_by = ' '
    tweets.loc[:, "text"].replace(regexp, replace_by, inplace=True)
    #Remove spaces at the beginning and end
    tweets['text'] = tweets['text'].str.strip()
    return tweets


# helper Functions to Convert Penn PoS tag to WordNet PoS tag standard
def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']
def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']
def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

# Convert Penn to WordNet
def penn_to_wn(tag):
    if is_adjective(tag):
        return wn.ADJ
    elif is_noun(tag):
        return wn.NOUN
    elif is_adverb(tag):
        return wn.ADV
    elif is_verb(tag):
        return wn.VERB
    return None

# Tokenize and Lemmatize text
def lemmatize(txt, wnl = WordNetLemmatizer(), tkn = TreebankWordTokenizer):
    '''    
     1.) We first tokenize and tag part of speech (PoS)
     2.) Lemmatize PoS:  adjectives, nouns, verbs, adverbs. 
     3.) Drop PoS from further analysis: connections, articles, prepositions, etc.
    '''
    lemmaList = [wnl.lemmatize(i,penn_to_wn(j)) for i,j in pos_tag(tkn().tokenize(txt)) if penn_to_wn(j) in ['a','n','v','r']]
    return lemmaList

def stemming(txt, ps = PorterStemmer(), tkn = TreebankWordTokenizer):
    '''    
     1.) We first tokenize sentense
     2.) apply porter Stemmer for each word and return list
    '''
    stemList = [ps.stem(word) for word in tkn().tokenize(txt)]
    return stemList

In [3]:
%%time
## Load tweets from JSON
# Get path to twitter folder
DataPath = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataRaw\\AAPL'
#DataPath = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataRaw\\MSFT'
#DataPath = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataRaw\\NFLX'
#DataPath = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataRaw\\TSLA'
#DataPath = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataRaw\\GOOG'

# Get files inside the folder
DataList = os.listdir(DataPath)

TweetsData = []
# Load Twitter data
for item in DataList:
    TweetsPath = DataPath + '\\' + item

    with open(TweetsPath, "r", errors='ignore') as TweetsFile:
        for line in TweetsFile:
            try:
                Tweet = json.loads(line)
                TweetsData.append(Tweet)
            except:
                continue

KeyboardInterrupt: 

In [4]:
# remove corrupted observation in case of GOOGLE
TweetsData[219426] = {}

In [None]:
%%time
## Load Data frame of tweets
# Convert loaded tweets to dataframe with selected columns
tweets = pd.DataFrame(TweetsData)
tweets = tweets[['id','created_at', 'text',  'lang', 'retweeted_status', 'quoted_status']]

# Convert date to datetime
tweets['created_at'] = pd.to_datetime(tweets['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')

# drop corupted observations
tweets = tweets.dropna(subset=['id', 'text', 'created_at', 'lang'], how='all')

# remove non-english tweets, retweets, quotes 
tweets = tweets.loc[tweets['retweeted_status'].isnull()]
tweets = tweets.loc[tweets['quoted_status'].isnull()]
tweets = tweets.loc[tweets['lang'] == 'en']

# additional features
tweets['F_exclamation'] = tweets['text'].str.count("!")
tweets['F_question'] = tweets['text'].str.count("\?")
tweets['F_ellipsis'] = tweets['text'].str.count(r"\.\s?\.\s?\.")
tweets['F_hashtags'] = tweets['text'].str.count("#")
tweets['F_cashtags'] = tweets['text'].str.count("\$")
tweets['F_usermention'] = tweets['text'].str.count("@")
tweets['F_urls'] = tweets['text'].str.count(r"http.?://[^\s]+[\s]?")

# Drop useless columns
droplist = ["lang","retweeted_status", "quoted_status"]
tweets = tweets.drop(droplist, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Wall time: 6.13 s


In [None]:
%%time
## Preprocess text
tweetsClean = tweets.copy()

#1.) Lowercase all characters
tweetsClean['text'] = tweetsClean['text'].str.lower()

#2.) Replace contractions
tweetsClean['text'] = tweetsClean['text'].apply(replace_contraction)

tweetsClean = remove_urls(tweetsClean) #3.) Remove URL
tweetsClean = remove_user(tweetsClean) #4.) Remove @USER
tweetsClean = remove_special(tweetsClean) #5.) Remove special characters
tweetsClean = remove_numbers(tweetsClean) #6.) Remove numbers
tweetsClean = remove_spaces(tweetsClean) #7.) Remove redundant spaces

#8.) get lemmas from tweets
tweetsClean['lemmas'] = tweetsClean['text'].apply(lemmatize)

#9.) get lemmas from tweets
tweetsClean['stems'] = tweetsClean['text'].apply(stemming)

#10.) get tokens from tweets
tokenizer = TreebankWordTokenizer()
tweetsClean['tokens'] = tweetsClean['text'].apply(tokenizer.tokenize)


In [32]:
tweetsClean

Unnamed: 0,id,created_at,text,F_exclamation,F_question,F_ellipsis,F_hashtags,F_cashtags,F_usermention,F_urls
0,6.824537e+17,2015-12-31 06:50:24,lt nice profit chart many millions in profit i...,1,0,0,0,4,0,1
1,6.824537e+17,2015-12-31 06:50:27,hot stocks buzz apple nasdaq aapl sandridge en...,0,0,1,1,0,0,2
3,6.824543e+17,2015-12-31 06:52:35,th jul the protean aapl kalhjvw,0,0,0,0,0,0,0
4,6.824550e+17,2015-12-31 06:55:21,faraday future needs to bring something specta...,0,0,0,0,0,1,2
5,6.824553e+17,2015-12-31 06:56:28,apple aapl stock predictions a screaming buy v...,0,0,1,0,0,0,1
6,6.824559e+17,2015-12-31 06:59:01,apple inc app store was just dominated by a ma...,0,0,0,1,1,0,1
8,6.824563e+17,2015-12-31 07:00:31,we are starting to get a clear idea about what...,0,0,0,2,0,0,1
9,6.824564e+17,2015-12-31 07:01:06,investors should not get distracted by apple a...,0,0,2,2,0,0,1
11,6.824566e+17,2015-12-31 07:01:45,apple inc app store was just dominated by a ma...,0,0,0,1,1,0,2
13,6.824567e+17,2015-12-31 07:02:08,aryc percent change updated thursday december ...,0,0,0,0,5,0,1


In [28]:
# Export to CSV
tweetsClean.to_csv('tweetsAAPL.csv')