In [30]:
import pandas as pd
import string
import numpy as np
from collections import Counter

In [16]:
df_apple_tweets=pd.read_csv('Apple-Twitter-Sentiment-DFE.csv')
df_apple_tweets.head(2)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,sentiment,sentiment:confidence,date,id,query,sentiment_gold,text
0,623495513,True,golden,10,,3,0.6264,Mon Dec 01 19:30:03 +0000 2014,5.4e+17,#AAPL OR @Apple,3not_relevant,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,623495514,True,golden,12,,3,0.8129,Mon Dec 01 19:43:51 +0000 2014,5.4e+17,#AAPL OR @Apple,31,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...


In [14]:
df_apple_tweets.shape

(3886, 12)

In [15]:
df_apple_tweets.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'sentiment', 'sentiment:confidence', 'date', 'id',
       'query', 'sentiment_gold', 'text'],
      dtype='object')

In [17]:
df_apple_tweets['text'].values

array(['#AAPL:The 10 best Steve Jobs emails ever...http://t.co/82G1kL94tx',
       'RT @JPDesloges: Why AAPL Stock Had a Mini-Flash Crash Today $AAPL #aapl http://t.co/hGFcjYa0E9',
       'My cat only chews @apple cords. Such an #AppleSnob.', ...,
       '@marcbulandr I could not agree more. Between @Apple @Twitter and @IBMWatson only great things will happen. #AppleandIBM #IBMandTwitter',
       "My iPhone 5's photos are no longer downloading automatically to my laptop when I sync it. @apple support is unhelpful. Any ideas?",
       "RT @SwiftKey: We're so excited to be named to @Apple's 'App Store Best of 2014' list this year! http://t.co/d7qlmti4Uf #Apple"],
      dtype=object)

In [19]:
def extract_words(text):
    stopwords = set([word.lower().strip() for word in open("nltk_stopwords.txt", "rt").readlines()])
    temp = text.split() # Split the text on whitespace
    text_words = []

    punctuation = set(string.punctuation)
    
    #Keep #tags and @mentions
    punctuation.remove("#")
    punctuation.remove("@")
    
    for word in temp:
        # Remove any punctuation characters present in the beginning of the word
        while len(word) > 0 and word[0] in punctuation:
            word = word[1:]

        # Remove any punctuation characters present in the end of the word
        while len(word) > 0 and word[-1] in punctuation:
            word = word[:-1]

        # Simple rule to eliminate (most) URLs
        if len(word) > 0 and "/" not in word:
            # If it's not a stopword
            if word.lower() not in stopwords:
                # Append this word into our list of words.
                text_words.append(word.lower())
        
    return text_words

In [42]:
tweets =[extract_words(i) for i in df_apple_tweets[0:50]['text'].values]
tweets[0:2]

[['#aapl:the', '10', 'best', 'steve', 'jobs', 'emails'],
 ['rt',
  '@jpdesloges',
  'aapl',
  'stock',
  'mini-flash',
  'crash',
  'today',
  'aapl',
  '#aapl']]

In [43]:
def inv_doc_freq(corpus_words):
    number_docs = len(corpus_words)
    
    document_count = {}

    for document in corpus_words:
        word_set = set(document)

        for word in word_set:
            document_count[word] = document_count.get(word, 0) + 1
    
    IDF = {}
    
    for word in document_count:
        IDF[word] = np.log(number_docs/document_count[word])
        
    
    return IDF

def tf_idf(corpus_words):
    IDF = inv_doc_freq(corpus_words)
    
    TFIDF = []
    
    for document in corpus_words:
        TFIDF.append(Counter(document))
    
    for document in TFIDF:
        for word in document:
            document[word] = document[word]*IDF[word]
            
    return TFIDF

In [44]:
TFIDF = tf_idf(tweets)

In [45]:
TFIDF[0:3]

[Counter({'#aapl:the': 3.912023005428146,
          '10': 3.912023005428146,
          'best': 3.2188758248682006,
          'steve': 3.912023005428146,
          'jobs': 3.912023005428146,
          'emails': 3.912023005428146}),
 Counter({'rt': 1.2039728043259361,
          '@jpdesloges': 3.912023005428146,
          'aapl': 6.437751649736401,
          'stock': 2.8134107167600364,
          'mini-flash': 3.2188758248682006,
          'crash': 2.302585092994046,
          'today': 2.5257286443082556,
          '#aapl': 1.6094379124341003}),
 Counter({'cat': 3.912023005428146,
          'chews': 3.912023005428146,
          '@apple': 0.32850406697203605,
          'cords': 3.912023005428146,
          '#applesnob': 3.912023005428146})]

In [46]:
def build_vocabulary(TFIDF):
    words = set()
    
    for document in TFIDF:
        words |= set(document.keys())
    
    word_list = list(words)
    word_dict = dict(zip(word_list, range(len(word_list))))
    
    return word_dict, word_list

In [47]:
word_dict, word_list = build_vocabulary(TFIDF)

In [48]:
vocabulary_size = len(word_dict)
print("We have", vocabulary_size, "words in our vocabulary")

We have 287 words in our vocabulary


In [49]:
word_list[:10]

['network',
 'ios8',
 'ever',
 'store',
 'bought',
 'law',
 '8',
 'tech',
 '#aapl:this',
 'seizure']

In [50]:
def term_document_matrix(TFIDF, word_list, word_dict):
    vocabulary_size = len(word_dict)
    number_documents = len(TFIDF)
    
    TDM = np.zeros((vocabulary_size, number_documents))
    
    for doc in range(number_documents):
        document = TFIDF[doc]
        
        for word in document.keys():
            pos = word_dict[word]
            
            TDM[pos, doc] = document[word]
            
    return TDM

In [51]:
TDM = term_document_matrix(TFIDF, word_list, word_dict)
print("Our dataset has:\n%u unique words\n%u documents"%(TDM.shape))

Our dataset has:
287 unique words
50 documents
