# Bag of Words

In [1]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
np.random.seed(0)

In [2]:
# Download NLTK's 'punkt' tokenizer data into the local nltk_data directory.
# NOTE(review): presumably required by word_tokenize used below — confirm;
# this import also sits outside the main import cell.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [5]:
# Load the tweet emotion CSV, using its first column as the DataFrame index.
df = pd.read_csv('data/clean/clean_tweet_emotion.csv', index_col=0)

In [6]:
# Preview the loaded frame; its columns include text, product and emotion.
df

Unnamed: 0,text,product,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,-1
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,-1
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,1
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,1
9089,"Wave, buzz... RT @mention We interrupt your re...",,0
9090,"Google's Zeiger, a physician never reported po...",,0
9091,Some Verizon iPhone customers complained their...,,0


In [10]:
# Pick the 'text' entry at key 0 as a single sample tweet to experiment on.
test_tweet = df['text'][0]

In [12]:
# Show the raw sample tweet before any processing.
test_tweet

'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

In [11]:
# Split the sample tweet into tokens with NLTK's word_tokenize.
# As the output shows, punctuation marks become their own tokens, and '@' / '#'
# are separated from the handle / hashtag text that follows them.
tokenized_tweet = word_tokenize(test_tweet)
tokenized_tweet

['.',
 '@',
 'wesley83',
 'I',
 'have',
 'a',
 '3G',
 'iPhone',
 '.',
 'After',
 '3',
 'hrs',
 'tweeting',
 'at',
 '#',
 'RISE_Austin',
 ',',
 'it',
 'was',
 'dead',
 '!',
 'I',
 'need',
 'to',
 'upgrade',
 '.',
 'Plugin',
 'stations',
 'at',
 '#',
 'SXSW',
 '.']

In [14]:
# need to remove the punctuation marks
def clean_tweet(tweet):
    """Lower-case each token and strip basic punctuation from it.

    Every comma, period, question mark, exclamation mark, apostrophe and
    newline is deleted from each token. A token made up only of those
    characters is kept as an empty string, so the result always has the
    same length as the input.

    Args:
        tweet: iterable of string tokens.

    Returns:
        A new list holding the cleaned, lower-cased tokens in input order.
    """
    def _scrub(token):
        # Strip one symbol at a time, lower-casing after each removal.
        for mark in ",.?!''\n":
            token = token.replace(mark, '').lower()
        return token

    return [_scrub(token) for token in tweet]

# Try the cleaner on the sample tokens; punctuation-only tokens come back as '' (empty strings).
clean_tweet(tokenized_tweet)

['',
 '@',
 'wesley83',
 'i',
 'have',
 'a',
 '3g',
 'iphone',
 '',
 'after',
 '3',
 'hrs',
 'tweeting',
 'at',
 '#',
 'rise_austin',
 '',
 'it',
 'was',
 'dead',
 '',
 'i',
 'need',
 'to',
 'upgrade',
 '',
 'plugin',
 'stations',
 'at',
 '#',
 'sxsw',
 '']

In [15]:
clean_tokenized_tweet = clean_tweet(tokenized_tweet)
# list.remove('') deletes only the first matching element, so the later ''
# tokens are still present in the output below.
clean_tokenized_tweet.remove('')
clean_tokenized_tweet

['@',
 'wesley83',
 'i',
 'have',
 'a',
 '3g',
 'iphone',
 '',
 'after',
 '3',
 'hrs',
 'tweeting',
 'at',
 '#',
 'rise_austin',
 '',
 'it',
 'was',
 'dead',
 '',
 'i',
 'need',
 'to',
 'upgrade',
 '',
 'plugin',
 'stations',
 'at',
 '#',
 'sxsw',
 '']

In [16]:
# Drop every empty-string token left behind by punctuation stripping.
# Build a new list instead of calling .remove() inside a loop over the same
# list: removing while iterating shifts the remaining items, so the loop can
# skip elements (e.g. two adjacent '' tokens would leave one behind).
clean_tokenized_tweet = [item for item in clean_tokenized_tweet if item != '']

clean_tokenized_tweet

['@',
 'wesley83',
 'i',
 'have',
 'a',
 '3g',
 'iphone',
 'after',
 '3',
 'hrs',
 'tweeting',
 'at',
 '#',
 'rise_austin',
 'it',
 'was',
 'dead',
 'i',
 'need',
 'to',
 'upgrade',
 'plugin',
 'stations',
 'at',
 '#',
 'sxsw']

In [17]:
def count_vectorize(tweet, vocab=None):
    """Count how often each vocabulary word occurs in a tokenized tweet.

    Args:
        tweet: iterable of tokens (e.g. a list of strings).
        vocab: optional iterable of words to use as the keys of the result.
            When it is None or empty, the tweet's own unique tokens are used
            instead.

    Returns:
        dict mapping each word to its number of occurrences in ``tweet``.
        Vocabulary words that never occur map to 0. Keys follow the order in
        which they first appear in ``vocab`` (or in ``tweet`` when no vocab
        is given), so the layout is reproducible between runs.
    """
    unique_words = vocab if vocab else tweet

    # dict.fromkeys de-duplicates while keeping first-seen order, unlike
    # list(set(...)) whose ordering can change from run to run.
    tweet_dict = dict.fromkeys(unique_words, 0)

    for word in tweet:
        # Tokens outside the supplied vocabulary are ignored rather than
        # raising KeyError.
        if word in tweet_dict:
            tweet_dict[word] += 1

    return tweet_dict


# Count token occurrences in the cleaned sample tweet; no vocab is passed, so
# the tweet's own unique tokens become the keys.
test_vectorized = count_vectorize(clean_tokenized_tweet)
print(test_vectorized)

{'wesley83': 1, 'at': 2, 'to': 1, 'need': 1, '3': 1, 'it': 1, 'have': 1, 'stations': 1, '#': 2, 'was': 1, 'after': 1, 'dead': 1, 'tweeting': 1, 'upgrade': 1, 'rise_austin': 1, 'hrs': 1, '@': 1, 'plugin': 1, 'a': 1, '3g': 1, 'iphone': 1, 'sxsw': 1, 'i': 2}
