# Business Problem

# Data Understanding

In [22]:
# Import Modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer, TweetTokenizer

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw tweet / brand-sentiment CSV.
# NOTE(review): 'unicode_escape' also interprets backslash escape sequences
# found in the text; confirm this codec is intended for this file.
df = pd.read_csv(
    '../data/judge-1377884607_tweet_product_company.csv',
    encoding='unicode_escape',
)

In [3]:
# Preview the first ten rows of the raw frame.
df.head(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


In [42]:
# Drop rows that have no tweet text. Selecting by null-ness rather than by a
# hard-coded index label keeps this cell idempotent: re-running it is a no-op
# instead of a KeyError, and it keeps working if the source file is reordered.
df = df.dropna(subset=['tweet_text'])

In [43]:
# Check row count, dtypes and non-null counts after removing the blank tweet.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9092 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9092 non-null   object
dtypes: object(3)
memory usage: 284.1+ KB


In [44]:
# Count how many tweets target each brand/product (null targets are excluded by value_counts).
df['emotion_in_tweet_is_directed_at'].value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [20]:
# Inspect the raw text of the tweet labelled 9092 with a single label-based lookup.
df.loc[9092, "tweet_text"]

'\x8cÏ¡\x8eÏà\x8aü_\x8b\x81Ê\x8b\x81Î\x8b\x81Ò\x8b\x81£\x8b\x81Á\x8bââ\x8b\x81_\x8b\x81£\x8b\x81\x8f\x8bâ_\x8bÛâRT @mention Google Tests \x89ÛÏCheck-in Offers\x89Û\x9d At #SXSW {link}'

In [15]:
# Distribution of the sentiment label that is later used as the target `y`.
df['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [45]:
# Isolate the tweets labelled "I can't tell" ...
df_ict = df[df['is_there_an_emotion_directed_at_a_brand_or_product'].eq("I can't tell")]

# ... and see which brands/products those ambiguous tweets were directed at.
df_ict['emotion_in_tweet_is_directed_at'].value_counts()

iPad                               4
Apple                              2
Google                             1
Other Google product or service    1
iPhone                             1
Name: emotion_in_tweet_is_directed_at, dtype: int64

## Preprocessing Data

In [46]:
# Features are the raw tweet text; the target is the sentiment label.
# NOTE(review): no label filtering happens before this point, so the
# "I can't tell" class is still part of `y` — confirm that is intended.
X, y = (
    df['tweet_text'],
    df['is_there_an_emotion_directed_at_a_brand_or_product'],
)

In [47]:
# Hold out 25% of the tweets for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified even though the label counts are
# heavily skewed — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [48]:
# Lowercase the training tweets. Only X_train is transformed here; X_test is
# not lowercased in any visible cell.
X_train = X_train.str.lower()

In [49]:
def get_wordnet_pos(treebank_tag):
    '''
    Map an nltk (treebank-style) POS tag to the matching wordnet POS constant.

    Only the first character of the tag is inspected: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag (including an empty
    string) falls back to noun.
    '''
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)

In [50]:
# Define Stop Words
# English stop word list from the nltk corpus; used as the default
# `stop_words` argument of doc_preparer.
sw = stopwords.words('english')

In [65]:
def doc_preparer(tweet, stop_words=sw):
    '''
    Turn one raw tweet into a cleaned, space-joined string of lemmas.

    Steps: tokenize with TweetTokenizer (handles stripped, elongated
    character runs shortened), lowercase every token, drop stop words,
    POS-tag the remaining tokens, then lemmatize each token using its
    wordnet part of speech.

    :param tweet: a tweet string from the tweet_text column
    :param stop_words: iterable of words to discard; tokens are lowercased
            before the comparison, so entries should be lowercase.
            Defaults to the module-level English stop word list.
    :return: a document string of the lemmatized tokens joined by spaces
    '''
    tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
    lemmatizer = WordNetLemmatizer()
    # Use the caller-supplied stop words (the global list used to be applied
    # regardless of this argument); a set makes each membership test O(1).
    excluded = set(stop_words)

    tokens = [token.lower() for token in tokenizer.tokenize(tweet)]
    tokens = [token for token in tokens if token not in excluded]

    # pos_tag yields (token, treebank_tag) pairs; translate each tag to the
    # wordnet POS the lemmatizer expects.
    return ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in pos_tag(tokens)
    )

In [66]:
# Run the tokenize / stop-word / lemmatize pipeline on every training tweet.
X_train_processed = X_train.apply(doc_preparer)

In [67]:
# Spot-check the first few processed training documents.
X_train_processed.head()

8605    perfect attention detail rt google recreate co...
1303                    iphone steal get back ! ! ! #sxsw
8759    fuck love austin . leave youtube google party ...
5021    spot something rare minute ago ... phone make ...
3210    fyi rt i'll austin convention center today w /...
Name: tweet_text, dtype: object

In [None]:
# Add a lowercased copy of the raw tweets as a new 'text' column on df.
df['text'] = df['tweet_text'].str.lower()

In [None]:
# Tokenize the lowercased tweets with the same TweetTokenizer settings used in
# doc_preparer and store the token lists in a new 'text_tokenized' column.
tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
df['text_tokenized'] = df['text'].apply(tokenizer.tokenize)

In [None]:
# Preview the frame with the added 'text' and 'text_tokenized' columns.
df.head()

In [None]:
# Tokenize the tweet labelled 0 on its own to eyeball the tokenizer output.
tokenizer.tokenize(df.loc[0, 'text'])

In [None]:
# Print the default TweetTokenizer output for each sentence in compare_list.
# FIXME: `compare_list` is not defined in any visible cell, so this cell
# raises NameError on a fresh kernel — define it or remove this cell.
tweet_tokenizer = TweetTokenizer()
# NOTE(review): `tweet_tokens` is never appended to or read in the visible cells.
tweet_tokens = []
for sent in compare_list:
    print(tweet_tokenizer.tokenize(sent))

In [None]:
# `corpus` is not defined in any cell above; `df` is the frame holding the
# tweets, so report its row count instead.
len(df)

### Pseudocode

#### Cleaning
- split on spaces - tokenizer
- remove capitalization
- remove punctuation
- remove stopwords
- remove non-letters - tokenizer


#### Engineering
- encode positive/negative
- stem/lemmatize?

In [None]:
# `corpus` is not defined in any cell above; the sentiment-label column lives
# on `df`, so count the labels there.
df['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()