In [1]:
import pandas as pd
import re

In [21]:
# Location of the raw tweet CSV, relative to the notebook.
TWEET_DATA = '../data/01_raw/product_sentiment.csv'

# Raw dataset label -> short tag ("alt_label") and integer class id ("class").
LABEL_MAPPING = {
    "No emotion toward brand or product": {"alt_label": "NEU", "class": 1},
    "Positive emotion": {"alt_label": "POS", "class": 2},
    "Negative emotion": {"alt_label": "NEG", "class": 0},
}

# In this section the flag only selects the 'raw' / 'cleaned' part of the
# prediction file name below.
USE_CLEANED_TWEET = True

# Model identifier; reused for the pickle name and the prediction sub-folder.
MODEL_NAME = "tfidf"
MODEL_FILENAME = f"../data/07_model_output/{MODEL_NAME}.pkl"

# save filepath
processed_indicator = 'cleaned' if USE_CLEANED_TWEET else 'raw'
# NOTE(review): 'predicitons' looks like a misspelling of 'predictions'; it is
# left untouched in case other code reads this exact path -- confirm before renaming.
PREDICTION_FILEPATH = f"../data/07_model_output/{MODEL_NAME}/{processed_indicator}_tweet_predicitons.csv"


### Load tweet data

In [3]:
# Read the raw tweets and shorten the verbose sentiment column name to 'label'.
df = pd.read_csv(TWEET_DATA).rename(
    columns={'is_there_an_emotion_directed_at_a_brand_or_product': 'label'}
)

### Preprocess data
1. Remove "I can't tell" labels - 156 rows (1.7% of data) and any missing tweets (1 data point)
2. Clean tweet text - remove links and any `@mention` / `#hashtag` tokens (the whole word is dropped, not just the prefix)
3. Rename the labels - see `LABEL_MAPPING`

In [4]:
# helper functions
def _strip_links(text):
    link_regex = re.compile("((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)", re.DOTALL)
    links = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ", ")
    return text


def _strip_all_entities(text):
    entity_prefixes = ["@", ".@", "#", ".#"]
    # replace all other punctuation with a space
    # for separator in string.punctuation:
    #     if separator not in entity_prefixes:
    #         text = text.replace(separator, " ")
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return " ".join(words)

def preprocess_tweet_text(text):
    """Run ``_strip_links`` and then ``_strip_all_entities`` on ``text``.

    Returns the result of the second step.
    """
    without_links = _strip_links(text)
    return _strip_all_entities(without_links)

def rename_labels(row):
    """Add 'alt_label' and 'class' entries to ``row`` based on ``row['label']``.

    The label is looked up in ``LABEL_MAPPING`` and the mapped values are
    written into ``row`` itself, which is then returned.
    """
    mapped = LABEL_MAPPING[row['label']]
    for key in ('alt_label', 'class'):
        row[key] = mapped[key]
    return row

In [5]:
# Keep only rows with a definite sentiment label and a non-missing tweet.
is_known_label = df['label'] != "I can't tell"
df_cln = df.copy()[is_known_label].dropna(subset=['tweet_text'])

# Report how many rows the filtering removed.
len_before = len(df)
len_after = len(df_cln)
print(f'len before cln: {len_before}, len after cln: {len_after}')

# Clean text: the preprocessed tweet goes into a new column.
df_cln['tweet_text_cln'] = df_cln['tweet_text'].apply(preprocess_tweet_text)

# Rename labels row by row (adds the 'alt_label' and 'class' columns).
df_cln = df_cln.apply(rename_labels, axis=1)
df_cln.head(3)

len before cln: 9093, len after cln: 8936


Unnamed: 0,index,tweet_text,emotion_in_tweet_is_directed_at,label,tweet_text_cln,alt_label,class
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,NEG,0
1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Know about ? Awesome iPad/iPhone app that you'...,POS,2
2,2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,Can not wait for 2 also. They should sale them...,POS,2


### Train BoW model

Bag of words model:
1. Clean text, remove stopwords, lemmatize
2. Split train/test
3. fit TFIDF vectorizer
4. Multiclass classifier (a linear SVM, `LinearSVC`, is used below)

#### 1. Clean text, remove stopwords, lemmatize

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources needed by the tokenizer, stop-word list and
# lemmatizer used below.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# NOTE: this rebinds `stopwords` from the imported corpus object to the value
# returned by .words('english'); clean_for_bow tests membership against it.
stopwords = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def clean_for_bow(text):
    """Normalise ``text`` into a space-separated string of tokens.

    Each token from ``nltk.word_tokenize`` is lower-cased and stripped of
    every non-letter character. Tokens found in ``stopwords`` are skipped;
    the rest are lemmatized and kept only if non-empty.
    """
    kept_tokens = []
    for raw_token in nltk.word_tokenize(text):
        # lower-case, then drop anything that is not a letter
        letters_only = re.sub("[^A-Za-z]+","", raw_token.lower())
        if letters_only not in stopwords:
            lemma = lemmatizer.lemmatize(letters_only).lstrip()
            # tokens made up only of digits/punctuation end up empty here
            if lemma:
                kept_tokens.append(lemma)
    return " ".join(kept_tokens)

# Bag-of-words-ready text goes into its own column.
df_cln['tweet_text_cln_bow'] = df_cln['tweet_text_cln'].apply(clean_for_bow)

# show example
# random_state is fixed so the printed before/after example is the same on
# every run (an unseeded sample picks a different tweet each time).
_df_cln = df_cln.copy().sample(1, random_state=40)
tweet_text_cln_ex = _df_cln['tweet_text_cln'].iloc[0]
tweet_text_cln_bow_ex = _df_cln['tweet_text_cln_bow'].iloc[0]
print(f'before bow clean:\t {tweet_text_cln_ex}')
print(f'after bow clean:\t {tweet_text_cln_bow_ex}')

[nltk_data] Downloading package punkt to /Users/tommy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/tommy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/tommy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/tommy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


before bow clean:	 Bisotã¢ location 06:10, 3/16 Austin, TX, USA {link} platform 2011 is killing... {link}
after bow clean:	 bisot location austin tx usa link platform killing link


#### 2. Split train/test

In [18]:
from sklearn.model_selection import train_test_split

# 75/25 split, stratified on the class id and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    df_cln['tweet_text_cln_bow'],
    df_cln['class'],
    test_size=0.25,
    random_state=40,
    stratify=df_cln['class'],
    shuffle=True,
)

print(f"Train: {X_train.shape}, {y_train.shape}")
print(f"Test: {X_test.shape}, {y_test.shape}")

Train: (6702,), (6702,)
Test: (2234,), (2234,)


#### 3. TFIDF vectorizer and 4. Model

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

# Two-step pipeline: the TF-IDF vectorizer feeds a linear SVM classifier.
vectorizer = TfidfVectorizer()
classifier = LinearSVC(random_state=0)
pipeline = Pipeline(steps=[("tfidf", vectorizer), ("clf", classifier)])

pipeline.fit(X_train, y_train)


### Eval

In [43]:
from sklearn.metrics import classification_report

# Predict on the held-out split and collect the metrics as a nested dict.
y_test_pred = pipeline.predict(X_test)
report = classification_report(
    y_true=y_test,
    y_pred=y_test_pred,
    output_dict=True,
)

report

{'0': {'precision': 0.6491228070175439,
  'recall': 0.2605633802816901,
  'f1-score': 0.37185929648241206,
  'support': 142},
 '1': {'precision': 0.7263779527559056,
  'recall': 0.821826280623608,
  'f1-score': 0.7711598746081504,
  'support': 1347},
 '2': {'precision': 0.6140888208269525,
  'recall': 0.538255033557047,
  'f1-score': 0.5736766809728183,
  'support': 745},
 'accuracy': 0.6915846016114593,
 'macro avg': {'precision': 0.6631965268668006,
  'recall': 0.5402148981541149,
  'f1-score': 0.5722319506877936,
  'support': 2234},
 'weighted avg': {'precision': 0.6840209097917528,
  'recall': 0.6915846016114593,
  'f1-score': 0.6799218883269609,
  'support': 2234}}

### Save

In [52]:
# Display the cleaned frame, which by now also holds 'tweet_text_cln_bow'.
df_cln

Unnamed: 0,index,tweet_text,emotion_in_tweet_is_directed_at,label,tweet_text_cln,alt_label,class,tweet_text_cln_bow
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,NEG,0,wesley g iphone hr tweeting dead need upgrade ...
1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Know about ? Awesome iPad/iPhone app that you'...,POS,2,know awesome ipadiphone app likely appreciate ...
2,2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,Can not wait for 2 also. They should sale them...,POS,2,wait also sale
3,3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,I hope this year's festival isn't as crashy as...,NEG,0,hope year festival nt crashy year iphone app
4,4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"great stuff on Fri Marissa Mayer (Google), Tim...",POS,2,great stuff fri marissa mayer google tim oreil...
...,...,...,...,...,...,...,...,...
9088,9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,Ipad everywhere. {link},POS,2,ipad everywhere link
9089,9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,"Wave, buzz... RT We interrupt your regularly s...",NEU,1,wave buzz rt interrupt regularly scheduled gee...
9090,9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,"Google's Zeiger, a physician never reported po...",NEU,1,google zeiger physician never reported potenti...
9091,9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,Some Verizon iPhone customers complained their...,NEU,1,verizon iphone customer complained time fell b...


In [49]:
# Display the held-out class ids produced by the train/test split.
y_test

6098    0
7745    2
5698    1
798     1
2009    1
       ..
7707    1
7911    0
2560    1
6823    1
6870    1
Name: class, Length: 2234, dtype: int64

In [46]:
# Count how many test rows were predicted for each class id.
pd.DataFrame(y_test_pred).value_counts()

1    1524
2     653
0      57
dtype: int64

In [22]:
import joblib
from pathlib import Path

# Make sure the output folder exists first; writing the pickle fails with
# FileNotFoundError when the parent directory is missing.
Path(MODEL_FILENAME).parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted pipeline.
joblib.dump(pipeline, MODEL_FILENAME)

['../data/07_model_output/tfidf.pkl']

In [54]:
# Reload the pipeline from disk and run a quick smoke prediction.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source.
pipeline = joblib.load(MODEL_FILENAME)
# NOTE(review): the pipeline was fitted on text produced by clean_for_bow,
# while this input is passed in raw -- confirm whether inference inputs should
# go through the same cleaning.
pipeline.predict(['this is good'])[0]

2

In [39]:
# Integer class id -> short sentiment tag (same pairs as the 'class' /
# 'alt_label' entries in LABEL_MAPPING).
CLASS_MAPPING = {
    0:"NEG",
    1:"NEU",
    2:"POS"
}

# Kept as the final expression of the cell so the prediction is displayed;
# a bare expression followed by other statements has its value discarded.
pipeline.predict(['something'])

array([1])