In [None]:
nb = MultinomialNB()

nb.fit(X_train_vectorized, y_train)

fig, ax = plt.subplots(figsize=(12,12))
plot_confusion_matrix(nb, X_train_vectorized, y_train, ax=ax, cmap="cividis");

In [None]:
# get the coefficient and sort them
coef = nb.coef_[0]
top_positive_coefficients = np.argsort(coef)[:10]


plt.figure(figsize=(15, 10))
colors = ['red' if c < 0 else 'blue' for c in coef[top_positive_coefficients]]
plt.barh(np.arange(len(top_positive_coefficients)), coef[top_positive_coefficients], color=colors)
feature_names = np.array(vectorizer.get_feature_names())
plt.yticks(np.arange(len(top_positive_coefficients)), feature_names[top_positive_coefficients], rotation=0, ha='right')
plt.ylabel('Features')
plt.xlabel('Coefficient')
plt.title('Most Predictive Features and Associated Coefficients - Naive Bayes')
plt.show()

In [None]:
from sklearn.metrics import f1_score

def plot_feature_importance(model, X, y):
    y = np.array(y)  # Convert y to numpy array
    target_names = np.unique(y)
    n_targets = len(target_names)
    
    if hasattr(model, 'feature_importances_'):
        feat_imp_all = model.feature_importances_.reshape(n_targets, -1)
    else:
        feat_imp_all = np.log(np.abs(model.coef_))
    
    f1_scores = []
    for t in range(n_targets):
        X_subset = X[np.where(y == target_names[t])]
        y_subset = y[np.where(y == target_names[t])]
        y_pred_subset = model.predict(X_subset)
        f1_scores.append(f1_score(y_subset, y_pred_subset, average='weighted'))
    
    f1_scores = np.array(f1_scores)
    plt.figure(figsize=(15, 10))
    colors = ['r', 'g', 'b']
    all_sorted_idx = []
    
    for t, target_name in enumerate(target_names):
        feat_imp = 100.0 * (feat_imp_all[t] * f1_scores[t] / feat_imp_all[t].max())
        sorted_idx = np.argsort(feat_imp)[::-1][:10]
        all_sorted_idx.extend(sorted_idx)

        plt.bar(np.arange(t * 10, t * 10 + 10), feat_imp[sorted_idx], color=colors[t], alpha=.7)

    plt.xticks(np.arange(30), np.array(vectorizer.get_feature_names())[all_sorted_idx], rotation=90)
    plt.title("Top 10 Feature Importances for Each Target (weighted by f1_score)")
    plt.legend(target_names)
    plt.show()

plot_feature_importance(nb, X_train_vectorized, y_train)

In [None]:
# plot for feature importance


nb = MultinomialNB()

nb.fit(X_train_vectorized, y_train)

feature_importances = nb.coef_[0]

feature_importances_sorted = sorted(zip(feature_importances, vectorizer.get_feature_names()), reverse=True)

N = 20
top_features = [f[1] for f in feature_importances_sorted[:N]]
top_importances = [f[0] for f in feature_importances_sorted[:N]]

plt.barh(np.arange(len(top_features)), top_importances)
plt.yticks(np.arange(len(top_features)), top_features)
plt.title('Top {} Most Important Features'.format(N))
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

In [13]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, make_scorer, plot_confusion_matrix
from wordcloud import WordCloud

from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier

from imblearn.over_sampling import SMOTENC, SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

import matplotlib.pyplot as plt
import seaborn as sns

from code.cleaner import preprocess
from code.viz import word_plot

sns.set()

In [14]:
sentiments = pd.read_csv('./data/judge-1377884607_tweet_product_company.csv',encoding='ISO-8859-1')
sentiments.tail()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product
9092,Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...,,No emotion toward brand or product


In [15]:
sentiments.rename(columns={
    'emotion_in_tweet_is_directed_at':'product',
    'is_there_an_emotion_directed_at_a_brand_or_product':'sentiment'
},inplace=True)

In [16]:
sentiments.dropna(subset=['tweet_text'],inplace=True)

In [17]:
reclassify = sentiments[sentiments['sentiment'] == "I can't tell"]
sentiments = sentiments[sentiments['sentiment'] != "I can't tell"]

In [45]:
sw = stopwords.words('english')
#specific_words = ['@mention','link','sxsw','#sxsw','@sxsw']
specific_words = ['@mention','link','sxsw','#sxsw','@sxsw',
                  'google','iphone', 'ipad', 'android', 'app',
                  'apple', 'rt', 'quot', 'store', 'new', 'austin'
                  'circle'
                 ]
# ^^ these are the original extended stop words from initial data
# discovery and knowledge 
discovered_words = [
    'google','iphone', 'ipad', 'android', 'app',
    'apple', 'rt', 'quot', 'store', 'new', 'austin'
    ]
specific_words.extend(discovered_words)
sw.extend(specific_words)

### Fitting a Naive Bayes Model

$$
\begin{bmatrix}
 & \text{Negative} & \text{Neutral} & \text{Positive} \\
\text{Negative} & \text{True-Neg} & \text{FNeu-Neg} & \text{FP-Neg} \\
\text{Neutral} & \text{False-Neg-Neu} & \text{True-Neu} & \text{False-P-Neu} \\
\text{Positive} & \text{False-Neg} & \text{False-Neu-Pos} & \text{TP} \\
\end{bmatrix}
$$

In [46]:
sentiments['tokenized'], sentiments['tokens'] = \
    preprocess(sentiments['tweet_text'],sw=sw,ret_tokens=True)

In [53]:
X2 = sentiments[['tokenized','tokens']]
y2 = sentiments['sentiment']

X_train2, X_test2, y_train2, y_test2 = \
    train_test_split(X2,y2,
        test_size=0.2,
        stratify=y2,random_state=13)

train = X_train2.merge(
    y_train,left_index=True,right_index=True
    )
test = X_test2.merge(
    y_test,left_index=True,right_index=True
    )

In [54]:
len(train)

7148

In [68]:
sentiments['sentiment'].value_counts()*.8

No emotion toward brand or product    4310.4
Positive emotion                      2382.4
Negative emotion                       456.0
Name: sentiment, dtype: float64

In [79]:
X = sentiments['tokenized']
y = sentiments['sentiment']

X_train, X_test, y_train, y_test = \
    train_test_split(X,y,
        test_size=0.2,
        stratify=y,random_state=13)

In [80]:
vectorizer = TfidfVectorizer(ngram_range=[1,3],max_df=0.4,min_df=20)

vectorizer.fit(X_train)

TfidfVectorizer(max_df=0.4, min_df=20, ngram_range=[1, 3])

In [88]:
X_train_vectorized = vectorizer.transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [89]:
majority_length = len(train[train['sentiment'] == 'No emotion toward brand or product'])

In [77]:
sm_dict = {"No emotion toward brand or product": majority_length,
           "Positive emotion": majority_length*0.5,
           "Negative emotion": majority_length*0.5}

In [122]:
svec = imbPipeline(steps=[
    ('sm', SMOTE(random_state=13)),
    ('svec', SVC(random_state=13))
])

svec.fit(X_train_vectorized, y_train)

Pipeline(steps=[('sm', SMOTE(random_state=13)), ('svec', SVC(random_state=13))])

In [123]:
preds = svec.predict(X_train_vectorized)
f1_score(y_train, preds, average=None)

array([0.84405458, 0.9141656 , 0.86169535])

In [124]:
preds = svec.predict(X_test_vectorized)
f1_score(y_test, preds, average=None)

array([0.3877551 , 0.75332742, 0.57193606])