In [1]:
import pandas as pd
pd.set_option('max_colwidth', 80)

from re import sub, split
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE, MDS
import seaborn as sns
import spacy
from spacy import displacy

def plot_embedding(V, y, random_state=None):
    """ Visualizes a vocabulary embedding via TSNE

    The matrix is first reduced with TruncatedSVD (at most 50 components),
    then projected to two dimensions with t-SNE using the cosine metric, and
    finally drawn as a scatter plot colored by label.

    Parameters
    ----------
    V : array-like or sparse matrix
        Feature matrix as produced by a vectorizer's ``fit_transform``; only
        its ``shape`` attribute and ``TruncatedSVD`` compatibility are relied on.
    y : pandas.Series
        One label per row of ``V``. Its index is reset so that labels line up
        positionally with the rows of the embedding.
    random_state : int, RandomState instance or None, default None
        Forwarded to both TruncatedSVD and TSNE. Pass an int to make the plot
        reproducible across runs; None keeps the previous unseeded behavior.

    Returns
    -------
    tuple
        ``(ax, d)``: the object returned by ``sns.scatterplot`` and the
        DataFrame holding the two embedding columns (0 and 1) plus ``label``.
    """
    # TruncatedSVD rejects n_components larger than the number of features,
    # so cap the usual 50 for small vocabularies instead of raising.
    n_components = min(50, V.shape[1])
    reduced = TruncatedSVD(n_components=n_components,
                           random_state=random_state).fit_transform(V)
    coords = TSNE(metric='cosine', random_state=random_state).fit_transform(reduced)
    d = pd.DataFrame(coords).assign(label=y.reset_index(drop=True))
    return sns.scatterplot(x=0, y=1, hue='label', data=d), d


# Cache of loaded spaCy pipelines, keyed by model name. Loading a model is
# expensive and clean_twitter is called once per tweet (e.g. as a vectorizer
# preprocessor), so the model must not be reloaded on every call.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name='en_core_web_sm'):
    """ Returns the spaCy pipeline `name`, loading it only on first use """
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def clean_twitter(s):
    """ Cleans Twitter specific issues and lemmatizes the result

    Steps:
      1. Replace with a space: mentions (``@...``), URLs (``http``/``https``
         up to the next whitespace), the retweet marker ``RT`` together with
         the whitespace after it, and every character that is neither a word
         character (letter, digit, underscore) nor ``+``.
      2. Drop single ASCII letters surrounded by whitespace and collapse
         runs of whitespace into one space.
      3. Strip leading and trailing whitespace.
      4. Run the text through spaCy and join the lemma of every token
         with single spaces.

    Parameters
    ----------
    s : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet with each token replaced by its lemma.
    """
    # `\bRT`, not a bare `RT`: without the word boundary the tail of any
    # upper-case word ending in "RT" (e.g. "ALERT now") was deleted as well.
    noise = r'@\S+|https?[^\s]+|\bRT\s+|[^\w+]'
    lone_letter_or_spaces = r'\s+[a-zA-Z]\s+|\s+'

    text = sub(noise, " ", s)
    # After the substitutions above the only whitespace left is single
    # spaces, so a plain strip() trims both ends.
    text = sub(lone_letter_or_spaces, " ", text).strip()

    doc = _get_spacy_pipeline()(text)
    return " ".join(token.lemma_ for token in doc)
        

    # TODO: Use regular expressions to remove unwanted
    # text and clean up our tweets to be more usable!

    # BONUS: Try using the library "spacy" to 
    # do further processing, such as lemmatizing
    # or replacing Named Entities with constants (i.e. "[NAMED]")
    # or adding the part of speech or dependency code to the word 

 

In [2]:
# Read the CSV once and take both columns from the same frame, rather than
# parsing the file twice.
tweets = pd.read_csv('data/tweets.csv')
X = tweets.tweet   # raw tweet text
y = tweets.label   # label for each tweet


In [9]:
# Spot-check the cleaning function on the first ten tweets.
X.iloc[:10].map(clean_twitter)


0    here CNN on Sharia law -PRON- can be stone or have -PRON- hand cut off but b...
1    LOOK obama Clinton crony ILLEGALLY arm amp train muslim terrorist include is...
2    ThrowbackThursday BenGarrison cartoon from 2013 Obama get crown War Debt tax...
3                                    Say Islam be peace or else trump maga isis tcot
4                                                all aboard the Trump Train ChooChoo
5           FLASHBACK gt gt Judicial Watch Releases Huma Abedin Deposition testimony
6                follow FBI presser say the system be rig amp weigh in foxldt 7 p.m.
7                                           trump -PRON- re run Against Rigged Press
8    literally 98 of Hillary supporter see online be astroturfe spammer with 12 0...
9                                                                              Islam
Name: tweet, dtype: object

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer


In [None]:
# Baseline embedding: raw tweets go through CountVectorizer's default
# cleaning and tokenizing, with no custom preprocessing.

vectorizer = CountVectorizer()
V = vectorizer.fit_transform(X)
(ax, d) = plot_embedding(V=V, y=y)


In [None]:
# Same embedding, but every tweet is passed through clean_twitter first,
# to see what the custom cleaning changes.
vectorizer = CountVectorizer(preprocessor=clean_twitter)
V = vectorizer.fit_transform(X)
(ax, d) = plot_embedding(V=V, y=y)


In [None]:
import nltk
# Request the NLTK 'stopwords' resource; the stop-word lists are read from
# nltk.corpus.stopwords further down in the notebook.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# Build one combined stop-word set covering both English and French.
stop_words = set(stopwords.words('english'))
french_stop_words = set(stopwords.words('french'))
stw = stop_words | french_stop_words

In [None]:
# Now try with TF-IDF vectorizer, and add explicit stop words!
# scikit-learn validates `stop_words` as 'english', a list, or None, so the
# set is converted to a (sorted, hence deterministic) list before use.
# NOTE: a custom `preprocessor` replaces sklearn's built-in lowercasing, so
# the stop words only match tokens that clean_twitter leaves in lowercase.
tfidf_vectorizer = TfidfVectorizer(preprocessor=clean_twitter,
                                   stop_words=sorted(stw))
V = tfidf_vectorizer.fit_transform(X)
ax, d = plot_embedding(V, y)


In [None]:
# Can you get things to separate in the space in a better way?
# As usual, the best way to adjust the feature-extraction parameters is a
# cross-validated grid search. Here a few hand-picked hyperparameter values
# are used instead, which separates the classes a little better.
# scikit-learn validates `stop_words` as 'english', a list, or None, so the
# set is converted to a (sorted, hence deterministic) list before use.
tfidf_vectorizer = TfidfVectorizer(preprocessor=clean_twitter,
                                   max_df=0.75,
                                   max_features=10000,
                                   stop_words=sorted(stw))
V = tfidf_vectorizer.fit_transform(X)
ax, d = plot_embedding(V, y)

In [None]:
# Now try with Hashing vectorizer, and add explicit stop words!
# scikit-learn validates `stop_words` as 'english', a list, or None, so the
# set is converted to a (sorted, hence deterministic) list before use.
Hash_vectorizer = HashingVectorizer(preprocessor=clean_twitter,
                                    stop_words=sorted(stw))
V = Hash_vectorizer.fit_transform(X)
ax, d = plot_embedding(V, y)