# N-Grams from Text

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from operator import itemgetter # The operator module exports a set of efficient functions corresponding to the intrinsic operators of Python. For example, operator.add(x, y) is equivalent to the expression x + y.

from nltk.corpus import stopwords
import string
from nltk import bigrams
from nltk import FreqDist
from nltk import bigrams

from nltk.stem import (PorterStemmer, LancasterStemmer)
from nltk.stem.snowball import SnowballStemmer # This is "Porter 2" and is considered the optimal stemmer.

from nltk import WordNetLemmatizer

from nltk.corpus import words



In [2]:
# Load the raw tweet export and preview its last four rows.
tweets_dataset = pd.read_csv("../input/tweets_dataset_all.csv")
tweets_dataset.tail(4)

Unnamed: 0,text
149564,Off a Xanax have some drunk sex
149565,I’ve heard he had an Adderall and cocaine prob...
149566,Si vous avez de la vaisselle à faire appelez m...
149567,“portuguese is like spanish in xanax ” DE VERDAD


In [3]:
def n_upper_chars(string):
    """Return the number of uppercase characters contained in ``string``."""
    uppercase_flags = [ch.isupper() for ch in string]
    # Summing booleans counts the True entries (and gives 0 for empty input).
    return sum(uppercase_flags)

In [4]:
# Count uppercase characters per tweet; this uses the text before it is lower-cased.
tweets_dataset['upper'] = tweets_dataset['text'].apply(n_upper_chars)

In [5]:
# Normalise every tweet to lower case.
tweets_dataset['text'] = tweets_dataset['text'].map(str.lower)

In [6]:
# Non-null counts per column, restricted to tweets whose 'upper' count is positive.
tweets_dataset[tweets_dataset.upper > 0].count()

text     46626
upper    46626
dtype: int64

In [7]:
def standardize_text(df, text_field):
    """Clean the text column ``text_field`` of ``df`` in place and return ``df``.

    The substitutions are applied in order:
      1. drop URLs (``http`` followed by non-whitespace), then any leftover ``http``;
      2. drop digits;
      3. drop @-mentions (``@`` followed by non-whitespace);
      4. replace every character outside the allowed set with a space;
      5. spell out any remaining ``@`` as ``at``;
      6. lower-case the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    text_field : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``text_field`` cleaned.
    """
    # (pattern, replacement) pairs; every pattern is a regular expression.
    substitutions = (
        (r"http\S+", ""),
        (r"http", ""),
        (r"(\d)", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
    )

    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would silently skip all the cleaning.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    df[text_field] = cleaned.str.lower()
    return df

# standardize_text assigns into the frame it receives and returns that same frame,
# so df_clean_text and tweets_dataset refer to one object with the cleaned text.
df_clean_text = standardize_text(tweets_dataset, "text")

In [8]:
def drug_name(text):
    """Return the label of the first drug keyword found in ``text``.

    Keywords are matched case-sensitively, as lowercase substrings, in the
    priority order adderall, prozac, xanax. ``np.nan`` is returned when
    none of them occurs.
    """
    keyword_labels = (
        ('adderall', 'Adderall'),
        ('prozac', 'Prozac'),
        ('xanax', 'Xanax'),
    )
    for keyword, label in keyword_labels:
        if keyword in text:
            return label
    return np.nan

In [9]:
# Label each tweet with the drug it mentions (NaN when no keyword is present).
tweets_dataset["drugName"] = tweets_dataset["text"].apply(drug_name)

In [10]:
# How many tweets received no drug label.
tweets_dataset[tweets_dataset.drugName.isna()].text.count()

19735

In [11]:
def useless(text):
    """Flag tweets that reference the realDonaldTrump account.

    Matching is case-insensitive. The second marker used to be spelled
    ``'onaldTrum p'`` and compared case-sensitively, so it could never match
    text that had already been lower-cased.

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    str or float
        ``'realDonaldTrump'`` when one of the markers occurs in ``text``,
        otherwise ``np.nan``.
    """
    lowered = text.lower()
    markers = ('realdonaldtrum', 'onaldtrum p')
    if any(marker in lowered for marker in markers):
        return 'realDonaldTrump'
    return np.nan

In [12]:
# Mark tweets that reference the realDonaldTrump account (NaN otherwise).
tweets_dataset["useless"] = tweets_dataset["text"].apply(useless)

In [13]:
# Number of tweets flagged as 'realDonaldTrump'.
tweets_dataset[tweets_dataset["useless"] == 'realDonaldTrump'].useless.count()

1717

In [14]:
def tokenizer(text):
    """Tokenize ``text`` with an NLTK ``RegexpTokenizer``.

    The token pattern accepts runs of ASCII letters, apostrophes, backticks
    and the accented letters é, è and î.
    """
    token_pattern = "[a-zA-Z'`éèî]+"
    return RegexpTokenizer(token_pattern).tokenize(text)

In [15]:
# Tokenize every cleaned tweet.
tweets_dataset['tokens'] = tweets_dataset['text'].apply(tokenizer)

In [16]:
# Persist the intermediate frame (without the index column) and preview the first rows.
tweets_dataset.to_csv("../output/tweets_dataset_all.csv", index=False)
tweets_dataset.head()

Unnamed: 0,text,upper,drugName,useless,tokens
0,i am buying spy put amd leaving dsx alone that...,0,Prozac,,"[i, am, buying, spy, put, amd, leaving, dsx, a..."
1,if my psychiatrist forgets to call in my adder...,0,Adderall,,"[if, my, psychiatrist, forgets, to, call, in, ..."
2,i wa da person on xanax i ain have nun to lose,0,Xanax,,"[i, wa, da, person, on, xanax, i, ain, have, n..."
3,came from being a xanax addict to a bad as bit...,0,Xanax,,"[came, from, being, a, xanax, addict, to, a, b..."
4,what is going on with violent and super sleazy...,0,Adderall,,"[what, is, going, on, with, violent, and, supe..."


In [17]:
def n_grams(tokens, n=4, top_k=300):
    """Return the most frequent padded character n-grams over ``tokens``.

    Each token is split into character n-grams with ``nltk.util.ngrams``,
    padded on both sides with ``'_'``; the n-grams are joined into strings
    and counted across all tokens.

    Parameters
    ----------
    tokens : iterable of str
        Words to break into character n-grams.
    n : int, optional
        Size of the n-grams (default 4).
    top_k : int, optional
        Number of most frequent n-grams to keep (default 300, the cut-off
        the original notebook attributes to the original n-gram paper).

    Returns
    -------
    list of (str, int)
        ``(ngram, count)`` pairs sorted by descending count; ties keep
        first-seen order. An empty ``tokens`` yields an empty list.
    """
    counts = {}
    for word in tokens:
        padded_grams = ngrams(
            word, n,
            pad_left=True, pad_right=True,
            left_pad_symbol='_', right_pad_symbol='_',
        )
        # Count each word's n-grams immediately. The previous version
        # re-flattened the accumulated list inside this loop, which broke
        # the n-grams of earlier words apart into single characters.
        for gram in padded_grams:
            joined = ''.join(gram)
            counts[joined] = counts.get(joined, 0) + 1

    ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)
    return ranked[:top_k]
    
    

In [20]:
from nltk import everygrams
def ngram_extractor(sent):
    """Return the character 1- to 4-grams of ``sent`` as joined strings.

    Every space is first replaced by ``'_ _'`` so that word edges carry an
    underscore marker. Grams that contain a space or a newline, and the
    lone ``'_'`` unigram, are left out.
    """
    marked = sent.replace(' ', '_ _')
    kept = []
    for gram in everygrams(marked, 1, 4):
        if ' ' in gram or '\n' in gram or gram == ('_',):
            continue
        kept.append(''.join(gram))
    return kept

In [21]:
# Character n-grams for every cleaned tweet.
tweets_dataset['ngrams'] = tweets_dataset['text'].apply(ngram_extractor)

In [22]:
# Preview the frame now that the 'ngrams' column has been added.
tweets_dataset.head()

Unnamed: 0,text,upper,drugName,useless,tokens,ngrams
0,i am buying spy put amd leaving dsx alone that...,0,Prozac,,"[i, am, buying, spy, put, amd, leaving, dsx, a...","[i, a, m, b, u, y, i, n, g, s, p, y, p, u, t, ..."
1,if my psychiatrist forgets to call in my adder...,0,Adderall,,"[if, my, psychiatrist, forgets, to, call, in, ...","[i, f, m, y, p, s, y, c, h, i, a, t, r, i, s, ..."
2,i wa da person on xanax i ain have nun to lose,0,Xanax,,"[i, wa, da, person, on, xanax, i, ain, have, n...","[i, w, a, d, a, p, e, r, s, o, n, o, n, x, a, ..."
3,came from being a xanax addict to a bad as bit...,0,Xanax,,"[came, from, being, a, xanax, addict, to, a, b...","[c, a, m, e, f, r, o, m, b, e, i, n, g, a, x, ..."
4,what is going on with violent and super sleazy...,0,Adderall,,"[what, is, going, on, with, violent, and, supe...","[w, h, a, t, i, s, g, o, i, n, g, o, n, w, i, ..."


In [23]:
# First ten entries of NLTK's English stop-word list.
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [24]:
# Tokens of the first tweet, before stop-word filtering.
tweets_dataset.tokens[0]

['i',
 'am',
 'buying',
 'spy',
 'put',
 'amd',
 'leaving',
 'dsx',
 'alone',
 'that',
 'lovely',
 'german',
 'lady',
 'is',
 'bipolar',
 'and',
 'need',
 'some',
 'prozac',
 'before',
 'i',
 'can',
 'flirt',
 'again']

In [25]:
# Lazily-built cache of NLTK's English stop words; filled on first use.
_ENGLISH_STOP_WORDS = None

# Tokens dropped in addition to stop words: every punctuation character paired
# with a double quote (either side), the three drug names, and a few
# punctuation runs.
_EXCLUDED_TOKENS = frozenset(
    [c + "\"" for c in string.punctuation]
    + ["\"" + c for c in string.punctuation]
    + ["xanax", "adderall", "prozac"]
    + [".-", ":-", "..", "..."]
)


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def cln_tokens(tokens, stop_words=None):
    """Lower-case ``tokens`` and drop stop words, punctuation and drug names.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one tweet.
    stop_words : collection of str, optional
        Words to discard. Defaults to NLTK's English stop-word list, which
        is loaded once and cached rather than re-read for every token.

    Returns
    -------
    list of str
        The remaining tokens, lower-cased, in their original order.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    cleaned = []
    for token in tokens:
        word = token.lower()
        if word in stop_words:
            continue
        # `word in string.punctuation` is a substring test (kept as in the
        # original), so any contiguous run of that string is dropped too.
        if word in string.punctuation or word in _EXCLUDED_TOKENS:
            continue
        cleaned.append(word)
    return cleaned

In [26]:
# Filter stop words, punctuation and drug names out of every token list.
tweets_dataset['cln_tokens'] = tweets_dataset['tokens'].apply(cln_tokens)

In [27]:
# Cleaned tokens of the first tweet.
tweets_dataset['cln_tokens'][0]

['buying',
 'spy',
 'put',
 'amd',
 'leaving',
 'dsx',
 'alone',
 'lovely',
 'german',
 'lady',
 'bipolar',
 'need',
 'flirt']

In [28]:
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer("english")

def stems_collection(tokens):
    """Return the Snowball (English) stem of every token.

    The earlier implementation looped over ``[porter, lancaster, snowball]``
    but overwrote its result on each pass, so only the Snowball stems were
    ever returned. Calling Snowball directly yields the same output without
    running the two discarded stemmers.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        One stem per input token, in order.
    """
    return [snowball.stem(token) for token in tokens]
    

In [29]:
# Stem the cleaned tokens of every tweet.
tweets_dataset['stem_tokens'] = tweets_dataset['cln_tokens'].apply(stems_collection)

In [30]:
# Show only the first rows instead of dumping the whole frame.
tweets_dataset.head(10)

Unnamed: 0,text,upper,drugName,useless,tokens,ngrams,cln_tokens,stem_tokens
0,i am buying spy put amd leaving dsx alone that...,0,Prozac,,"[i, am, buying, spy, put, amd, leaving, dsx, a...","[i, a, m, b, u, y, i, n, g, s, p, y, p, u, t, ...","[buying, spy, put, amd, leaving, dsx, alone, l...","[buy, spi, put, amd, leav, dsx, alon, love, ge..."
1,if my psychiatrist forgets to call in my adder...,0,Adderall,,"[if, my, psychiatrist, forgets, to, call, in, ...","[i, f, m, y, p, s, y, c, h, i, a, t, r, i, s, ...","[psychiatrist, forgets, call, im, flipping, tf...","[psychiatrist, forget, call, im, flip, tf, got..."
2,i wa da person on xanax i ain have nun to lose,0,Xanax,,"[i, wa, da, person, on, xanax, i, ain, have, n...","[i, w, a, d, a, p, e, r, s, o, n, o, n, x, a, ...","[wa, da, person, nun, lose]","[wa, da, person, nun, lose]"
3,came from being a xanax addict to a bad as bit...,0,Xanax,,"[came, from, being, a, xanax, addict, to, a, b...","[c, a, m, e, f, r, o, m, b, e, i, n, g, a, x, ...","[came, addict, bad, bitch, life, together, im,...","[came, addict, bad, bitch, life, togeth, im, y..."
4,what is going on with violent and super sleazy...,0,Adderall,,"[what, is, going, on, with, violent, and, supe...","[w, h, a, t, i, s, g, o, i, n, g, o, n, w, i, ...","[going, violent, super, sleazy, democrat, vict...","[go, violent, super, sleazi, democrat, victim,..."
5,i went to rehab last summer cuz i wa withdraw ...,0,Xanax,,"[i, went, to, rehab, last, summer, cuz, i, wa,...","[i, w, e, n, t, t, o, r, e, h, a, b, l, a, s, ...","[went, rehab, last, summer, cuz, wa, withdraw,...","[went, rehab, last, summer, cuz, wa, withdraw,..."
6,extremely aerosmith voice im baack back on my ...,0,Prozac,,"[extremely, aerosmith, voice, im, baack, back,...","[e, x, t, r, e, m, e, l, y, a, e, r, o, s, m, ...","[extremely, aerosmith, voice, im, baack, back,...","[extrem, aerosmith, voic, im, baack, back, aga..."
7,this is officially an uncomfortable xanax infu...,0,Xanax,,"[this, is, officially, an, uncomfortable, xana...","[t, h, i, s, i, s, o, f, f, i, c, i, a, l, l, ...","[officially, uncomfortable, infused, friday, f...","[offici, uncomfort, infus, friday, fun, giant,..."
8,trump is sharing his adderall,0,Adderall,,"[trump, is, sharing, his, adderall]","[t, r, u, m, p, i, s, s, h, a, r, i, n, g, h, ...","[trump, sharing]","[trump, share]"
9,he never ever not once ever taken adderall mor...,0,Adderall,,"[he, never, ever, not, once, ever, taken, adde...","[h, e, n, e, v, e, r, e, v, e, r, n, o, t, o, ...","[never, ever, ever, taken, moron, hed, never, ...","[never, ever, ever, taken, moron, hed, never, ..."


In [31]:
wnl = WordNetLemmatizer()


def lemmatizer(tokens):
    """Lemmatize each token as a verb (``pos='v'``) using the shared ``wnl`` instance."""
    lemmas = []
    for token in tokens:
        lemmas.append(wnl.lemmatize(token, pos='v'))
    return lemmas

In [32]:
# Lemmatize the cleaned tokens of every tweet.
tweets_dataset['lemm_tokens'] = tweets_dataset['cln_tokens'].apply(lemmatizer)

In [36]:
# Show only the first rows instead of dumping the whole frame.
tweets_dataset.head(10)

Unnamed: 0,text,upper,drugName,useless,tokens,ngrams,cln_tokens,stem_tokens,lemm_tokens,unusual
0,i am buying spy put amd leaving dsx alone that...,0,Prozac,,"[i, am, buying, spy, put, amd, leaving, dsx, a...","[i, a, m, b, u, y, i, n, g, s, p, y, p, u, t, ...","[buying, spy, put, amd, leaving, dsx, alone, l...","[buy, spi, put, amd, leav, dsx, alon, love, ge...","[buy, spy, put, amd, leave, dsx, alone, lovely...","{dsx, amd}"
1,if my psychiatrist forgets to call in my adder...,0,Adderall,,"[if, my, psychiatrist, forgets, to, call, in, ...","[i, f, m, y, p, s, y, c, h, i, a, t, r, i, s, ...","[psychiatrist, forgets, call, im, flipping, tf...","[psychiatrist, forget, call, im, flip, tf, got...","[psychiatrist, forget, call, im, flip, tf, get...","{im, tf, smh}"
2,i wa da person on xanax i ain have nun to lose,0,Xanax,,"[i, wa, da, person, on, xanax, i, ain, have, n...","[i, w, a, d, a, p, e, r, s, o, n, o, n, x, a, ...","[wa, da, person, nun, lose]","[wa, da, person, nun, lose]","[wa, da, person, nun, lose]",{}
3,came from being a xanax addict to a bad as bit...,0,Xanax,,"[came, from, being, a, xanax, addict, to, a, b...","[c, a, m, e, f, r, o, m, b, e, i, n, g, a, x, ...","[came, addict, bad, bitch, life, together, im,...","[came, addict, bad, bitch, life, togeth, im, y...","[come, addict, bad, bitch, life, together, im,...",{im}
4,what is going on with violent and super sleazy...,0,Adderall,,"[what, is, going, on, with, violent, and, supe...","[w, h, a, t, i, s, g, o, i, n, g, o, n, w, i, ...","[going, violent, super, sleazy, democrat, vict...","[go, violent, super, sleazi, democrat, victim,...","[go, violent, super, sleazy, democrat, victim,...","{ovaloffice, wtf}"
5,i went to rehab last summer cuz i wa withdraw ...,0,Xanax,,"[i, went, to, rehab, last, summer, cuz, i, wa,...","[i, w, e, n, t, t, o, r, e, h, a, b, l, a, s, ...","[went, rehab, last, summer, cuz, wa, withdraw,...","[went, rehab, last, summer, cuz, wa, withdraw,...","[go, rehab, last, summer, cuz, wa, withdraw, n...","{stepdad, rehab, minookaconfesions, cuz}"
6,extremely aerosmith voice im baack back on my ...,0,Prozac,,"[extremely, aerosmith, voice, im, baack, back,...","[e, x, t, r, e, m, e, l, y, a, e, r, o, s, m, ...","[extremely, aerosmith, voice, im, baack, back,...","[extrem, aerosmith, voic, im, baack, back, aga...","[extremely, aerosmith, voice, im, baack, back,...","{im, baack, aerosmith, agaaaiin}"
7,this is officially an uncomfortable xanax infu...,0,Xanax,,"[this, is, officially, an, uncomfortable, xana...","[t, h, i, s, i, s, o, f, f, i, c, i, a, l, l, ...","[officially, uncomfortable, infused, friday, f...","[offici, uncomfort, infus, friday, fun, giant,...","[officially, uncomfortable, infuse, friday, fu...",{}
8,trump is sharing his adderall,0,Adderall,,"[trump, is, sharing, his, adderall]","[t, r, u, m, p, i, s, s, h, a, r, i, n, g, h, ...","[trump, sharing]","[trump, share]","[trump, share]",{}
9,he never ever not once ever taken adderall mor...,0,Adderall,,"[he, never, ever, not, once, ever, taken, adde...","[h, e, n, e, v, e, r, e, v, e, r, n, o, t, o, ...","[never, ever, ever, taken, moron, hed, never, ...","[never, ever, ever, taken, moron, hed, never, ...","[never, ever, ever, take, moron, hed, never, t...","{hed, google, youll}"


In [33]:
# Lower-cased vocabulary of the NLTK 'words' corpus, built once for all lookups.
english_vocab = {entry.lower() for entry in words.words()}


def unusual(lemm_tokens):
    """Return the set of alphabetic tokens that are absent from ``english_vocab``.

    Tokens are lower-cased before the lookup. ``str.isalpha`` filters out
    punctuation tokens, and it also skips hyphenated tokens such as
    'browser-based' entirely because they are not purely alphabetic.
    """
    alphabetic_vocab = {token.lower() for token in lemm_tokens if token.isalpha()}
    return alphabetic_vocab - english_vocab
    

In [34]:
# Collect, per tweet, the lemmatized tokens that are not in the English vocabulary.
tweets_dataset['unusual'] = tweets_dataset['lemm_tokens'].apply(unusual)

In [165]:
# NOTE(review): the execution count of this cell (In [165]) shows it was run out of order.
# The 'words' corpus downloaded here is read by words.words() when english_vocab is built,
# so on a fresh kernel this download has to happen before that cell runs.
import nltk
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     /Users/saeedahmadgill/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [35]:
# Write the fully processed frame to disk, omitting the index column.
tweets_dataset.to_csv("../output/final_tweets_dataset_all.csv", index=False)