In [1]:
#Importing required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set() 
import dask.dataframe as dd
import multiprocessing
import numpy as np

In [2]:
# Loading data using pandas library and creating an hdf5 file using vaex 

cols = ['Target', 'ID', 'Date', 'Query', 'User', 'Text']
raw_tweets = pd.read_csv('sentiment_140/training.1600000.processed.noemoticon.csv', encoding='latin-1', names = cols)

In [3]:
# the most important thing is the text and the sentiment of it so we will get rid of the rest of the columns

raw_tweets = raw_tweets[['Target', 'Text']]

In [4]:
#Making sure we do not have duplicates

print(raw_tweets.shape)
raw_tweets.drop_duplicates(inplace = True)
print(raw_tweets.shape)

(1600000, 2)
(1583691, 2)


In [5]:
#importing libraries for cleaning text
import nltk
import re

In [6]:
#Creating a function that deletes extra letters when they are repeated more than 2 times
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

print(reduce_lengthening( "finallllllly" ))

finally


In [7]:
from pattern.en import suggest

word = "amazzziiing"
word_wlf = reduce_lengthening(word) #calling function defined above
print(word_wlf) #word lengthening isn't being able to fix it completely

correct_word = suggest(word_wlf) 
print(correct_word)

amazziing
[('amazing', 1.0)]


In [8]:
#Creating a function to clean text and git rid of punctuations, stopwords, and lowering the text
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
from nltk.stem import PorterStemmer
from pattern.en import suggest
ps = PorterStemmer()

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    #reducing charachters that repeat more than twice
    reduced_tokens = [reduce_lengthening(token) for token in tokens]
    #correcting the spelling of words
    corrected = [suggest(token)[0][0] for token in reduced_tokens]
    # filter stopwords and non english words out of document
    filtered_tokens = [token for token in corrected if token not in stop_words]
    # re-create document from filtered tokens
    stem_tokens = [ps.stem(word) for word in filtered_tokens]
    doc = ' '.join(stem_tokens)
    return doc


In [9]:
sample = "I eat 3 upple's with my frend!"
normalize_document(sample)

'eat appl friend'

In [10]:
# defining a function to clean the text in our dataframe.
def dataframe_normalize(row):
    row.Text = normalize_document(row.Text)
    return(row)

In [11]:
#Using dask for faster process
norm_tweets = dd.from_pandas(raw_tweets, npartitions=4*multiprocessing.cpu_count())\
    .map_partitions(lambda df: df.apply(dataframe_normalize,axis =1))\
    .compute(scheduler='processes')

In [12]:
norm_tweets.head()

Unnamed: 0,Target,Text
0,0,switchfoot httptwitpiccomyzl www summer got da...
1,0,upset updat facebook test might cri result sch...
2,0,kenichan dive mani time ball manag save rest g...
3,0,whole bodi feel itchi like fire
4,0,nationwideclass behav mad see


In [43]:
sample = norm_tweets[:2]
sample

Unnamed: 0,Target,Text
0,0,switchfoot httptwitpiccomyzl www summer got da...
1,0,upset updat facebook test might cri result sch...


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

def Tfidf_extractor(corpus, ngram_range=(1,1)):
    vectorizer = TfidfVectorizer(min_df = 50,
                                 norm = 'l2',
                                 smooth_idf=True,
                                 use_idf= True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [14]:
#Building tfidf vectorizer and get corpus feature vectors
tfidf_vectorizer, tfidf_features = Tfidf_extractor(norm_tweets.Text.values)

In [112]:
import gensim

model = gensim.models.Word2Vec(norm_tweets.Text.str.split().values, size = 100, window = 30, min_count = 50, sample =1e-3)

In [121]:
# define function to compute tfidf weighted averaged word vector for a document

def tfidf_wtd_avg_word_vectors(words, tfidf_vector, tfidf_vocabulary, model, num_features):
    word_tfidfs = [tfidf_vector[0, tfidf_vocabulary.get(word)]
                   if tfidf_vocabulary.get(word)
                   else 0 for word in words]
    
    word_tfidf_map = {word:tfidf_val for word, tfidf_val in zip(words, word_tfidfs)}
    feature_vector = np.zeros((num_features,),dtype="float64")
    vocabulary = set(model.wv.index2word)
    wts = 0.
    for word in words:
        if word in vocabulary:
            word_vector = model[word]
            weighted_word_vector = word_tfidf_map[word] * word_vector
            wts = wts + word_tfidf_map[word]
            feature_vector = np.add(feature_vector, weighted_word_vector)
    if wts:
        feature_vector = np.divide(feature_vector, wts)
    return feature_vector

In [122]:
# generalize above function for a corpus of documents
from nltk import word_tokenize

def tfidf_weighted_averaged_word_vectorizer(corpus, tfidf_vectors,
                                   tfidf_vocabulary, model, num_features):
    docs_tfidfs = [(doc.split(), doc_tfidf)
                   for doc, doc_tfidf
                   in zip(corpus, tfidf_vectors)]
    features = [tfidf_wtd_avg_word_vectors(tokenized_sentence, tfidf, tfidf_vocabulary,model, num_features)
                    for tokenized_sentence, tfidf in docs_tfidfs]
    return np.array(features)

In [126]:
# get tfidf weights and vocabulary
vocab = tfidf_vectorizer.vocabulary_
wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(corpus=norm_tweets.Text.values,
                            tfidf_vectors=tfidf_features, tfidf_vocabulary=vocab, model=model,num_features=100)

  


In [127]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(wt_tfidf_word_vec_features, norm_tweets.Target.values, random_state=42, test_size=0.3)

In [34]:
from sklearn.linear_model import LogisticRegression


lr = LogisticRegression()
lr.fit(x_train,y_train)
print('Training Accuracy = {}'.format(lr.score(x_train, y_train)))
print('Test Accuracy = {}'.format(lr.score(x_test, y_test)))

Training Accuracy = 0.5005650298852069
Test Accuracy = 0.5022092675509353


In [75]:
from sklearn import metrics

def get_metrics(true_labels, predicted_labels):
    print('Accuracy:', np.round(metrics.accuracy_score(true_labels,predicted_labels),2))
    print('Precision:', np.round(metrics.precision_score(true_labels, predicted_labels, average='weighted'),2))
    print('Recall:', np.round(metrics.recall_score(true_labels, predicted_labels, average='weighted'),2))
    print('F1 Score:', np.round(metrics.f1_score(true_labels, predicted_labels, average='weighted')))

In [129]:
def train_predict_evaluate_model(classifier,
                                 train_features, train_labels,
                                 test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features)
    # evaluate model prediction performance  
    get_metrics(true_labels=test_labels,
                predicted_labels=predictions)
    return predictions

In [134]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
rf = RandomForestClassifier(n_jobs= -1)
svm = SGDClassifier(loss='hinge', n_jobs= -1)

In [132]:
train_predict_evaluate_model(svm,x_train,y_train,x_test,y_test)

Accuracy: 0.7
Precision: 0.7
Recall: 0.7
F1 Score: 1.0


array([1, 0, 0, ..., 1, 0, 1], dtype=int64)

In [135]:
train_predict_evaluate_model(rf,x_train,y_train,x_test,y_test)

Accuracy: 0.71
Precision: 0.71
Recall: 0.71
F1 Score: 1.0


array([1, 0, 0, ..., 1, 0, 1], dtype=int64)

# Entity Recogintion

In [60]:
def parse_document(document):
    document = re.sub('\n', ' ', document)
    if isinstance(document, str):
        document = document.lower()
    elif isinstance(document, unicode):
        return unicodedata.normalize('NFKD', document).encode('ascii', 'ignore').lower()
    else:
        raise ValueError('Document is not string or unicode!')
    document = document.strip()
    sentences = nltk.sent_tokenize(document)
    sentences = [sentence.strip() for sentence in sentences]
    return sentences

In [56]:
def dataframe_normalize2(row):
    row.Text = parse_document(row.Text)
    return(row)

In [59]:
norm_tweets = dd.from_pandas(raw_tweets, npartitions=4*multiprocessing.cpu_count())\
    .map_partitions(lambda df: df.apply(dataframe_normalize2,axis =1))\
    .compute(scheduler='processes')

In [24]:
norm_tweets.Target.replace(4,1, inplace =True)
norm_tweets.Target.replace(0,-1, inplace =True)

In [77]:
norm_tweets.Text[0]

["@switchfoot http://twitpic.com/2y1zl - awww, that's a bummer.",
 'you shoulda got david carr of third day to do it.',
 ';d']

In [78]:
import spacy

nlp = spacy.load('en_core_web_sm')
Person = {}
GPE = {}
ORG = {}
for tweet, sent in zip(norm_tweets.Text.values,norm_tweets.Target.values):
    doc = nlp(tweet[0])
    for ent in doc.ents: 
        if ent.label_ == 'PERSON':
            if ent.text in Person.keys():
                Person[ent.text]+=sent
            else:
                Person[ent.text]=sent
        elif ent.label_ == 'GPE':
            if ent.text in GPE.keys():
                GPE[ent.text]+=sent
            else:
                GPE[ent.text]=sent
        elif ent.label_ == 'ORG': 
            if ent.text in ORG.keys():
                ORG[ent.text]+=sent
            else:
                ORG[ent.text]=sent
                    

In [79]:
Person_df = pd.Series(Person,index=Person.keys())
Person_df.sort_values(inplace = True)
ORG_df = pd.Series(ORG, index = ORG.keys())
ORG_df.sort_values(inplace = True)
GPE_df = pd.Series(GPE, index = GPE.keys())
GPE_df.sort_values(inplace = True)

In [80]:
Person_df[:20]

kate              -625
argh              -438
u                 -331
dang              -272
farrah fawcett    -237
jon               -192
xbox              -152
kinda sad         -126
ed mcmahon        -124
nyc               -113
david carradine   -105
jay leno          -102
grrrr             -102
xbox live          -66
grrrrr             -52
doh                -52
sooo               -49
throat             -49
kinda              -47
max                -46
dtype: int64

### Most negative sentiments for known names is for Farrah Fawcett, David Carradine, and Ed McMahon that must be because of their deaths.

In [81]:
Person_df[-20:]

mm                  49
god                 54
adam                54
hehe                57
star trek           58
andy                59
miley               60
david archuleta     62
joe                 64
kim                 72
amy                 77
tom                 83
hun                 84
matt                88
yay                100
haha               103
woohoo             134
harry potter       141
chillin            144
hannah montana     173
dtype: int64

### Hannah Montana, Harry Potter, David Archuleta, Star Trek, and God are the known entities with positive sentiment

In [83]:
ORG_df[:20]

ugh          -1073
house         -310
headache      -230
grr           -223
air france    -199
jon &amp      -190
at&amp;t      -186
feelin        -147
mac           -146
aww           -136
urgh          -130
atm           -129
doc           -126
nooo          -122
cat           -115
msn           -114
jon           -112
bf            -106
blah           -91
apple          -88
dtype: int64

### Interestingly the most negative sentiments are regarding Air France and Apple products

In [85]:
ORG_df[-20:]

@banksyart           78
www.m2e.asia         79
wooo                 80
i &lt;3              87
@mrtweet             95
@kristenstewart9    110
@therealjordin      124
chillin             144
@ashleytisdale      158
@ddlovato           188
@youngq             189
hahaha              193
lol                 202
mmm                 220
u                   221
@jonathanrknight    252
yay                 318
ur                  340
mtv                 504
haha                728
dtype: int64

### Most Positive tweets are for MTV, Demi Lovato, Banksy, David Archie

In [87]:
GPE_df[:20]

iran        -378
kinda       -359
uk          -249
us          -185
dallas      -168
canada      -148
chicago     -119
noooooo     -105
cleveland    -97
china        -92
texas        -85
booo         -83
miami        -81
india        -81
houston      -78
florida      -76
toronto      -76
boston       -73
america      -63
tehran       -61
dtype: int64

In [88]:
GPE_df[-20:]

p.s                28
me&quot            29
kris               31
amsterdam          33
berlin             35
eminem             36
eatin              38
ï¿½                43
yaaay              50
hollywood          54
@alyssa_milano     59
yayyy              64
hehehe             75
philippines        75
twitterville       76
norway             90
germany            92
jonas             143
yummy             214
hehe              466
dtype: int64

# Sentiment Analysis Using TextBlob

In [64]:
from textblob import TextBlob

In [65]:
def dataframe_normalize3(row):
    row.Text = cleanTxt(row.Text)
    return(row)

In [66]:
# Create a function to clean the tweets
def cleanTxt(text):
    text = re.sub('@[A-Za-z0–9]+', '', text) #Removing @mentions
    text = re.sub('#', '', text) # Removing '#' hash tag
    text = re.sub('RT[\s]+', '', text) # Removing RT
    text = re.sub('https?:\/\/\S+', '', text) # Removing hyperlink
 
    return text


# Clean the tweets
df = dd.from_pandas(raw_tweets, npartitions=4*multiprocessing.cpu_count())\
    .map_partitions(lambda df: df.apply(dataframe_normalize3,axis =1))\
    .compute(scheduler='processes')

# Show the cleaned tweets
df.head()

Unnamed: 0,Target,Text
0,-1,"- Awww, that's a bummer. You shoulda got Da..."
1,-1,is upset that he can't update his Facebook by ...
2,-1,I dived many times for the ball. Managed to s...
3,-1,my whole body feels itchy and like its on fire
4,-1,"no, it's not behaving at all. i'm mad. why am..."


In [70]:
df.Target.replace(-1,0, inplace = True)

In [72]:
# Create a function to get the subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

# Create a function to get the polarity
def getPolarity(text):
    return  TextBlob(text).sentiment.polarity


# Create two new columns 'Subjectivity' & 'Polarity'
def dataframe_normalize4(row):
    row['Subjectivity'] = getSubjectivity(row.Text)
    row['Polarity'] = getPolarity(row.Text)
    return(row)

df= dd.from_pandas(df, npartitions=4*multiprocessing.cpu_count())\
    .map_partitions(lambda df: df.apply(dataframe_normalize4,axis =1))\
    .compute(scheduler='processes')
# Show the new dataframe with columns 'Subjectivity' & 'Polarity'
df.head()

Unnamed: 0,Target,Text,Subjectivity,Polarity
0,0,"- Awww, that's a bummer. You shoulda got Da...",0.633333,0.216667
1,0,is upset that he can't update his Facebook by ...,0.0,0.0
2,0,I dived many times for the ball. Managed to s...,0.5,0.5
3,0,my whole body feels itchy and like its on fire,0.4,0.2
4,0,"no, it's not behaving at all. i'm mad. why am...",1.0,-0.625


In [74]:
def getAnalysis(score):
    if score < 0:
        return 0
    else:
        return 1
df['Analysis'] = df['Polarity'].apply(getAnalysis)

In [76]:
get_metrics(df.Target.values, df.Analysis.values)

Accuracy: 0.61
Precision: 0.66
Recall: 0.61
F1 Score: 1.0
