In [50]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt 

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer


from textblob import TextBlob
from wordcloud import WordCloud

import nltk
from nltk.probability import FreqDist

#Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [51]:
data = pd.read_csv("data/1600000.processed.noemoticon.csv", encoding='latin-1')
# o.decode('latin-1').encode("utf-8")

In [52]:
print(data.head())

   0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY _TheSpecialOne_  \
0  0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   scotthamilton   
1  0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY        mattycus   
2  0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY         ElleCTF   
3  0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          Karoli   
4  0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY        joy_wolf   

  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  
0  is upset that he can't update his Facebook by ...                                                                   
1  @Kenichan I dived many times for the ball. Man...                                                                   
2    my whole body feels itchy and like its on fire                                                                    
3  @nationwideclass no, it's not behaving at all....           

In [53]:
new_data = data.iloc[:, [0,5]]

print(new_data.head())

   0  \
0  0   
1  0   
2  0   
3  0   
4  0   

  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  
0  is upset that he can't update his Facebook by ...                                                                   
1  @Kenichan I dived many times for the ball. Man...                                                                   
2    my whole body feels itchy and like its on fire                                                                    
3  @nationwideclass no, it's not behaving at all....                                                                   
4                      @Kwesidei not the whole crew                                                                    


In [54]:
def clean_tweet(tweet):
    
    excluded_punct = [".", ",", ":", "^", ";", "-"]
    
    tweet_list = tweet.split()
    clean_tokens = [re.sub('@[^\s]+','AT_USER', t) for t in tweet_list if re.match(r'[^\d]*$', t)]
    clean_s = ' '.join(clean_tokens)
    clean_url = re.sub(r'http\S+', '', clean_s)
    
    clean_punctuation = re.sub('(?<! )(?=[.,#!?()])|(?<=[.,#!?()])(?! )', ' ', clean_url)
    clean_mess = [word.lower() for word in clean_punctuation.split() if word.lower() not in stopwords.words('english') and word not in  excluded_punct]
    
    clean_apos = [item.replace("'", "") for item in clean_mess]
    
    return [item.replace("\"", "") for item in clean_apos]

cell = 1

print(new_data.iloc[cell, 1])
print("\n--------------\n")
print(clean_tweet(new_data.iloc[cell, 1]))

@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds

--------------

['at_user', 'dived', 'many', 'times', 'ball', 'managed', 'save', 'rest', 'go', 'bounds']


In [55]:
def normalization(tweet_list):
    lem = WordNetLemmatizer()
    normalized_tweet = []
    for word in tweet_list:
        normalized_text = lem.lemmatize(word,'v')
        normalized_tweet.append(normalized_text)
    return normalized_tweet

print(normalization(new_data.iloc[1,1].split()))

['@Kenichan', 'I', 'dive', 'many', 'time', 'for', 'the', 'ball.', 'Managed', 'to', 'save', '50%', 'The', 'rest', 'go', 'out', 'of', 'bound']


In [56]:
def delete_long_words(vocabulary):
    new_vocab = []
    for i in range(len(vocabulary)):
        if len(vocabulary[i]) <= 15:
            new_vocab.append(vocabulary[i])
            
    return new_vocab

In [57]:
def text_processing(tweet):
    return delete_long_words(normalization(clean_tweet(tweet)))

print(text_processing(new_data.iloc[1, 1]))

['at_user', 'dive', 'many', 'time', 'ball', 'manage', 'save', 'rest', 'go', 'bound']


### Create Bag Of Words

In [58]:
processed_sentence = new_data.iloc[:, 1].apply(text_processing).to_list()

vocabulary = [inner for outer in processed_sentence for inner in outer]

In [59]:
print(vocabulary[:10])

['upset', 'cant', 'update', 'facebook', 'texting', 'might', 'cry', 'result', 'school', 'today']


In [60]:
freq_dist = FreqDist(vocabulary)
freq_dist.most_common(10)

[('!', 10983),
 ('at_user', 8186),
 ('?', 3025),
 ('get', 2229),
 ('im', 2156),
 ('go', 2037),
 ('work', 1413),
 ('good', 1228),
 ('day', 1175),
 ('today', 1002)]

In [61]:
FreqDist(len(w) for w in vocabulary).most_common()

[(4, 41982),
 (5, 26810),
 (3, 21816),
 (7, 19098),
 (1, 17036),
 (6, 15800),
 (2, 8101),
 (8, 6141),
 (9, 3947),
 (10, 2166),
 (11, 1005),
 (12, 551),
 (13, 341),
 (14, 147),
 (15, 134),
 (0, 14)]

### Model

In [62]:
X_train, X_test, y_train, y_test = train_test_split(new_data.iloc[:, 1], new_data.iloc[:, 0], test_size=0.25)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2)

In [63]:
#param = [MultinomialNB(), LogisticRegression(penalty='l2', tol=0.001, C=1)]

# MNB : ; LR : 0.73933

alpha = [1, 0.1, 0.01, 0.001, 0]
prior = [False, True]


for i in range(len(prior)):
    for j in range(len(alpha)):

        pipeline = Pipeline([
            ('bow',CountVectorizer(analyzer=text_processing)), 
            ('tfidf', TfidfTransformer()),  
            ('classifier', MultinomialNB(alpha=alpha[j], fit_prior=prior[i])),  
        ])
        print("ALPHA : ", alpha[j], ", PRIOR : ", prior[i])
        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict(X_valid)
        print(accuracy_score(predictions, y_valid))

ALPHA :  1 , PRIOR :  False
0.7473333333333333
ALPHA :  0.1 , PRIOR :  False
0.7136666666666667
ALPHA :  0.01 , PRIOR :  False
0.7023333333333334
ALPHA :  0.001 , PRIOR :  False
0.702
ALPHA :  0 , PRIOR :  False


  'setting alpha = %.1e' % _ALPHA_MIN)


0.7003333333333334
ALPHA :  1 , PRIOR :  True
0.748
ALPHA :  0.1 , PRIOR :  True
0.7126666666666667
ALPHA :  0.01 , PRIOR :  True
0.701
ALPHA :  0.001 , PRIOR :  True
0.7013333333333334
ALPHA :  0 , PRIOR :  True


  'setting alpha = %.1e' % _ALPHA_MIN)


0.6996666666666667


In [64]:
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_valid)
print(classification_report(predictions, y_valid))
print(confusion_matrix(predictions, y_valid))
print(accuracy_score(predictions, y_valid))

KeyboardInterrupt: 

In [None]:

pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_processing)), 
    ('tfidf', TfidfTransformer()),  
    ('classifier', LogisticRegression(penalty='l2', tol=0.001, C=1)),  
])


In [None]:
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
print(classification_report(predictions, y_test))
print(confusion_matrix(predictions, y_test))
print(accuracy_score(predictions, y_test))

# 0.74275 

In [47]:
unique, counts = np.unique(predictions, return_counts=True)

print(np.asarray((unique, counts)).T)

[[   0 2461]
 [   4 2539]]


### Bi-grams

In [66]:
tokens = [inner for outer in processed_sentence for inner in outer]
def get_bigrams(tokens):
    return [(tokens[i],tokens[i+1]) for i in range(0,len(tokens)-1)]
 
bigrams_vocab = get_bigrams(tokens)

In [69]:
freq_dist = FreqDist(bigrams_vocab)
common_bigrams = freq_dist.most_common(15)

In [71]:
for i in common_bigrams:
    print(i[0])

('!', '!')
('!', 'at_user')
('?', '?')
('?', 'at_user')
('at_user', 'at_user')
('at_user', 'thank')
('good', 'morning')
('!', 'im')
('at_user', 'im')
('last', 'night')
('?', '!')
('at_user', 'oh')
('cant', 'wait')
('at_user', 'good')
('day', '!')


### Subjectivity and Objectivity

In [None]:
# how subjective or opinionated the text is; a score of 0 is fact, and a score of +1 is very much an opinion
def getSubjectivity(tweet):
    return TextBlob(tweet).sentiment.subjectivity

# how positive or negative the text is;
def getPolarity(tweet):
    return TextBlob(tweet).sentiment.polarity


sentences_info = {'Subjectivity':[], 'Polarity':[], 'Sentiment': []} 

concat_pro_words = new_data.iloc[:, 1].apply(text_processing).str.join(" ").to_list()

for i in range(len(concat_pro_words)):
    sentiment = getPolarity(concat_pro_words[i])
    sentences_info['Subjectivity'].append(getSubjectivity(concat_pro_words[i]))
    sentences_info['Polarity'].append(sentiment)
    
    if sentiment == 0:
        sentences_info['Sentiment'].append('Neutral')
    elif sentiment > 0:
        sentences_info['Sentiment'].append('Positive')
    else:
        sentences_info['Sentiment'].append('Negative')

    
sentences_info = pd.DataFrame(sentences_info) 


In [None]:
print(sentences_info.iloc[:5, ])

In [None]:
print(new_data.iloc[:, 0].value_counts())
print("\n-----------\n")
print(sentences_info.iloc[:, 2].value_counts())


In [None]:
print(concat_pro_words)

In [None]:
allWords = ' '.join([twts for twts in concat_pro_words])
wordCloud = WordCloud(width=500, height=300, random_state=42, max_font_size=100).generate(allWords)


plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()