In [29]:
import pandas as pd
import re


In [30]:
import nltk
nltk.download('stopwords')

from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91818\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Database connection

In [31]:
data= pd.read_csv(r"C:\Users\91818\Desktop\dailywork\Datasets_Day21-Natural Language Processing(NLP)\Data.csv")

In [32]:
from sqlalchemy import create_engine

engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}"
                       .format(user="root",# user
                               pw="password", # passwrd
                               db="mydatabase")) #database

data.to_sql('tweets_data',con = engine, if_exists = 'replace', index = False)


In [33]:
sql = "SELECT * from tweets_data"


tweets_data = pd.read_sql_query(sql, engine)

In [34]:
data.head()

Unnamed: 0,tweet_id,sentiment,text,tweet_created,tweet_location,user_timezone
0,1,neutral,What @dhepburn said.,24/02/2015 11:35,,Eastern Time (US & Canada)
1,2,positive,plus you've added commercials to the experienc...,24/02/2015 11:15,,Pacific Time (US & Canada)
2,3,neutral,I didn't today... Must mean I need to take ano...,24/02/2015 11:15,Lets Play,Central Time (US & Canada)
3,4,negative,"it's really aggressive to blast obnoxious ""ent...",24/02/2015 11:15,,Pacific Time (US & Canada)
4,5,negative,and it's a really big bad thing about it,24/02/2015 11:14,,Pacific Time (US & Canada)


# Cleaning Text

In [35]:
def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

In [36]:
def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    return tweet


In [37]:
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest

STOPWORDS = set(stopwords.words('english') + list(punctuation))
MIN_WORD_PROP, MAX_WORD_PROP = 0.1, 0.9

In [38]:
def clean_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    #tweet = re.sub('['+punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in STOPWORDS] # remove stopwords

   # tweet_token_list = [word_rooter(word) if '#' not in word else word
    #                    for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet

In [39]:
data['clean_tweet'] = data.text.apply(clean_tweet)
text = " ".join(data['text'])

In [40]:
data.head()

Unnamed: 0,tweet_id,sentiment,text,tweet_created,tweet_location,user_timezone,clean_tweet
0,1,neutral,What @dhepburn said.,24/02/2015 11:35,,Eastern Time (US & Canada),said.
1,2,positive,plus you've added commercials to the experienc...,24/02/2015 11:15,,Pacific Time (US & Canada),plus added commercials experience... tacky.
2,3,neutral,I didn't today... Must mean I need to take ano...,24/02/2015 11:15,Lets Play,Central Time (US & Canada),today... must mean need take another trip!
3,4,negative,"it's really aggressive to blast obnoxious ""ent...",24/02/2015 11:15,,Pacific Time (US & Canada),"t's really aggressive blast obnoxious ""enterta..."
4,5,negative,and it's a really big bad thing about it,24/02/2015 11:14,,Pacific Time (US & Canada),really big bad thing


# Text summarizer

In [41]:
def compute_word_frequencies(word_sentences):
    words = [word for sentence in word_sentences 
                     for word in sentence 
                         if word not in STOPWORDS]
    counter = Counter(words)
    limit = float(max(counter.values()))
    word_frequencies = {word: freq/limit 
                                for word,freq in counter.items()}
    # Drop words if too common or too uncommon
    word_frequencies = {word: freq 
                            for word,freq in word_frequencies.items() 
                                if freq > MIN_WORD_PROP 
                                and freq < MAX_WORD_PROP}
    return word_frequencies

In [42]:
def sentence_score(word_sentence, word_frequencies):
    return sum([ word_frequencies.get(word,0) 
                    for word in word_sentence])

In [43]:
def summarize(text:str, num_sentences=3):
    """
    Summarize the text, by return the most relevant sentences
     :text the text to summarize
     :num_sentences the number of sentences to return
    """
    text = text.lower() # Make the text lowercase
    
    sentences = sent_tokenize(text) # Break text into sentences 
    
    # Break sentences into words
    word_sentences = [word_tokenize(sentence) for sentence in sentences]
    
    # Compute the word frequencies
    word_frequencies = compute_word_frequencies(word_sentences)
    
    # Calculate the scores for each of the sentences
    scores = [sentence_score(word_sentence, word_frequencies) for word_sentence in word_sentences]
    sentence_scores = list(zip(sentences, scores))
    
    # Rank the sentences
    top_sentence_scores = nlargest(num_sentences, sentence_scores, key=lambda t: t[1])
    
    # Return the top sentences
    return [t[0] for t in top_sentence_scores]

In [44]:
len(sent_tokenize(text))



21338

In [45]:
summarize(text, num_sentences=3)

["cant seem to get hail of the right area at all .. i know you guys are good i have asked lindsey to call me instead of dm like a schoolgirl but so far no call   getting tired of typing little notes give us some more clever advertising about #bagsflyfree is that because #badpolicy allows you to #cheatcustomers it's just the principle - it's hard to get mugged &amp; not be upset you took my money &amp; didn't give me anything #wrongiswrong customer service you shouldn't use those words wasted my time and took my money #badbussiness #theft http://t.co/63zaq2lt8f three hour flight to orlando and no wifi?",
 "my call is lost weather doesn't excuse poor customer service, lack of info from staff or the fact staff haven't turned up please help on hold 3 hours can't change flights i'm stuck here on hold 3 hours !!!!!!",
 "today usair cancelled flightled our rescheduled flight &amp; did not notify us except possibly to our home phone-not helpful since we are in co and i can't get through on any

# Topic Modeling-LDA

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
#pip install spacy
from spacy.lang.en.stop_words import STOP_WORDS as stopwords


tfidf_para_vectorizer = TfidfVectorizer(stop_words = stopwords, min_df = 5,max_df = 0.7)
tfidf_para_vectors = tfidf_para_vectorizer.fit_transform(data["text"])
tfidf_para_vectors.shape

from sklearn.feature_extraction.text import CountVectorizer

count_para_vectorizer = CountVectorizer(stop_words = stopwords, min_df = 5, max_df = 0.7)
count_para_vectors = count_para_vectorizer.fit_transform(data["text"])

from sklearn.decomposition import LatentDirichletAllocation

lda_para_model = LatentDirichletAllocation(n_components = 10, random_state = 42)
W_lda_para_matrix = lda_para_model.fit_transform(count_para_vectors)
H_lda_para_matrix = lda_para_model.components_

def display_topics(model, features, no_top_words = 5):
    for topic, word_vector in enumerate(model.components_):
        total = word_vector.sum()
        largest = word_vector.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print("  %s (%2.2f)" % (features[largest[i]],
                  word_vector[largest[i]]*100.0/total))

display_topics(lda_para_model, tfidf_para_vectorizer.get_feature_names())




Topic 00
  thank (3.01)
  dm (2.59)
  like (2.41)
  great (2.38)
  help (1.97)

Topic 01
  don (2.75)
  flight (1.70)
  sent (1.61)
  let (1.30)
  problem (1.06)

Topic 02
  flight (7.43)
  hours (3.24)
  plane (2.69)
  late (2.32)
  gate (2.25)

Topic 03
  cancelled (9.36)
  flight (8.58)
  flightled (4.47)
  flights (3.43)
  flighted (2.01)

Topic 04
  flight (2.41)
  change (1.74)
  phone (1.65)
  ticket (1.63)
  email (1.62)

Topic 05
  service (9.34)
  customer (7.34)
  phone (1.15)
  response (1.10)
  person (1.05)

Topic 06
  co (15.77)
  http (15.00)
  thank (2.71)
  jetblue (2.70)
  fleek (1.98)

Topic 07
  airline (3.56)
  fly (2.92)
  flying (1.94)
  experience (1.92)
  worst (1.85)

Topic 08
  thanks (9.77)
  amp (2.04)
  bag (1.62)
  trip (1.60)
  flight (1.30)

Topic 09
  flight (8.51)
  problems (1.88)
  delayed (1.80)
  flights (1.55)
  going (1.49)




In [47]:
import gensim
import string
from gensim import corpora
from nltk.corpus import stopwords

# Define the raw text
raw_text = "This is a very large piece of text. It can be a paragraph, an article or even a book. This text can contain multiple topics such as NLP, Machine learning, deep learning and so on. This text contains numbers like 12, 34, 56 and punctuations like . , ; !"
#raw_text=text


# Remove punctuations from the text
text_without_punct = raw_text.translate(str.maketrans('', '', string.punctuation))

# Tokenize the text
text_list = gensim.utils.simple_preprocess(text_without_punct)

# Remove stopwords
stop_words = stopwords.words("english")
text_list = [word for word in text_list if word not in stop_words]

# Create the Dictionary and Corpus
dictionary = corpora.Dictionary([text_list])
corpus = [dictionary.doc2bow(text_list)]

# Perform LSA
Num_Topics=3
lsa = gensim.models.LsiModel(corpus=corpus, id2word=dictionary, num_topics=Num_Topics)

# Print the topics
topics = lsa.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.530*"text" + 0.354*"learning" + 0.354*"like" + 0.177*"numbers"')


Named Entity Recognition 

In [48]:
from spacy import displacy
import en_core_web_sm
nlp = en_core_web_sm.load()

doc=text = " ".join(data['clean_tweet'][:10])

doc2 = nlp(doc)
displacy.render(doc2, style='ent', jupyter=True)

In [49]:
displacy.render(doc2, style='dep')

In [50]:
#POS Tagging

In [51]:
for token in doc2:
    print(token.text, token.pos_)

said VERB
. PUNCT
plus CCONJ
added VERB
commercials NOUN
experience NOUN
... PUNCT
tacky ADJ
. PUNCT
today NOUN
... PUNCT
must AUX
mean VERB
need VERB
take VERB
another DET
trip NOUN
! PUNCT
t PROPN
's PART
really ADV
aggressive ADJ
blast NOUN
obnoxious ADJ
" PUNCT
entertainment NOUN
" PUNCT
guests NOUN
' PART
faces NOUN
& CCONJ
amp ADJ
; PUNCT
little ADJ
recourse NOUN
really ADV
big ADJ
bad ADJ
thing NOUN
seriously ADV
would AUX
pay VERB
flight NOUN
seats NOUN
playing VERB
. PUNCT
really ADV
bad ADJ
thing NOUN
flying VERB
va INTJ
yes INTJ
, PUNCT
nearly ADV
every PRON
time NOUN
fly VERB
vx VERB
_ NOUN
_ NUM
ar NOUN
worm NOUN
_ NOUN
won VERB
_ NOUN
_ NOUN
go VERB
away ADV
:) PUNCT
really ADV
missed VERB
prime ADJ
opportunity NOUN
men NOUN
without ADP
hats NOUN
parody NOUN
, PUNCT
there ADV
. PUNCT
  SPACE
well ADV
, PUNCT
didn't_but CCONJ
do VERB
! PUNCT
: PUNCT
-d PUNCT
amazing ADJ
, PUNCT
arrived VERB
hour NOUN
early ADV
. PUNCT
good ADJ
me PRON
. PUNCT


In [None]:
#Topic modeling LSA

In [52]:
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation \
                                        , preprocess_string, strip_short, stem_text

# preprocess given text
def preprocess(text):
    
    # clean text based on given filters
    CUSTOM_FILTERS = [lambda x: x.lower(), 
                                remove_stopwords, 
                                strip_punctuation, 
                                strip_short, 
                                stem_text]
    text = preprocess_string(text, CUSTOM_FILTERS)
    
    return text

# apply function to all reviews 
data['Text (Clean)'] = data['text'].apply(lambda x: preprocess(x))

In [53]:
from gensim import corpora

# create a dictionary with the corpus
corpus = data['Text (Clean)']
dictionary = corpora.Dictionary(corpus)

# convert corpus into a bag of words
bow = [dictionary.doc2bow(text) for text in corpus]

In [54]:
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

# find the coherence score with a different number of topics


In [55]:
# perform SVD on the bag of words with the LsiModel to extract 2 topics
lsi = LsiModel(bow, num_topics=2, id2word=dictionary)

In [56]:
# find the 5 words with the srongest association to the derived topics
for topic_num, words in lsi.print_topics(num_words=5):
    print('Words in {}: {}.'.format(topic_num, words))

Words in 0: 0.903*"flight" + 0.221*"cancel" + 0.111*"delai" + 0.104*"hour" + 0.091*"flightl".
Words in 1: 0.550*"thank" + 0.336*"servic" + 0.318*"custom" + -0.261*"flight" + 0.210*"you".
