In [None]:
import pandas as pd 
# Load the tweet export, decoding it explicitly as ISO-8859-1, and preview
# the first rows.
tweets = pd.read_csv(
    "tweets.csv",
    encoding="ISO-8859-1",
)
tweets.head()

In [None]:
# Preview the first few raw tweet bodies.
tweets.loc[:, "text"].head()

In [None]:
import string
from nltk.corpus import stopwords as nltk_stopwords

# Import the corpus reader under its own alias so the word list below does not
# overwrite it. Previously `stopwords` was rebound from the reader to a list,
# which made this cell fail with AttributeError when it was run a second time.
stopwords = nltk_stopwords.words("english")

def _clean(txt):
    txt = txt.lower()
    txt = "".join(x for x in txt if x not in string.punctuation)
    words = txt.split()
    words = [w for w in words if w not in stopwords]
    txt = " ".join(words)
    return txt

# Store a normalised copy of every tweet next to the original, then show the
# two columns side by side.
tweets["cleaned"] = tweets["text"].apply(_clean)
tweets.loc[:, ["text", "cleaned"]]

In [None]:
## Keyword Analysis
from collections import Counter

# Join every tweet into one document, clean it once, then rank the remaining
# tokens by frequency.
complete_text = " ".join(tweets["text"])
clean_text = _clean(complete_text)
keyword_counts = Counter(clean_text.split())
keyword_counts.most_common(100)

In [None]:
## Top Mentions
# Every whitespace-delimited token of the raw corpus that begins with "@".
mentions = list(filter(lambda token: token.startswith("@"), complete_text.split()))
mention_counts = Counter(mentions)
mention_counts.most_common(100)

In [None]:
## Top HashTags
# Tokens starting with "#", skipping any tag whose lower-cased form contains
# "demo".
htags = [
    token
    for token in complete_text.split()
    if token.startswith("#") and "demo" not in token.lower()
]
Counter(htags).most_common(100)

In [None]:
## Top URLs
# Links get their own variable: this cell used to store them in `htags`, which
# silently replaced the hashtag list built by the hashtag cell.
# Tokens starting with "http" are kept unless their lower-cased form contains
# "demon".
urls = [
    token
    for token in complete_text.split()
    if token.startswith("http") and "demon" not in token.lower()
]
Counter(urls).most_common(100)

In [None]:
from nltk import ngrams

# Build bigrams tweet by tweet. Pairing words over the concatenated corpus
# also paired the last word of each tweet with the first word of the next one,
# producing word pairs that never occur together in any tweet.
bigrams = (
    pair
    for cleaned in tweets["cleaned"]
    for pair in ngrams(cleaned.split(), 2)
)
Counter(bigrams).most_common(100)

In [None]:
## NER 
import nltk
from nltk import word_tokenize, pos_tag 
from nltk.chunk import tree2conlltags

# Print the GPE and ORGANIZATION chunks found in each tweet.
# The chunk's own label is compared instead of searching its printed form,
# which could also match text inside the entity and print a chunk twice.
for text in tweets["text"]:
    entities = nltk.ne_chunk(pos_tag(word_tokenize(text)))
    for chunk in entities:
        # Children without a `label` attribute are skipped.
        if hasattr(chunk, "label") and chunk.label() in ("GPE", "ORGANIZATION"):
            print(chunk)

In [None]:
## Sentiment analysis 
from textblob import TextBlob
# Sentiment of a sentence built around the word "hate".
negative_blob = TextBlob("many people hate policy changes such as Demonitization")
negative_blob.sentiment

In [None]:
# Sentiment of a sentence built around the word "happy".
positive_blob = TextBlob("Indians are happy after from Demonitization")
positive_blob.sentiment

In [None]:
## Topic Modelling 
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np 

def generate_topic_models(text, n_topics=10, max_features=2000, min_df=4):
    """Fit an LDA topic model on a collection of documents.

    The documents are turned into a bag-of-words count matrix with
    ``CountVectorizer`` and an online ``LatentDirichletAllocation`` model
    (20 iterations, ``random_state=42``) is fitted on it.

    Parameters
    ----------
    text : iterable of str
        One document per element.
    n_topics : int, default 10
        Number of LDA components to fit.
    max_features : int, default 2000
        Passed to ``CountVectorizer(max_features=...)``.
    min_df : int or float, default 4
        Passed to ``CountVectorizer(min_df=...)``.

    Returns
    -------
    topic_word : numpy.ndarray
        ``lda_model.components_`` of the fitted model.
    vocab : list of str
        Vocabulary terms of the fitted vectorizer.
    """
    cvectorizer = CountVectorizer(min_df=min_df, max_features=max_features)
    cvz = cvectorizer.fit_transform(text)

    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        learning_method="online",
        max_iter=20,
        random_state=42,
    )
    # Only the fitted components are returned, so the extra transform pass that
    # fit_transform() performs is not needed.
    lda_model.fit(cvz)

    topic_word = lda_model.components_
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for installations that predate it.
    if hasattr(cvectorizer, "get_feature_names_out"):
        vocab = cvectorizer.get_feature_names_out().tolist()
    else:
        vocab = cvectorizer.get_feature_names()
    return topic_word, vocab

n_top_words = 10
topic_word, vocab = generate_topic_models(tweets["cleaned"].values)

# For each topic, print the n_top_words vocabulary terms with the largest
# weights, largest first.
vocab_array = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    strongest_first = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_array[strongest_first]
    print("Topic {}: {}\n".format(i + 1, " | ".join(topic_words)))

## Ideas for linking the extracted information

- Descriptive Stats 
  example: who are the most-mentioned people, which locations show the most negative sentiment, etc.
- TimeSeries Insights 
  example: how does the information change over time
- What are the action items
- Use this information in Recommendation Engines 
- Use this information in Machine Learning Models 
- Use this information to create knowledge banks 