In [None]:
import pandas as pd
import numpy as np

# Read the dataset from disk and preview its first three rows.
news = pd.read_csv('new.csv')
news.head(n=3)

In [None]:
# Distribution of tweet lengths, counted in characters.
tweet_lengths = news['tweet'].str.len()
tweet_lengths.hist()

In [None]:
# Distribution of the mean word length (in characters) of each tweet:
# split on whitespace, measure every word, then average per tweet.
(
    news['tweet']
    .str.split()
    .apply(lambda words: [len(word) for word in words])
    .map(lambda lengths: np.mean(lengths))
    .hist()
)

In [None]:
import nltk
# Fetch NLTK's stopword corpus; a later cell reads the English list from it.
nltk.download('stopwords')


In [None]:
from nltk.corpus import stopwords
# The 'stopwords' corpus is already downloaded by the previous cell (which also
# provides the `nltk` import), so it is not fetched a second time here.
from collections import defaultdict

stop = set(stopwords.words('english'))

# Flatten the whitespace-tokenized tweets into one list of words.
new = news['tweet'].str.split().values.tolist()
corpus = [word for words in new for word in words]

# Count how often each stopword occurs in the corpus.
# NOTE(review): the membership test is case-sensitive; if the stopword list is
# all lowercase (presumably it is — confirm), capitalized forms are not counted.
dic = defaultdict(int)
for word in corpus:
    if word in stop:
        dic[word] += 1

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the 15 most frequent stopwords (word on x, count on y).
top = sorted(dic.items(), key=lambda item: item[1], reverse=True)[:15]
x, y = zip(*top)
plt.bar(x, y)

**The chart shows that the most frequent tokens are stopwords, which carry little meaning. We can remove them to get cleaner data.**



In [None]:
from collections import Counter
import seaborn as sns

counter = Counter(corpus)
most = counter.most_common()

# Among the 40 most frequent tokens, keep only those that are not stopwords.
frequent = [(word, count) for word, count in most[:40] if word not in stop]
x = [word for word, _ in frequent]
y = [count for _, count in frequent]

# Horizontal bars: counts along x, words along y.
sns.barplot(x=y, y=x)

In [None]:
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer

def get_top_ngram(corpus, n=None, top_k=10):
    """Return the most frequent n-grams in a collection of text documents.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents; tokenization is done by ``CountVectorizer``.
    n : int, optional
        Size of the n-grams to count. ``None`` (the default) counts single
        words, since ``ngram_range=(None, None)`` is not a valid range for
        ``CountVectorizer``.
    top_k : int, optional
        How many n-grams to return (default 10).

    Returns
    -------
    list of (str, count)
        The ``top_k`` n-grams with their total counts, most frequent first.
    """
    if n is None:
        n = 1

    vec = CountVectorizer(ngram_range=(n, n))
    # Learn the vocabulary and build the document-term matrix in one pass.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give each n-gram's total count across all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_k]

In [None]:
 # Ten most frequent bigrams in the tweets, drawn as horizontal bars.
 top_n_bigrams = get_top_ngram(news['tweet'], 2)[:10]
 x, y = (list(column) for column in zip(*top_n_bigrams))
 sns.barplot(x=y, y=x)

In [None]:
# Most frequent trigrams in the tweets, drawn as horizontal bars.
top_tri_grams = get_top_ngram(news['tweet'], n=3)
x, y = (list(column) for column in zip(*top_tri_grams))
sns.barplot(x=y, y=x)

In [None]:
!pip install pyLDAvis

In [None]:
import nltk
import gensim
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize
import pyLDAvis.gensim_models as gensimvis


# Data needed by word_tokenize and WordNetLemmatizer below.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# 'punkt'; fetching both keeps word_tokenize working on old and new versions.
nltk.download('punkt_tab')
nltk.download('wordnet')

def preprocess_news(df):
    """Tokenize, stopword-filter and lemmatize every entry of ``df['tweet']``.

    Parameters
    ----------
    df : DataFrame-like
        Must provide a ``'tweet'`` column of text.

    Returns
    -------
    list of list of str
        One list of lemmatized tokens per row, in row order. Tokens found in
        the module-level ``stop`` set and tokens of two characters or fewer
        are dropped.
    """
    lem = WordNetLemmatizer()
    corpus = []
    for text in df['tweet']:
        # Compare in lower case so capitalized stopwords are removed as well
        # (assumes the entries of `stop` are lowercase — TODO confirm).
        words = [w for w in word_tokenize(text) if w.lower() not in stop]
        # Drop very short tokens and reduce the rest to their lemma.
        words = [lem.lemmatize(w) for w in words if len(w) > 2]
        corpus.append(words)
    return corpus



In [None]:
# From here on `corpus` is a list of token lists (one per tweet).
corpus = preprocess_news(news)

In [None]:
# Map every token to an integer id, then encode each document as a
# bag-of-words via Dictionary.doc2bow.
dic = gensim.corpora.Dictionary(corpus)
bow_corpus = list(map(dic.doc2bow, corpus))

In [None]:
# Fit a 4-topic LDA model. LDA is stochastic, so the seed is fixed via
# random_state instead of leaving the initialization unseeded.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=4,
    id2word=dic,
    passes=10,
    workers=2,
    random_state=42,
)
lda_model.show_topics()

In [None]:
# `import pyLDAvis.gensim_models as gensimvis` binds only the name `gensimvis`,
# so the top-level package has to be imported before `pyLDAvis` can be used.
import pyLDAvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, bow_corpus, dic)
vis

In [None]:
from wordcloud import WordCloud, STOPWORDS
# NOTE: this rebinds `stopwords`, the name an earlier cell imported from
# nltk.corpus; from here on it is the wordcloud stopword set.
stopwords = set(STOPWORDS)


def _wordcloud_text(data):
    """Flatten ``data`` into one space-separated string of words."""
    if isinstance(data, str):
        return data
    try:
        items = list(data)
    except TypeError:
        # Not iterable: fall back to its plain string form.
        return str(data)
    pieces = []
    for item in items:
        if isinstance(item, (list, tuple, set)):
            pieces.append(' '.join(str(token) for token in item))
        else:
            pieces.append(str(item))
    return ' '.join(pieces)


def show_wordcloud(data):
    """Render a word cloud of ``data`` with matplotlib.

    ``data`` may be a string, an iterable of strings, or an iterable of token
    lists. It is joined into plain text first: passing ``str(data)`` for a
    nested list would feed the Python repr (brackets, quotes, commas) to
    WordCloud, and those characters can end up attached to the words.
    """
    cloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=100,
        max_font_size=30,
        scale=3,
        random_state=1)

    cloud = cloud.generate(_wordcloud_text(data))

    plt.figure(1, figsize=(12, 12))
    plt.axis('off')

    plt.imshow(cloud)
    plt.show()


show_wordcloud(corpus)

**Named Entity Recognition Step**

In [None]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline; `nlp` is used by the NER cells below.
nlp = spacy.load("en_core_web_sm")

Example:

In [None]:
# Run the pipeline on one sample sentence and list each entity with its label.
doc = nlp(
    'India and Iran have agreed to boost the economic viability '
    'of the strategic Chabahar port through various measures, '
    'including larger subsidies to merchant shipping firms using the facility, '
    'people familiar with the development said on Thursday.'
)

[(entity.text, entity.label_) for entity in doc.ents]

In [None]:
from spacy import displacy

# Highlight the entities of the sample sentence inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

Application on Data:

In [None]:
def ner(text):
    """Return the label of every named entity that ``nlp`` finds in ``text``.

    NOTE: a later cell redefines ``ner`` to return entity texts for a single
    label; re-run this cell before reusing this label-returning version.
    """
    return [entity.label_ for entity in nlp(text).ents]

# Collect the entity labels of every 'fulltext' entry into one flat list,
# then tally how often each label occurs.
ent = news['fulltext'].apply(ner)
ent = [label for labels in ent for label in labels]

counter = Counter(ent)
count = counter.most_common()

In [None]:
# Horizontal bars: label frequency along x, entity label along y.
x, y = (list(column) for column in zip(*count))
sns.barplot(x=y, y=x)

In [None]:
def ner(text, ent="GPE"):
    """Return the text of each entity in ``text`` whose label equals ``ent``.

    ``ent`` defaults to ``"GPE"``. This definition replaces the earlier
    one-argument ``ner`` that returned entity labels.
    """
    return [entity.text for entity in nlp(text).ents if entity.label_ == ent]

# Ten most frequent "GPE" entities (the default label of `ner`).
gpe = news['fulltext'].apply(ner)
gpe = [name for names in gpe for name in names]
counter = Counter(gpe)

x, y = map(list, zip(*counter.most_common(10)))
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
# Counts go on the x axis and names on the y axis, as in the other bar plots.
sns.barplot(x=y, y=x)

In [None]:
# Ten most frequent "PERSON" entities.
per = news['fulltext'].apply(lambda text: ner(text, "PERSON"))
per = [name for names in per for name in names]
counter = Counter(per)

x, y = map(list, zip(*counter.most_common(10)))
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
# Counts go on the x axis and names on the y axis, as in the other bar plots.
sns.barplot(x=y, y=x)

In [None]:
# Ten most frequent "ORG" entities.
org = news['fulltext'].apply(lambda text: ner(text, "ORG"))
org = [name for names in org for name in names]
counter = Counter(org)

x, y = map(list, zip(*counter.most_common(10)))
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
# Counts go on the x axis and names on the y axis, as in the other bar plots.
sns.barplot(x=y, y=x)