### *In this notebook we visualize the news category dataset. We hope it helps fellow beginners get started with data visualization.*

## Let's start

In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import tokenize
from tensorflow.keras.preprocessing.text import Tokenizer,  text_to_word_sequence
import warnings
warnings.filterwarnings('ignore')


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
# Load the news dataset from the read-only ../input/ directory.
df = pd.read_csv("../input/news-category-dataset/final_news_df.csv")
# Seed the preview so the same four rows are shown on every Restart & Run All.
df.sample(4, random_state=42)

In [8]:
# Merge the three text fields into one column. A space is inserted between the
# fields so the last word of one field does not fuse with the first word of the
# next. A NaN in any field still makes the combined text NaN; such rows are
# dropped below.
df['text'] = df['headline'] + ' ' + df['short_description'] + ' ' + df['keywords']
df.drop(['headline', 'short_description', 'date', 'keywords'], axis=1, inplace=True)
df.dropna(how='all', axis=0, subset=['text'], inplace=True)
# Renumber the surviving rows 0..n-1.
df = df.reset_index(drop=True)

### Plot each article's length

In [9]:
# Character count of every article (NaN where the text itself is missing).
article_chars = df['text'].str.len()
df['News_length'] = article_chars
# Discard rows in which both the text and its length are missing.
df.dropna(subset=['text', 'News_length'], how='all', axis=0, inplace=True)
# The lengths are whole numbers, so store them as integers.
df['News_length'] = df['News_length'].astype(int)

In [10]:
sns.displot(df['News_length'])
# News_length comes from text.str.len(), i.e. it counts characters, not words.
# The original note put the peak of this distribution at about 200.

### Plot number and length of each sentence in each article

In [11]:
# Rebuild a 0..n-1 index so the loop below can look rows up with df.text[index].
df=df.reset_index(drop=True)

In [12]:
# Trim surrounding whitespace and lowercase every article.
df['text'] = df['text'].str.strip().str.lower()

In [14]:
# all_texts: one list of sentences per article, in row order.
all_texts = [
    tokenize.sent_tokenize(df.text[row])
    for row in range(df.text.shape[0])
]

# sentence_num: how many sentences each article contains.
sentence_num = [len(article_sentences) for article_sentences in all_texts]

# sentence_len: number of words in every sentence of every article, flattened.
sentence_len = [
    len(text_to_word_sequence(one_sentence))
    for article_sentences in all_texts
    for one_sentence in article_sentences
]

In [15]:
# Distribution of the number of words per sentence over all articles.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour
# of histplot/displot -- confirm against the installed version.
sns.distplot(sentence_len,bins=200)     # shorter sentences are the most common

In [16]:
# sentence_num holds the number of sentences in each article, so this plots the
# distribution of sentence counts per record (the original note points at a
# peak around 2 sentences).
sns.distplot(sentence_num)

# Now we perform NLP operations to get cleaner data

In [17]:
from nltk.corpus import  stopwords
from nltk.stem import WordNetLemmatizer
import re

def clean_texts(df):
    """Normalise the ``text`` column of ``df`` in place and return ``df``.

    Every text has its non-letter characters replaced by spaces, is lowercased
    and split on whitespace, has English stop words removed, and has each
    remaining word lemmatised. The words are then re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column of strings. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``text`` overwritten.
    """
    lemmatizer = WordNetLemmatizer()
    # Fetch the stop-word list once and keep it as a set. The original
    # evaluated stopwords.words('english') again for every single word.
    stop_words = set(stopwords.words('english'))

    corpus = []
    # Iterate over the values rather than df.text[i]: the latter is a label
    # lookup and fails when the index is not exactly 0..n-1.
    for raw_text in df['text']:
        words = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
        kept = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        corpus.append(' '.join(kept))
    df['text'] = corpus

    return df

In [18]:
# clean_texts() overwrites df['text'] and returns the very frame it was given,
# so df1 is another name for df rather than an independent copy.
df1=clean_texts(df)

## Word Cloud

In [19]:
from wordcloud import WordCloud

In [20]:
def wordcloud_generator(words):
    """Build a word cloud from the string ``words`` and show it with matplotlib."""
    cloud = WordCloud(
        width=800,
        height=600,
        random_state=42,
        max_font_size=100,
    )
    cloud.generate(words)

    plt.figure(figsize=(8, 8))
    plt.imshow(cloud, interpolation='bilinear')
    plt.show()
    

def word_category(category):
    """Draw a word cloud from all texts of one news category.

    Parameters
    ----------
    category
        Value compared against ``df1['category']`` to select the rows.

    Raises
    ------
    ValueError
        If no row of ``df1`` belongs to ``category``.
    """
    subset = df1[df1['category'] == category]
    if subset.empty:
        raise ValueError(f"no rows found for category {category!r}")
    # Join with a space: with an empty separator the last word of one article
    # and the first word of the next merge into a single bogus token.
    words = ' '.join(subset['text'].values)
    wordcloud_generator(words)
    
word_category('POLITICS')  # pass the category whose word cloud should be drawn

# Now we add a few more columns: word count, character count and average word length

In [22]:
# Split every text once and derive all three statistics from the token lists.
tokens_per_text = df1['text'].str.split()
df1["wordcount"] = tokens_per_text.str.len()
# Characters are counted per word, so whitespace is excluded.
df1['char_count'] = tokens_per_text.apply(lambda tokens: sum(map(len, tokens)))
df1['avg_word_len'] = (df1['char_count'] / df1['wordcount']).round(2)

In [23]:
sns.set_theme(palette="husl")
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))

# Left panel: histogram of char_count per category; right panel: histogram + KDE.
# Each series carries its own label so the legend entry is tied to the artist
# it describes instead of relying on the order in which artists were added.
for category in df1['category'].unique():
    char_counts = df1.loc[df1['category'] == category, 'char_count']
    sns.distplot(char_counts, kde=False, bins=10, ax=ax[0], label=category)
    sns.distplot(char_counts, kde=True, ax=ax[1], label=category)

ax[0].legend()
ax[1].legend()

# Let's use TextBlob to measure the sentiment polarity of each text record

In [24]:
from textblob import TextBlob

In [25]:
# Polarity score of every text as reported by TextBlob, kept to two decimals.
polarity = df1['text'].apply(lambda text: TextBlob(text).sentiment.polarity)
df1['sentiment'] = polarity.round(2)

In [26]:
sns.set_theme(palette="husl")
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))
fig.suptitle('Sentiment polarity in different categories', fontsize=10)

# Left panel: histogram only; right panel: KDE curve only.
# Each series carries its own label so the legend entry is tied to the artist
# it describes instead of relying on the order in which artists were added.
for category in df1['category'].unique():
    polarity = df1.loc[df1['category'] == category, 'sentiment']
    sns.distplot(polarity, kde=False, hist=True, ax=ax[0], label=category)
    sns.distplot(polarity, kde=True, hist=False, ax=ax[1], label=category)

ax[0].legend()
ax[1].legend()

In [None]:
# Spot check: show the text stored for the row labelled 1.
df1.loc[1,'text']


# Named Entity Recognition

In [27]:
import spacy
from spacy import displacy

# Load the large English pipeline; `ner` is reused by later cells.
ner = spacy.load("en_core_web_lg")
text = 'RIM CEO Thorsten Heins Significant Plans For a Party.'
parsed = ner(text)
# Kept for backward compatibility: the tuple of entity spans found in `text`.
doc = parsed.ents
# Render the parsed document itself so the entities are highlighted inside the
# sentence; passing only the entity spans drops the surrounding words.
displacy.render(parsed, style='ent')

## **IMPORTANT NOTE**:
### RIM==> tag ,  ORG==>tag_type 
### Thorsten Heins==>tag, PERSON==>tag_type

In [28]:
# For every text, collect (entity text, entity label) for each entity spaCy finds.
df1['tags'] = df1['text'].map(
    lambda text: [(entity.text, entity.label_) for entity in ner(text).ents]
)

Now each row has a list of (tag, tag_type) tuples in the `tags` column.

# In the next few cells we count each time a tag or tag_type occurs and join the counts with our dataframe

In [29]:
import collections 
def utils_lst_count(lst):
    """Count the items of ``lst`` and return them as single-entry dicts.

    The result is a list of ``{item: count}`` dicts ordered by descending
    count; items with equal counts keep their first-seen order.
    """
    ranked = collections.Counter(lst).most_common()
    return [{item: occurrences} for item, occurrences in ranked]

In [38]:
# Replace each row's list of (tag, tag_type) tuples by its counted form.
# This overwrites the column it reads, so the cell is meant to run only once.
df1['tags'] = df1['tags'].apply(utils_lst_count)

In [31]:
def utils_ner_features(lst_dics_tuples, tag):
    """Return how many entity occurrences of type ``tag`` a row contains.

    Parameters
    ----------
    lst_dics_tuples : list of dict
        Dicts whose keys are sequences with the tag type at position 1 and
        whose values are occurrence counts.
    tag
        The tag type to count.

    Returns
    -------
    int
        Sum of the counts of all keys whose tag type equals ``tag``; 0 when
        the list is empty, holds only empty dicts, or has no matching key.
    """
    # A single pass replaces the original's Counter, which was rebuilt from the
    # whole accumulated list inside the innermost loop and was left undefined
    # when every dict was empty.
    return sum(
        occurrences
        for dic_tuples in lst_dics_tuples
        for key, occurrences in dic_tuples.items()
        if key[1] == tag
    )

### We add a column for each tag_type that holds how many entities of that type were found in the corresponding row

In [32]:
# Collect every tag type that occurs anywhere in the tags column. sorted()
# makes the order of the new columns the same on every run; iterating a plain
# set of strings can yield a different order in each interpreter session.
tags_set = sorted({
    key[1]
    for dics in df1["tags"]
    for dic in dics
    for key in dic
})

# One count column per tag type, named "tags_<type>".
for feature in tags_set:
    df1["tags_" + feature] = df1["tags"].apply(
        lambda dics, feature=feature: utils_ner_features(dics, feature)
    )

## Here we plot summation of each tag_type count 

In [33]:
# Pick the per-type count columns by their "tags_" prefix instead of by
# position, so the plot does not depend on how many columns precede them.
tag_columns = [column for column in df1.columns if str(column).startswith("tags_")]
# estimator=sum plots the total per tag type; barplot's default is the mean.
sns.barplot(data=df1[tag_columns], estimator=sum)
plt.xticks(rotation=90)

#  most used tag_type are 'person' followed by 'org'

# Now plotting the most frequent tags in a specific news category

In [34]:
def popular_tags(category):
    """Rank the named-entity tags that occur in one news category.

    Parameters
    ----------
    category
        Value compared against ``df1['category']`` to select the rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``type``, ``tag`` and ``count``, sorted by ``count`` in
        descending order. ``count`` is the number of tag dicts that name the
        (tag, type) pair; the occurrence counts stored as dict values are not
        used. The frame is empty when the category has no rows or no tags.
    """
    tag_lists = df1.loc[df1["category"] == category, "tags"]
    # Flatten with a comprehension. Series.sum() concatenated the lists pairwise
    # and returned the integer 0 for an empty selection, which then failed.
    map_lst = [key for dics in tag_lists for dic in dics for key in dic]
    dtf_tags = pd.DataFrame(map_lst, columns=['tag', 'type'])
    dtf_tags["count"] = 1
    dtf_tags = (
        dtf_tags.groupby(['type', 'tag'])
        .count()
        .reset_index()
        .sort_values("count", ascending=False)
    )
    return dtf_tags


def plot_popular_tags(category='WORLD NEWS', top_n=10):
    """Bar-plot the most frequent named-entity tags of one news category.

    Parameters
    ----------
    category
        Category passed on to ``popular_tags``. Defaults to ``'WORLD NEWS'``,
        the value that used to be hard-coded.
    top_n : int
        Number of top-ranked tags to draw. Defaults to 10.
    """
    dtf_tags = popular_tags(category)
    fig, ax = plt.subplots()
    fig.suptitle("Top frequent tags", fontsize=12)
    # dodge=False keeps one full-width bar per tag; hue only colours by type.
    sns.barplot(x="count", y="tag", hue="type",
                data=dtf_tags.iloc[:top_n, :], dodge=False, ax=ax)
    plt.show()

In [35]:
popular_tags('BUSINESS')  # pass the category whose tag counts should be listed

# one is the most popular word/tag in Business category

# Let's do something similar with unigrams and bigrams

In [36]:
import nltk
# Tokenise all POLITICS texts as one long string.
corpus = df1.loc[df1["category"] == 'POLITICS', 'text']
lst_tokens = tokenize.word_tokenize(corpus.str.cat(sep=' '))
fig, ax = plt.subplots(nrows=2, ncols=1)
fig.suptitle("Most frequent words", fontsize=15)

# One panel per n-gram size: (n, panel title, target axes).
for ngram_size, panel_title, panel_ax in ((1, "Unigrams", ax[0]), (2, "Bigrams", ax[1])):
    dic_words_freq = nltk.FreqDist(nltk.ngrams(lst_tokens, ngram_size))
    dtf_uni = pd.DataFrame(dic_words_freq.most_common(),
                           columns=["Word", "Freq"])
    # n-grams are tuples of tokens; show them as space-separated strings.
    dtf_uni['Word'] = dtf_uni['Word'].apply(' '.join)
    # Keep the ten most frequent entries, sorted ascending so the largest bar is on top.
    top_ten = dtf_uni.set_index("Word").iloc[:10, :].sort_values(by="Freq")
    top_ten.plot(kind="barh", title=panel_title, ax=panel_ax,
                 legend=False).grid(axis='x')
plt.yticks(fontsize=8, rotation=0)