In [1]:
%ls

In [2]:
try:
    from google.colab import drive

    drive.mount('./drive/')

    %cd drive/My \ Drive/Text_Summarization

except:
    print("No Colab Environment")

No Colab Environment


In [3]:
import json
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib
import textwrap


In [4]:
parameters = {'axes.labelsize': 25,
          'axes.titlesize': 35,
             'xtick.labelsize':20,
             "figure.figsize" : (20, 10)
             }


plt.rcParams.update(parameters)

NameError: name 'plt' is not defined

# Load Data

In [None]:
df_overview = pd.read_csv("./Data/processed/Telehealth.csv")
df_overview.head()

## Pre and Post Covid Dataset

In [None]:
df_precovid = df_overview[df_overview["Classification"] == "Pre-Covid"]
df_covid = df_overview[df_overview["Classification"] == "Covid"]

## EDA

In [None]:
import matplotlib
import matplotlib.pyplot as plt

from yellowbrick.text import FreqDistVisualizer, TSNEVisualizer, DispersionPlot, PosTagVisualizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

matplotlib.rcParams["figure.figsize"] = (20, 7)

In [None]:
_ = df_overview.hist()

In [None]:
_ = df_precovid.hist()

In [None]:
_ = df_covid.hist()

In [None]:
datasets = [df_overview,df_covid,df_precovid]
dataset_labels = ["Overview","During Covid","Pre-Covid"]

In [None]:
# There are not strong linear correlations between the lengths or Date Published

fig, ax = plt.subplots(1,3)

fig.tight_layout(h_pad=2,w_pad=8)

for index, dataset in enumerate(datasets):

    corr = dataset.corr()

    mask = np.zeros_like(corr)
    mask[np.triu_indices_from(mask)] = True

    sns.heatmap(corr,ax=ax[index], annot=True,square=True,mask=mask)
    ax[index].set_title(f"{dataset_labels[index]}")


In [None]:
fig, ax = plt.subplots(1,3,figsize=(30,10))

fig.tight_layout(h_pad=2,w_pad=8)


for index, dataset in enumerate(datasets):

    dataset[["Journal Title","Content"]].rename(columns={"Content":"Count"}).groupby("Journal Title").count().sort_values("Count",ascending=False).T.plot(kind="bar"
    ,ax=ax[index]
    ,title=f"Article Frequency by Journal: {dataset_labels[index]}")
    

# Text Mining - frequency counts of words, length of sentences, presence/absence of specific words

In [None]:
def countplots(data,dataset_name,vectorizers,cols=4,):

    rows = len(vectorizers)
    cols = cols #dynamically change n_grams and plots

    fig, ax = plt.subplots(rows,cols,figsize=(30,15))
    fig.tight_layout(h_pad=2,w_pad=cols*4)

    for index, vect in enumerate(vectorizers):

        for i in range(cols):

            vectorizer = vect(stop_words=None,ngram_range=(i+1,i+1),lowercase=True)
            
            vectorizer_name = str(vectorizer)
            
            vectorizer_title = vectorizer_name[:vectorizer_name.find("(")]
            
            documents = vectorizer.fit_transform(data)
            
            features = vectorizer.get_feature_names()
            
            if len(vectorizers) < 2:
                axis = ax[i]
            else:
                axis = ax[index][i]
            
            visualizer = FreqDistVisualizer(features=features, orient="h",n=10,ax=axis)
            
            axis.set_title(f"Dataset: {dataset_name} - {vectorizer_title} {vectorizer.ngram_range}")
            
            _ = visualizer.fit(documents) 
            #ax[index,i].set_yticklabels([textwrap.fill(phrase, 20) for phrase in data.head(25)])



# Keyword Extraction Based on Count



In [None]:
countplots(df_overview["Clean_Content"],"Overview",[CountVectorizer,TfidfVectorizer],cols=2)


In [None]:
countplots(df_covid["Clean_Content"],"Covid",[CountVectorizer,TfidfVectorizer],cols=2)


In [None]:
countplots(df_precovid["Clean_Content"],"Pre-covid",[CountVectorizer,TfidfVectorizer],cols=2)


# Keyword Extraction from Given Dataset - Count

In [None]:
parsed_words = df_overview["Parsed_Keywords"].to_list()
countplots(parsed_words,"Overview",[CountVectorizer,TfidfVectorizer],cols=2)

In [None]:
parsed_words = df_overview["Parsed_Keywords"].to_list()
countplots(parsed_words,"Overview",[CountVectorizer,TfidfVectorizer],cols=2)

## Keyword Extraction Gensim

In [None]:
import gensim
from gensim.summarization import keywords, mz_keywords
full_text = ''
for i in df_overview["Clean_Content"]:
  full_text = full_text + i
Overall_Keywords = keywords(full_text, words = 10, scores = True) #Returns list of keywords and their relevance scores
# print(Overall_Keywords)
Overall1 = pd.DataFrame(Overall_Keywords)
Overall1.head()

In [None]:
Overall2 = pd.DataFrame(mz_keywords(full_text, scores=True, split = True, weighted=False, threshold=1.0))
Overall2.head()

In [None]:
full_text = ''
for i in df_precovid["Clean_Content"]:
  full_text = full_text + i
Pre_Keywords = keywords(full_text, words = 10, scores = True) #Returns list of keywords and their relevance scores
# print(Overall_Keywords)
pre1 = pd.DataFrame(Pre_Keywords)
pre2 = pd.DataFrame(mz_keywords(full_text,blocksize = 25, scores=True, split = True, weighted=False, threshold=.5))

In [None]:
full_text = ''
for i in df_covid["Clean_Content"]:
  full_text = full_text + i
Covid_Keywords = keywords(full_text, words = 10, scores = True) #Returns list of keywords and their relevance scores
# print(Overall_Keywords)
covid1 = pd.DataFrame(Covid_Keywords)
covid2 = pd.DataFrame(mz_keywords(full_text, blocksize = 25, scores=True, split = True, weighted=False, threshold=.5))

In [None]:
covid2[0][:10].tolist

In [None]:
data = {'Overall Keywords': Overall1[0][:10].to_list(), 'Overall MZ Keywords':Overall2[0][:10].to_list(), 'Prepandemic Keywords':pre1[0][:10].to_list(), 'Prepandemic MZ Keywords':pre2[0][:10].to_list(), 'Covid Keywords':covid1[0][:10].to_list(), 'Covid MZ Keywords': covid2[0][:10].tolist()}
data.keys()

In [None]:
merged_df = pd.DataFrame(data=data)
merged_df

In [None]:
merged_df.to_csv('./references/Keywords.csv')