# Pre Processing Functions

These functions are responsible for generating the objects needed for data visualization.

In [None]:
def create_per_topic_tuple_df(topic_level_sentiment, df_topics):
    """Combine per-topic sentiment and relevancy into one dataframe of tuples.

    Args:
        topic_level_sentiment: Dictionary of articles that contains the
            sentiment analysis value for each topic. It is converted with
            ``dictionaryToDataFrame`` and then transposed.
        df_topics: Dataframe of articles (rows) and topics (columns) where
            each cell is the relevancy of the topic for that article.

    Returns:
        Dataframe of articles (rows) and topics (columns) where each cell is a
        tuple of the values found for that article and topic, with the
        sentiment frame's value first: ``(sentiment, relevancy)``.
    """
    sentiment_df = dictionaryToDataFrame(topic_level_sentiment).transpose()

    # The topic labels come out as strings; cast them to int (presumably so
    # they match the integer topic columns of df_topics -- verify with caller).
    sentiment_df = sentiment_df.rename(columns=int)

    # Stack the two frames on top of each other, then collapse all rows that
    # share an index label so every cell becomes a tuple of the stacked values.
    return pd.concat([sentiment_df, df_topics]).groupby(level=0).agg(tuple)

In [2]:
def _dominant_topic(doc_topics):
    """Return the (topic, score) pair with the highest score, or (-1, 0) if none is positive."""
    best_topic, best_score = -1, 0
    for topic, score in doc_topics:
        # Strict comparison: on a tie the first topic encountered wins.
        if score > best_score:
            best_topic, best_score = topic, score
    return best_topic, best_score


def _shorten_url(url):
    """Return the base site name of a URL (the part before the first '/'), without its scheme."""
    scheme, separator, remainder = url.partition("://")
    # Only treat the prefix as a scheme when it precedes any path segment, so a
    # URL embedded in a query string (".../?next=https://...") is not mistaken
    # for the address itself. This handles http://, https:// and other schemes.
    if separator and "/" not in scheme:
        url = remainder
    return url.split("/", 1)[0]


def preprocess_dataframe_for_datavis(dataframe, lda_model, corpus):
    """Add topic information and shortened addresses to the sentiment dataframe.

    The input dataframe is deep-copied, so the original is left unchanged.
    The copy gains four columns:

    * ``Topics`` -- the result of ``lda_model.get_document_topics(corpus)``.
    * ``Main Topic`` -- the topic with the highest relevancy score per article
      (``-1`` when an article has no topic with a positive score).
    * ``Main Topic Score`` -- the relevancy score of that main topic.
    * ``Shortened Address`` -- the base site name taken from the ``URL``
      column, to keep hover text easy to read.

    Args:
        dataframe: Generated sentiment analysis dataframe; must contain a
            ``URL`` column of strings.
        lda_model: LDA model exposing ``get_document_topics``.
        corpus: Corpus for the texts, passed to ``get_document_topics``.

    Returns:
        A new dataframe with the added columns, sorted by ``Main Topic`` so it
        is in order in the graph.
    """
    # Deep copy so the caller's dataframe is never modified.
    df = dataframe.copy(deep=True)

    df['Topics'] = lda_model.get_document_topics(corpus)

    # Find the most relevant topic (and its score) for each article.
    dominant = [_dominant_topic(doc_topics) for doc_topics in df['Topics']]
    df['Main Topic'] = [topic for topic, _ in dominant]
    df['Main Topic Score'] = [score for _, score in dominant]

    # Shorten the URLs to their base site names for the hover text.
    df['Shortened Address'] = [_shorten_url(url) for url in df['URL']]

    return df.sort_values('Main Topic')

In [None]:
def generate_topic_relevancy_dataframe(LDA_model, corpus=None):
    """Build a dataframe of articles (rows) by topics (columns) of relevancy scores.

    A cell holds the relevancy of that topic for that article, so cell [2, 3]
    with a value of 0.988 means article 2 has a 0.988 relevancy score for
    Topic 3.

    Args:
        LDA_model: LDA model exposing
            ``get_document_topics(item, minimum_probability=...)``, returning
            ``(topic, score)`` pairs for one document.
        corpus: Iterable of documents to score. When omitted, the module-level
            ``corpus`` variable is used, which is what this function relied on
            before the parameter existed.

    Returns:
        Dataframe with one row per document (indexed 0..n-1) and one column
        per topic id. A topic missing from a document's result is filled with
        0.0. An empty corpus yields an empty dataframe.

    Raises:
        NameError: If ``corpus`` is not given and no module-level ``corpus``
            exists.
    """
    if corpus is None:
        try:
            corpus = globals()["corpus"]
        except KeyError:
            raise NameError(
                "name 'corpus' is not defined; pass it via the corpus argument"
            ) from None

    # One {topic: score} mapping per document. Building the frame in a single
    # call aligns scores by topic id and avoids growing it one row at a time.
    rows = [
        dict(LDA_model.get_document_topics(item, minimum_probability=0.0))
        for item in corpus
    ]
    return pd.DataFrame(rows).fillna(0.0)

# Data Visualization Functions

Below are our functions that generate data visualizations.

In [None]:
def visualize_all_articles_on_main_topic(df):
    """Plot every article by main topic (x) against its sentiment score (y).

    Takes the (pre-processed) main dataframe. Marker size follows the main
    topic score, and the hover text shows the shortened address, the sentiment
    label and the main topic score. The figure is displayed, not returned.
    """
    # Columns exposed to the hover template as customdata[0..2].
    hover_columns = ['Shortened Address', 'Sentiment Label', 'Main Topic Score']
    hover_lines = [
        "Address: %{customdata[0]}",
        "Sentiment Label: %{customdata[1]}",
        "Main Topic Score :%{customdata[2]}",
    ]

    figure = px.scatter(
        df,
        x="Main Topic",
        y="Sentiment Score",
        size="Main Topic Score",
        custom_data=hover_columns,
        title="Articles Sorted By Main Topic",
    )
    figure.update_traces(hovertemplate="<br>".join(hover_lines))

    # Treat the topic numbers as categories rather than a continuous axis.
    figure.update_xaxes(type="category")

    figure.show()

In [None]:
def visualize_topic_cluster_TSNE(LDA_model, df):
    """Show a 2D t-SNE clustering of articles based on their topic relevancy.

    Args:
        LDA_model: LDA model. Its ``num_topics`` attribute is used when
            present; otherwise the topic count falls back to
            ``len(LDA_model.print_topics())``.
        df: Main (pre-processed) dataframe. Must contain the columns
            ``Topics`` (iterable of ``(topic, weight)`` pairs per article),
            ``Main Topic``, ``Main Topic Score``, ``Shortened Address`` and
            ``Sentiment Label``. It is not modified.

    The figure is displayed, not returned. Points are coloured by main topic
    and sized by main topic score.
    """
    # NOTE(review): assumes a gensim-style model, where print_topics() lists
    # at most 20 topics by default, so prefer the explicit attribute.
    num_topics = getattr(LDA_model, "num_topics", None)
    if num_topics is None:
        num_topics = len(LDA_model.print_topics())

    # Dense topic-weight matrix: one row per article, 0 for absent topics.
    topic_weights = []
    for doc_topics in df["Topics"]:
        weights = [0.0] * num_topics
        for topic_id, weight in doc_topics:
            weights[topic_id] = weight
        topic_weights.append(weights)
    arr = np.asarray(topic_weights, dtype=float)

    # tSNE Model Creation
    tsne_model = TSNE(n_components=2, verbose=1,
                      random_state=0, angle=.99,
                      init='pca', perplexity=(arr.shape[0] - 1) / 3)
    tsne_lda = tsne_model.fit_transform(arr)

    # Sort by main topic (as ints) so the legend is in numeric order. The same
    # permutation is applied to the t-SNE coordinates: they are passed to
    # plotly positionally, so they must stay paired with their rows.
    main_topic = df['Main Topic'].apply(int).to_numpy()
    order = np.argsort(main_topic, kind="stable")
    plot_df = df.iloc[order].copy()  # copy: leave the caller's dataframe alone
    coords = tsne_lda[order]

    # Strings make plotly treat the main topic as categorical data.
    plot_df['Main Topic'] = [str(topic) for topic in main_topic[order]]

    fig_cluster = px.scatter(plot_df, x=coords[:, 0], y=coords[:, 1],
                             custom_data=['Shortened Address', 'Sentiment Label', 'Main Topic'],
                             color="Main Topic",
                             size="Main Topic Score",
                             title="Topic Clustering Graph",
                             labels=dict(color="Main Topic"))

    # Hover text shows what was placed in custom_data.
    fig_cluster.update_traces(hovertemplate="<br>".join(["Address: %{customdata[0]}",
                                                         "Sentiment Label: %{customdata[1]}",
                                                         "Main Topic:%{customdata[2]}"]))

    fig_cluster.show()

In [None]:
def visualize_single_topic_subjectivity_vs_sentiment(df, topic_num):
    """Plot subjectivity against sentiment for the articles of one main topic.

    Takes the main (pre-processed) dataframe and the topic number to
    visualize. Only rows whose ``Main Topic`` equals ``topic_num`` are drawn:
    subjectivity score on the x-axis (higher is more subjective), sentiment
    score on the y-axis, marker size from the main topic score. The figure is
    displayed, not returned.
    """
    is_selected_topic = df['Main Topic'] == topic_num
    articles_in_topic = df[is_selected_topic]

    plot_title = f"Sentiment Analysis on Topic {topic_num!s}"

    figure = px.scatter(
        articles_in_topic,
        x="Subjectivity Score",
        y="Sentiment Score",
        size="Main Topic Score",
        hover_name="Shortened Address",
        title=plot_title,
    )
    figure.show()

# K-means clustering and helper functions

In [None]:
def visualize_optimal_cluster_count(df_topics, max_clusters, pca_components):
    """Plot the k-means inertia (WCSS) for a range of cluster counts.

    Takes the topic relevancy dataframe, reduces it with PCA to
    ``pca_components`` components, then fits one k-means model for every
    cluster count in ``range(2, max_clusters)`` -- i.e. from 2 up to, but not
    including, ``max_clusters`` -- and plots each model's inertia against its
    cluster count. The plot is displayed, nothing is returned.
    """
    # Apply principal component analysis before clustering.
    reduced = PCA(pca_components).fit_transform(df_topics)

    cluster_counts = range(2, max_clusters)
    inertias = [
        KMeans(n_clusters=k, init="k-means++", n_init=10).fit(reduced).inertia_
        for k in cluster_counts
    ]

    # Plot inertia for the different numbers of clusters.
    plt.figure(figsize=(10, 10))
    plt.plot(cluster_counts, inertias)
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()

In [None]:
def visualize_kmeans_clustering(df_topics, num_clusters, pca_components):
    """Run k-means on the topic relevancy dataframe and plot it in 2D.

    The dataframe is reduced with PCA to ``pca_components`` components and
    clustered into ``num_clusters`` groups, so the clustering is based solely
    on the topic relevancy of each article. The first two PCA components of
    every article are drawn as a scatter plot, one series per cluster, and
    each cluster center is marked with a black 'X'. The plot is displayed,
    nothing is returned.
    """
    # Apply principal component analysis before clustering.
    reducer = PCA(pca_components)
    reduced = reducer.fit_transform(df_topics)

    kmeans = KMeans(n_clusters=num_clusters, init="k-means++", n_init=10)
    assignments = kmeans.fit_predict(reduced)
    centroids = np.array(kmeans.cluster_centers_)

    plt.figure(figsize=(10, 10))

    # One scatter series per cluster so each gets its own legend entry.
    for cluster_id in np.unique(assignments):
        members = assignments == cluster_id
        plt.scatter(reduced[members, 0], reduced[members, 1], label=cluster_id)

    # Mark the centroid of each cluster.
    plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", color='k')

    plt.legend()
    plt.show()
    