In [7]:
import json
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
pd.options.mode.chained_assignment = None

In [8]:
# Load the tense-sequence data produced upstream. The context manager
# guarantees the file handle is closed even if json.load raises on malformed input.
with open('output/all_data.json') as f:
    news_tense = json.load(f)

In [9]:
# Flatten the nested {category: [article, ...]} structure into one record per
# article. Per-sentence tense-sequence entries get an 's_' prefix and
# full-article entries an 'f_' prefix so both families can live in one frame.
news_tense_seq_flat = []

for category, articles in news_tense.items():
    for article in articles:
        record = {'category': category, 'id': article['id']}
        record.update(
            (f's_{seq_name}', value)
            for seq_name, value in article['sentence_tense_seq'].items()
        )
        record.update(
            (f'f_{seq_name}', value)
            for seq_name, value in article['full_tense_seq'].items()
        )
        news_tense_seq_flat.append(record)

# A sequence missing from an article shows up as NaN after framing; treat it as 0.
df = pd.DataFrame(news_tense_seq_flat).fillna(0)

In [10]:
# Every column except the two identifier columns is a feature column.
columns = [key for key in df if key not in ("category", "id")]
# Split the feature columns by their leading letter: 's' for per-sentence
# features and 'f' for full-article features (the prefixes added when flattening).
sentence_columns = [key for key in columns if key[0] == 's']
full_article_columns = [key for key in columns if key[0] == 'f']

In [11]:
def get_category_cosine_similarity_mean(df):
    """Average cosine similarity between each row of ``df`` and its mean row.

    The sentence-level and full-article feature columns (the module-level
    ``sentence_columns`` and ``full_article_columns`` lists) are handled
    separately: for each group the column-wise mean vector is computed, every
    row is compared against it with cosine similarity, and the similarities
    are averaged.

    The input frame is not modified. (An earlier version appended a ``'mean'``
    row to ``df``, which altered the caller's frame and triggered pandas'
    chained-assignment warning when a slice was passed in.)

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to compare; must contain every column listed in
        ``sentence_columns`` and ``full_article_columns``.

    Returns
    -------
    dict
        ``{'sentence': <mean similarity>, 'full_article': <mean similarity>}``.
    """
    sentence_frame = df[sentence_columns]
    full_article_frame = df[full_article_columns]

    # cosine_similarity expects 2-D inputs, so each mean vector becomes a
    # single-row matrix of shape (1, n_features).
    sentence_value_mean = sentence_frame.mean().values.reshape(1, -1)
    full_article_value_mean = full_article_frame.mean().values.reshape(1, -1)

    sentence_cosine_similarities = cosine_similarity(sentence_frame.values, sentence_value_mean)
    full_article_cosine_similarities = cosine_similarity(full_article_frame.values, full_article_value_mean)

    return {
        'sentence': sentence_cosine_similarities.ravel().mean(),
        'full_article': full_article_cosine_similarities.ravel().mean(),
    }

In [12]:
# Report, per category, how closely its articles cluster around the category mean.
for category in df['category'].unique():
    is_in_category = df['category'] == category
    print(category, get_category_cosine_similarity_mean(df.loc[is_in_category]))

weather_news {'sentence': 0.7709106077203581, 'full_article': 0.8347927064181683}
technology_news {'sentence': 0.8382967588834873, 'full_article': 0.8510715539468406}
politics {'sentence': 0.9217693377856612, 'full_article': 0.9314782968421115}
business_news {'sentence': 0.8725802741044121, 'full_article': 0.8803336232105213}
sports_news {'sentence': 0.8812538763556228, 'full_article': 0.8970154362199009}
crime_news {'sentence': 0.8021961412717665, 'full_article': 0.8091326131262181}


In [13]:
# Mean feature vector per category. numeric_only=True keeps non-numeric columns
# (such as `id`, if it holds strings) out of the aggregation: pandas >= 2.0
# raises a TypeError for them instead of silently dropping them as older
# versions did.
df_grouped = df.groupby('category').mean(numeric_only=True).reset_index()
df_grouped

Unnamed: 0,category,s_present present,f_present present,s_present past,s_past past,s_past present,s_future present,s_past future,s_future past,f_present past,f_past past,f_past present,f_present future,f_future present,f_past future,f_future past,s_present future,s_future future,f_future future
0,business_news,0.338091,0.341157,0.181983,0.148922,0.1424,0.04558,0.026306,0.029792,0.159763,0.146994,0.159205,0.051445,0.050865,0.027174,0.026808,0.051596,0.009679,0.011677
1,crime_news,0.115822,0.112817,0.163405,0.358909,0.158606,0.009223,0.01545,0.019832,0.160645,0.36005,0.163779,0.011745,0.009876,0.016329,0.017707,0.011878,0.001841,0.002017
2,politics,0.222907,0.215529,0.20981,0.251119,0.187568,0.028574,0.030809,0.03002,0.199276,0.257788,0.199042,0.029812,0.03036,0.030648,0.029287,0.031704,0.005459,0.006227
3,sports_news,0.218572,0.213494,0.209177,0.266419,0.187759,0.02845,0.023575,0.02712,0.19825,0.271825,0.195582,0.030699,0.029591,0.026781,0.025148,0.029265,0.004302,0.006333
4,technology_news,0.295642,0.29592,0.179426,0.159851,0.15053,0.049262,0.030113,0.031154,0.165529,0.153437,0.1628,0.050041,0.051291,0.035268,0.028808,0.048753,0.008977,0.011139
5,weather_news,0.30312,0.331675,0.163886,0.089096,0.118307,0.083368,0.044273,0.07778,0.126496,0.077119,0.124966,0.088648,0.090945,0.044175,0.040853,0.061829,0.044705,0.070578


In [14]:
# Row order of the grouped frame; the similarity matrices built from it follow this order.
df_grouped['category']

0      business_news
1         crime_news
2           politics
3        sports_news
4    technology_news
5       weather_news
Name: category, dtype: object

In [15]:
# Per-category mean vectors as plain arrays (one row per category), split into
# the sentence-level and full-article feature groups.
sentence_values = df_grouped[sentence_columns].to_numpy()
full_article_values = df_grouped[full_article_columns].to_numpy()

In [16]:
# Pairwise cosine similarity between the per-category mean vectors; each result
# is a square matrix with one row and one column per category.
sentence_cosine_similarities, full_article_cosine_similarities = (
    cosine_similarity(category_means)
    for category_means in (sentence_values, full_article_values)
)

In [17]:
# Wrap the sentence-level similarity matrix in a frame: one column per
# category, plus a leading 'category' column naming the category of each row.
df_sentence_cosine_similarities = pd.DataFrame(
    data=sentence_cosine_similarities,
    columns=df_grouped['category'],
)
df_sentence_cosine_similarities.insert(
    loc=0,
    column='category',
    value=df_grouped['category'],
    allow_duplicates=False,
)
df_sentence_cosine_similarities

category,category.1,business_news,crime_news,politics,sports_news,technology_news,weather_news
0,business_news,1.0,0.751351,0.930478,0.920157,0.99623,0.973613
1,crime_news,0.751351,1.0,0.93037,0.942343,0.79652,0.656326
2,politics,0.930478,0.93037,1.0,0.999272,0.956584,0.870978
3,sports_news,0.920157,0.942343,0.999272,1.0,0.947787,0.855484
4,technology_news,0.99623,0.79652,0.956584,0.947787,1.0,0.967027
5,weather_news,0.973613,0.656326,0.870978,0.855484,0.967027,1.0


In [18]:
# Wrap the full-article similarity matrix in a frame: one column per
# category, plus a leading 'category' column naming the category of each row.
df_full_article_cosine_similarities = pd.DataFrame(
    data=full_article_cosine_similarities,
    columns=df_grouped['category'],
)
df_full_article_cosine_similarities.insert(
    loc=0,
    column='category',
    value=df_grouped['category'],
    allow_duplicates=False,
)
df_full_article_cosine_similarities

category,category.1,business_news,crime_news,politics,sports_news,technology_news,weather_news
0,business_news,1.0,0.741757,0.917915,0.910145,0.996012,0.963367
1,crime_news,0.741757,1.0,0.936568,0.946337,0.784333,0.596131
2,politics,0.917915,0.936568,1.0,0.999435,0.945726,0.815612
3,sports_news,0.910145,0.946337,0.999435,1.0,0.938461,0.80413
4,technology_news,0.996012,0.784333,0.945726,0.938461,1.0,0.951028
5,weather_news,0.963367,0.596131,0.815612,0.80413,0.951028,1.0


In [19]:
from numpy import dot
from numpy.linalg import norm

In [20]:
def calculate_diff_from_mean(df, value_columns=None):
    """Mean absolute deviation from the column mean, scaled by 100.

    For each selected column, computes ``mean(|x - mean(x)|) * 100`` over the
    rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to summarise; must contain every selected column.
    value_columns : iterable of str, optional
        Columns to summarise. Defaults to the module-level ``columns`` list,
        which was previously the only option.

    Returns
    -------
    dict
        Maps each column name to its scaled mean absolute deviation, in the
        order the columns were given. For an empty frame every value is NaN.
    """
    if value_columns is None:
        value_columns = columns
    selected = df[list(value_columns)]
    # Vectorised over all columns at once: subtract each column's mean, take
    # absolute values, then average per column.
    scaled_deviation = (selected - selected.mean()).abs().mean() * 100
    return scaled_deviation.to_dict()
    

In [21]:
# One record per category: the category name followed by the per-column
# deviation figures returned by calculate_diff_from_mean for that category's rows.
diff_from_mean = [
    {
        'category': category,
        **calculate_diff_from_mean(df.loc[df['category'] == category]),
    }
    for category in df['category'].unique()
]

df_diff_from_mean = pd.DataFrame(diff_from_mean)
df_diff_from_mean

Unnamed: 0,category,s_present present,f_present present,s_present past,s_past past,s_past present,s_future present,s_past future,s_future past,f_present past,f_past past,f_past present,f_present future,f_future present,f_past future,f_future past,s_present future,s_future future,f_future future
0,weather_news,16.421334,16.2408,9.541364,8.921711,6.984276,7.501125,4.778691,7.918003,5.792655,7.159767,5.111739,4.981497,5.090221,3.270749,3.001404,5.588161,5.990572,8.232251
1,technology_news,13.703417,13.402835,6.325352,11.433552,6.106228,4.026078,2.820954,2.809679,5.058776,10.718026,5.155813,3.49913,3.63929,2.910089,2.28254,3.799017,1.282664,1.406872
2,politics,8.825379,8.323082,4.352082,10.345334,4.268161,2.256561,2.24835,2.123538,3.300841,10.229047,3.34145,2.049761,2.12243,1.949022,1.86787,2.327095,0.793248,0.822529
3,business_news,13.909975,13.79335,5.820004,10.111671,5.431262,3.191317,2.178462,2.372087,4.897028,10.116078,4.875869,3.152076,3.109831,1.923338,1.929936,3.428669,1.208789,1.265697
4,sports_news,11.444506,10.866374,6.096756,13.32644,6.030184,2.879987,2.394503,2.658543,4.208273,13.310945,4.256837,2.651109,2.596858,2.319263,2.121141,2.857879,0.71853,0.959483
5,crime_news,7.773809,7.427515,6.979911,16.287608,6.583103,1.301446,1.765409,2.060331,5.990729,16.145536,5.996282,1.366177,1.263393,1.627959,1.697846,1.51041,0.337463,0.357634
