In [1]:
from sklearn.cluster import DBSCAN
#from matplotlib import pyplot as plt 
import json
import numpy as np
import pandas as pd
from Metrics import getSentiment, getQuoteBased

In [2]:
# Load the article dump into a DataFrame. The context manager closes the
# file handle as soon as the JSON has been parsed (it was previously left open).
print('Loading File')
with open('rust-articles-backup.json', encoding='utf8') as file:
    articles = pd.DataFrame.from_dict(json.load(file))
print('File Loaded')

Loading File
File Loaded


A label of -1 means the point was not assigned to any cluster (DBSCAN treats it as noise).

In [3]:
# Toy DBSCAN example on seven 2-D points (eps=3, min_samples=2); the cell
# displays the cluster label assigned to each point.
X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80], [100, 100]])
clustering = DBSCAN(eps=3, min_samples=2)
clustering.fit(X)  # fit() returns the estimator itself, so `clustering` stays the fitted model
clustering.labels_

array([ 0,  0,  0,  1,  1, -1, -1], dtype=int64)

In [4]:
from Clustering import ClusterMetrics

# Sanity check of ClusterMetrics on six hand-made points that form three
# obvious pairs. Judging by the per-row notes, the first column looks like a
# sentiment score and the second an opinion/factual score -- TODO confirm.
# max_dist works for 0.4-1.1
metrics = np.array([
    (0.9, -0.4),   # positive, opinion
    (0.6, -0.3),   # positive, opinion
    (0.1, 0.7),    # neutral, factual
    (-0.2, 0.8),   # neutral, factual
    (-0.9, -0.8),  # negative, opinion
    (-0.6, -0.6),  # negative, opinion
])
labels = ClusterMetrics(metrics)
print(labels)

[0 0 1 1 2 2]


In [5]:
# Build one [compound sentiment, quote-based score] pair per article, for at
# most the first 1000 articles. The bound is clamped to len(articles) so a
# smaller dataset does not raise KeyError on a missing row label.
metrics = []
sentiments = []
quoteBaseds = []
for i in range(min(1000, len(articles))):
    body = articles.at[i, 'body']  # read the article text once and reuse it
    sentiment = getSentiment(body)
    # getQuoteBased returns three values; only the first is used here.
    quoteBased, _, _ = getQuoteBased(body)

    sentiments.append(sentiment['compound'])
    quoteBaseds.append(quoteBased)
    metrics.append([sentiment['compound'], quoteBased])
print(metrics)

[[-0.3818, 0.51423], [0.0772, 0.16694], [0.972, 1], [-0.9933, 0.40527], [0.9554, 0.27697], [-0.8484, 0.08282], [0.9917, 0.10471], [0.9917, 0.10468], [0.2003, 0.31558], [0.9967, 0.41131], [-0.982, 0.39148], [-0.9246, 0.09973], [-0.3818, 0.0], [0.9369, 0.72513], [0.0, 0.0], [-0.9287, 0.40544], [-0.9904, 0.20576], [-0.9827, 0.07456], [0.9847, 0.13839], [-0.5729, 0.31351], [-0.0498, 0.22599], [0.8122, 0.0], [-0.9937, 0.27488], [-0.9674, 0.27567], [-0.9653, 0.0], [-0.7351, 0.17039], [-0.9766, 0.2476], [-0.8225, 0.0], [-0.9941, 0.21488], [0.9442, 0.37209], [0.9983, 0.19406], [0.9808, 0.16796], [0.9825, 0.36376], [-0.9051, 0.25635], [0.168, 0.29743], [0.997, 0.3074], [-0.0357, 0.58855], [0.9757, 0.41344], [-0.9945, 0.56143], [0.9948, 0.2265], [0.2159, 0.47851], [-0.4038, 0.56529], [0.9976, 0.7847], [-0.4109, 0.20088], [-0.9924, 0.16814], [-0.9921, 0.28986], [-0.2496, 0.56522], [0.9895, 0.39139], [-0.9764, 0.3216], [0.8047, 0.0], [0.9973, 0.39771], [0.7498, 0.09315], [0.9279, 0.49367], [-0.950

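Find the best cluster distance: try distances from 0.01 to 1.00 and keep the one that yields the most groups while leaving the fewest articles without a group.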

In [6]:
# Sweep the distance argument of ClusterMetrics from 0.01 to 1.00 in steps of
# 0.01 and remember the value with the highest score.
# The score variable is called `bestScore` rather than `max` so the builtin
# max() is not shadowed for the rest of the kernel session.
# NOTE(review): the second and third ClusterMetrics arguments are presumably
# the max distance and the minimum cluster size -- confirm in Clustering.py.
bestScore = 0
maxCounter = 0
counter = 0.01
while counter <= 1:
    labels = ClusterMetrics(np.array(metrics),counter,10)
    # Number of distinct labels; a -1 label, if present, is counted as well.
    length = len(set(labels))
    # Fraction of articles whose label is not -1.
    withGroup = 1 - (np.count_nonzero(labels == -1) / len(labels))
    maximizeValue = withGroup * length # Trying to maximize the number of groups and minimize the number of articles without a group
    if maximizeValue > bestScore:
        maxCounter = counter
        bestScore = maximizeValue
    # Round after each step so floating-point error does not accumulate.
    counter = round(counter + 0.01,3)

print(maxCounter,bestScore)

0.02 5.258


In [7]:
# Re-cluster with the selected distance (maxCounter), then show the label of
# every article and how many articles carry the -1 label.
counter = maxCounter
labels = ClusterMetrics(np.array(metrics), maxCounter, 10)
length = len(set(labels))  # number of distinct labels, including -1 if present
print(labels)
ungrouped = labels == -1
print(np.count_nonzero(ungrouped))

[-1 -1 -1  0  1 -1  1  1 -1  2  0 -1 -1 -1 -1 -1  0  3  1 -1 -1 -1  0  0
  9 -1  0 -1  0 -1  1  1  1 -1 -1  1 -1  2 -1  1 -1 -1 -1 -1  0  0 -1  2
  0 -1  2 -1 -1 -1 -1 -1 -1 -1 -1  9  3  0  0  0  0 -1 -1 -1 -1  7 -1 -1
 -1  4  2  9  2  0  0 -1 -1  8 -1  0 -1 -1  0 -1 -1 -1  1  0 -1 -1 -1 -1
 -1 -1  7  0 -1 -1 -1  1 -1  2 -1 -1 -1 -1 -1 -1  1  2 -1  0 -1  3  1  0
  0 -1 -1 -1 -1 -1 -1 -1  9 -1 -1 -1 -1  0 -1  2 -1 -1 -1 -1 -1 -1 -1 -1
  0 -1 -1 -1 -1 -1  1 -1  1 -1 -1 -1 -1 -1  1 -1 -1 -1  1 -1  0 -1 -1  0
 -1 -1  0  0 -1 -1  0  7  2  2 -1 -1 -1  1 -1 -1 -1 -1  9 -1 -1 -1  0  1
  0  0  0  0 -1 -1 -1 -1  0 -1  0 -1  3  0  0  0 -1  0  2  0  0  0 -1 -1
  0  0 -1  1  0  0  0  0  0  0  3  0  0 -1  0  1 -1  0  0  0 -1  0  0 -1
  0 -1  3 -1  0 -1  2 -1  0 -1  0 -1  4 -1  0 -1 -1  0  1 -1 -1  1  1 -1
 -1 -1  0 -1 -1  0  0 -1  0 -1  5 -1 -1  6 -1  6  6 -1 -1  1  6  6  6 -1
  1  2 -1  2  1  1 -1 -1  1 -1 -1  1  1 -1  1 -1 -1  0 -1  7 -1  0 -1  4
  1  1 -1 -1  1 -1 -1 -1  7  0 -1 -1 -1 -1  5 -1 -1

In [8]:
# Collect the sentiment (x) and quote-based (y) values of each label group so
# they can be scattered one group at a time.
# Iterating over the labels actually present (instead of range(length) shifted
# by -1) keeps every cluster even when no article is labelled -1.
labelArray = np.asarray(labels)
for i in np.unique(labelArray):
    members = np.flatnonzero(labelArray == i)  # positions of the articles in group i
    x = [sentiments[j] for j in members]
    y = [quoteBaseds[j] for j in members]
    # Plotting is disabled because the matplotlib import is commented out.
    #plt.scatter(x,y) 
#plt.show()