In [3]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.cluster import KMeansClusterer, cosine_distance
from sklearn.decomposition import LatentDirichletAllocation
import json
from nltk.corpus import stopwords
import pandas as pd
import nltk
from sklearn import metrics
import numpy as np
from numpy import arange
from sklearn.metrics import f1_score

In [4]:
train_file = 'train_text.json'
test_file = 'test_text.json'

def load_json_file(filepath):
    with open(filepath) as f:
        return json.load(f)
    
def prepare_df(train_df, test_df):       
    
    train_df = pd.DataFrame(load_json_file(train_file), columns=['text'])
    
    test_df = pd.DataFrame(load_json_file(test_file), columns=['text', 'labels'])
    test_df['single_label'] = test_df['labels'].apply(lambda x: x[0])    
    
    return train_df, test_df    
    
    
# def map_labels_and_print_outcomes():
    

In [86]:
train_df = pd.DataFrame(train, columns=['text'])
test_df = pd.DataFrame(test, columns=['text', 'labels'])

test_df['single_label'] = test_df['labels'].apply(lambda x: x[0])
train_df.shape
# test_df.head()

(3421, 1)

In [22]:
n_clusters = 3

tfidf_vect = TfidfVectorizer(stop_words="english", min_df=3) 

tfidf= tfidf_vect.fit_transform(train_df['text'])
print (tfidf.shape)

(3421, 18819)


In [28]:
cluster_clf = KMeansClusterer(n_clusters, cosine_distance, repeats=15)

clusters = cluster_clf.cluster(tfidf.toarray(), assign_clusters=True)

clusters[0:10]

[1, 1, 0, 2, 0, 2, 1, 2, 1, 0]

In [30]:
test_tfidf = tfidf_vect.transform(test_df['text'])
print(test_tfidf.shape)

preds = [cluster_clf.classify(doc) for doc in test_tfidf.toarray()]

preds[:10]

(600, 18819)


[0, 0, 1, 2, 2, 0, 2, 1, 0, 1]

In [39]:
test_df.head()

Unnamed: 0,text,labels,single_label,cluster_id
0,faa issues fire warning for lithium batteries ...,[Travel & Transportation],Travel & Transportation,0
1,paraglider collides with hot air balloon in ar...,"[Disaster and Accident, Travel & Transportation]",Disaster and Accident,0
2,rescuers pull from flooded coal mine in chinab...,[Disaster and Accident],Disaster and Accident,1
3,japan factory output slides for fifth month in...,[News and Economy],News and Economy,2
4,obama signs emergency bill to halt teacher lay...,[News and Economy],News and Economy,2


In [177]:
test_df['cluster_id'] = preds

cluster_0 = test_df['single_label'][test_df.cluster_id==0]
cluster_1 = test_df['single_label'][test_df.cluster_id==1]
cluster_2 = test_df['single_label'][test_df.cluster_id==2]

# list(nltk.FreqDist(cluster_0).keys())[0]
# nltk.FreqDist(cluster_1)
# nltk.FreqDist(cluster_2)

# cluster and ground_truth label mapping
cluster_dict = {0: list(nltk.FreqDist(cluster_0).keys())[0],
                1: list(nltk.FreqDist(cluster_1).keys())[0],
                2: list(nltk.FreqDist(cluster_2).keys())[0]}

# Map true label to cluster id
preds_label = [cluster_dict[i] for i in preds]

confusion_df = pd.DataFrame(list(zip(test_df["single_label"].values, preds)),\
                            columns = ["actual class", "cluster"])

print(pd.crosstab(index=confusion_df.cluster, columns=confusion_df['actual class']))

print('Cluster 0: Topic',cluster_dict[0])
print('Cluster 1: Topic',cluster_dict[1])
print('Cluster 2: Topic',cluster_dict[2])


print(metrics.classification_report(test_df["single_label"], preds_label))

IndexError: list index out of range

In [5]:
def cluster_kmean(train_file, test_file):
    
    # dataset preparation
    train_df, test_df = prepare_df(train_file, test_file)    

    # parameters
    n_clusters = 3
    MIN_DF = 3
    STOP_WORDS = 'english'
    EPOCHS = 20

    tfidf_vect = TfidfVectorizer(stop_words=STOP_WORDS, min_df=MIN_DF) 
    tfidf= tfidf_vect.fit_transform(train_df['text'])

    cluster_clf = KMeansClusterer(n_clusters, cosine_distance, repeats=EPOCHS)
    clusters = cluster_clf.cluster(tfidf.toarray(), assign_clusters=True)

    test_tfidf = tfidf_vect.transform(test_df['text'])

    preds = [cluster_clf.classify(doc) for doc in test_tfidf.toarray()]

    test_df['cluster_id'] = preds

    # extract ground_truth labels for each cluter
    cluster_0 = test_df['single_label'][test_df.cluster_id==0]
    cluster_1 = test_df['single_label'][test_df.cluster_id==1]
    cluster_2 = test_df['single_label'][test_df.cluster_id==2]

    # cluster and ground_truth label mapping
    cluster_dict = {0: list(nltk.FreqDist(cluster_0).keys())[0],
                    1: list(nltk.FreqDist(cluster_1).keys())[0],
                    2: list(nltk.FreqDist(cluster_2).keys())[0]}

    # Map true label to cluster id
    preds_label = [cluster_dict[i] for i in preds]

    # confusion matrix/table
    confusion_df = pd.DataFrame(list(zip(test_df["single_label"].values, preds)),\
                                columns = ["actual class", "cluster"])    
    print(pd.crosstab(index=confusion_df.cluster, columns=confusion_df['actual class']))

    # cluster and topic assigned to it
    print('Cluster 0: Topic',cluster_dict[0])
    print('Cluster 1: Topic',cluster_dict[1])
    print('Cluster 2: Topic',cluster_dict[2])

    # evaluation metrics
    print(metrics.classification_report(test_df["single_label"], preds_label))

In [91]:
train_df.head()

Unnamed: 0,text
0,mabus to hold town hall meetings on gulf s fut...
1,flood waters swamp guthrie s hwy erin guy shot...
2,express trains collide in fog bound northern i...
3,new home sales up but sales remain slowwashing...
4,itv com contains the following categories ther...


In [105]:
# dataset preparation
train_df, test_df = prepare_df(train_file, test_file)

n_topics = 3
MIN_DF = 5
MAX_DF = 0.9
STOP_WORDS = 'english'

tf_vectorizer = CountVectorizer(max_df=MAX_DF, min_df=MIN_DF, stop_words=STOP_WORDS)

tf = tf_vectorizer.fit_transform(train_df['text'])

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=25, verbose=1, evaluate_every=1, n_jobs=1,
                                random_state=0).fit(tf)



iteration: 1 of max_iter: 25, perplexity: 4250.5376
iteration: 2 of max_iter: 25, perplexity: 3801.0843
iteration: 3 of max_iter: 25, perplexity: 3609.7047
iteration: 4 of max_iter: 25, perplexity: 3530.5465
iteration: 5 of max_iter: 25, perplexity: 3494.8408
iteration: 6 of max_iter: 25, perplexity: 3473.0364
iteration: 7 of max_iter: 25, perplexity: 3454.9284
iteration: 8 of max_iter: 25, perplexity: 3439.2839
iteration: 9 of max_iter: 25, perplexity: 3426.7504
iteration: 10 of max_iter: 25, perplexity: 3416.5917
iteration: 11 of max_iter: 25, perplexity: 3409.4712
iteration: 12 of max_iter: 25, perplexity: 3403.4632
iteration: 13 of max_iter: 25, perplexity: 3396.3725
iteration: 14 of max_iter: 25, perplexity: 3387.7223
iteration: 15 of max_iter: 25, perplexity: 3382.7160
iteration: 16 of max_iter: 25, perplexity: 3380.7555
iteration: 17 of max_iter: 25, perplexity: 3379.6781
iteration: 18 of max_iter: 25, perplexity: 3378.8929
iteration: 19 of max_iter: 25, perplexity: 3378.2673
it

In [257]:
tf_test = tf_vectorizer.transform(test_df['text'])
topic_assign=lda.transform(tf_test)

topic_assign
preds = np.argmax(topic_assign, axis=1)

test_df['cluster_id'] = preds

# extract ground_truth labels for each cluter
cluster_0 = test_df['single_label'][test_df.cluster_id==0]
cluster_1 = test_df['single_label'][test_df.cluster_id==1]
cluster_2 = test_df['single_label'][test_df.cluster_id==2]

# cluster and ground_truth label mapping
cluster_dict = {0: list(nltk.FreqDist(cluster_0).keys())[0],
                1: list(nltk.FreqDist(cluster_1).keys())[0],
                2: list(nltk.FreqDist(cluster_2).keys())[0]}

# Map true label to cluster id
preds_label = [cluster_dict[i] for i in preds]


# confusion matrix/table
confusion_df = pd.DataFrame(list(zip(test_df["single_label"].values, preds)),\
                            columns = ["actual class", "cluster"])    
print(pd.crosstab(index=confusion_df.cluster, columns=confusion_df['actual class']))

# cluster and topic assigned to it
print('Cluster 0: Topic',cluster_dict[0])
print('Cluster 1: Topic',cluster_dict[1])
print('Cluster 2: Topic',cluster_dict[2])

# evaluation metrics
print(metrics.classification_report(test_df["single_label"], preds_label))

actual class  Disaster and Accident  News and Economy  Travel & Transportation
cluster                                                                       
0                                30                18                      138
1                                12               182                        8
2                               168                 6                       38
Cluster 0: Topic Travel & Transportation
Cluster 1: Topic News and Economy
Cluster 2: Topic Disaster and Accident
                         precision    recall  f1-score   support

  Disaster and Accident       0.79      0.80      0.80       210
       News and Economy       0.90      0.88      0.89       206
Travel & Transportation       0.74      0.75      0.75       184

              micro avg       0.81      0.81      0.81       600
              macro avg       0.81      0.81      0.81       600
           weighted avg       0.81      0.81      0.81       600



In [258]:
topic_assign.shape

(600, 3)

In [6]:
def cluster_lda(train_file, test_file):
    topic_assig = None
    labels = None
    
    # dataset preparation
    train_df, test_df = prepare_df(train_file, test_file)
    
    labels = test_df['labels']

    n_topics = 3
    MIN_DF = 5
    MAX_DF = 0.9
    STOP_WORDS = 'english'

    tf_vectorizer = CountVectorizer(max_df=MAX_DF, min_df=MIN_DF, stop_words=STOP_WORDS)

    tf = tf_vectorizer.fit_transform(train_df['text'])

    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=25, verbose=1, evaluate_every=1, n_jobs=1,
                                    random_state=0).fit(tf)

    tf_test = tf_vectorizer.transform(test_df['text'])
    topic_assig=lda.transform(tf_test)
    
    preds = np.argmax(topic_assig, axis=1)

    test_df['cluster_id'] = preds

    # extract ground_truth labels for each cluter
    cluster_0 = test_df['single_label'][test_df.cluster_id==0]
    cluster_1 = test_df['single_label'][test_df.cluster_id==1]
    cluster_2 = test_df['single_label'][test_df.cluster_id==2]

    # cluster and ground_truth label mapping
    cluster_dict = {0: list(nltk.FreqDist(cluster_0).keys())[0],
                    1: list(nltk.FreqDist(cluster_1).keys())[0],
                    2: list(nltk.FreqDist(cluster_2).keys())[0]}

    # Map true label to cluster id
    preds_label = [cluster_dict[i] for i in preds]

    # confusion matrix/table
    confusion_df = pd.DataFrame(list(zip(test_df["single_label"].values, preds)),\
                                columns = ["actual class", "cluster"])    
    print(pd.crosstab(index=confusion_df.cluster, columns=confusion_df['actual class']))

    # cluster and topic assigned to it
    print('Cluster 0: Topic',cluster_dict[0])
    print('Cluster 1: Topic',cluster_dict[1])
    print('Cluster 2: Topic',cluster_dict[2])

    # evaluation metrics
    print(metrics.classification_report(test_df["single_label"], preds_label))
    
    return topic_assign, labels

In [259]:
topic_assign

array([[0.63987687, 0.00139012, 0.358733  ],
       [0.0072369 , 0.00566312, 0.98709997],
       [0.01754705, 0.03554542, 0.94690754],
       ...,
       [0.07718694, 0.39525681, 0.52755625],
       [0.04108562, 0.95689438, 0.00202   ],
       [0.1435063 , 0.56214291, 0.29435079]])

In [265]:
list(test_df['labels'].apply(lambda x: len(x))).count(3)
test_df['combined_labels'] = test_df['labels'].apply(lambda x: "_".join(x))

test_df.head()

Unnamed: 0,text,labels,single_label,cluster_id,combined_labels
0,faa issues fire warning for lithium batteries ...,[Travel & Transportation],Travel & Transportation,0,Travel & Transportation
1,paraglider collides with hot air balloon in ar...,"[Disaster and Accident, Travel & Transportation]",Disaster and Accident,2,Disaster and Accident_Travel & Transportation
2,rescuers pull from flooded coal mine in chinab...,[Disaster and Accident],Disaster and Accident,2,Disaster and Accident
3,japan factory output slides for fifth month in...,[News and Economy],News and Economy,1,News and Economy
4,obama signs emergency bill to halt teacher lay...,[News and Economy],News and Economy,1,News and Economy


In [289]:
# labels = test_df['labels']
labels = list(set(test_df['combined_labels']))
f1_score_l1 = [] #Travel&Transportation
f1_score_l2 = [] #Disaster and Accident
f1_score_l3 = [] #News and Economy
f1_score_l4 = [] #Disaster and Accident_Travel & Transportation
f1_score_l5 = [] #News and Economy_Travel & Transportation
threshold = []
prediction_df = pd.DataFrame(data=test_df['combined_labels'])


for t in arange(0.05, 0.95, 0.05):
    
    preds = []
    preds_label = []
    threshold.append(t)
#     print(i)
    for row in range(topic_assign.shape[0]):        
        labels_ = ""        
        for topic in range(3):
            if topic_assign[row][topic] > t:
                labels_ += str(topic)            
        
        if labels_ == "":
            preds.append(str(np.argmax(topic_assign[row])))
        else:
            preds.append(labels_)
    
#     print(preds)
    
    test_df['cluster_id'] = preds

#     print(test_df['cluster_id'])
    # extract ground_truth labels for each cluter
    
    cluster_0 = []
    cluster_01 = []
    cluster_02 = []
    cluster_1 = []
    cluster_12 = []
    cluster_2 = []
    
    for i, row in test_df['combined_labels'].items():
#         print(row)
        if '0' in test_df['cluster_id'][i]:
#         if test_df['cluster_id'][i] == '0':
            cluster_0.append((row))
        if '1' in test_df['cluster_id'][i]:
            cluster_1.append(row)
        if '2' in test_df['cluster_id'][i]:
#         if test_df['cluster_id'][i] == '01':
            cluster_2.append((row))
#         if test_df['cluster_id'][i] == '02':
#             cluster_02.append((row))
#         if test_df['cluster_id'][i] == '1':
#             cluster_1.append((row))
#         if test_df['cluster_id'][i] == '12':
#             cluster_12.append((row))
#         if test_df['cluster_id'][i] == '2':
#             cluster_2.append((row))
#         print('out')
#         cluster_1 = test_df['labels'][1 in test_df.cluster_id]
#         cluster_2 = test_df['labels'][2 in test_df.cluster_id]

#     print((nltk.FreqDist(cluster_12).most_common()))
    
#     print(cluster_0)
    # cluster and ground_truth label mapping
    #'':'Travel & Transportation',
    
#     predicted_topics = list(set(preds))
#     cluster_dict = {}
#     for t in predicted_topics:
#         if predicted_topics not in ['0', '1', '2']:
#             cluster_dict[t] = nltk.FreqDist(cluster_0).most_common()[0][0]
        
        
    cluster_dict = { 
                    '0': nltk.FreqDist(cluster_0).most_common()[0][0],
                    '01': nltk.FreqDist(cluster_1).most_common()[0][0]+'_'+
                            nltk.FreqDist(cluster_0).most_common()[0][0],
                    '02': nltk.FreqDist(cluster_2).most_common()[0][0]+'_'+
                            nltk.FreqDist(cluster_0).most_common()[0][0],
                    '1':  nltk.FreqDist(cluster_1).most_common()[0][0],
                    '12': nltk.FreqDist(cluster_2).most_common()[0][0]+'_'+
                            nltk.FreqDist(cluster_1).most_common()[0][0],
                    '2':  nltk.FreqDist(cluster_2).most_common()[0][0],
                    '012': nltk.FreqDist(cluster_0).most_common()[0][0]+'_'+
                            nltk.FreqDist(cluster_1).most_common()[0][0]+'_'+
                            nltk.FreqDist(cluster_2).most_common()[0][0]}

    
#     print(dict(nltk.FreqDist(cluster_1)))
#     print(dict(nltk.FreqDist(cluster_2)))
    # Map true label to cluster id
    
    #preds_label = [cluster_dict[i] for i in preds]
    for i in preds:
#         if i in ['0', '1', '2']:
        preds_label.append(cluster_dict[i])
#         elif i == '01':
#             preds_label.append(cluster_dict)
    
    prediction_df[t] = preds_label
#     print(labels)
    scores = f1_score(test_df['single_label'], preds_label, labels, average=None)
    f1_score_l1.append(scores[2]) #travel
    f1_score_l2.append(scores[3]) #disaster
    f1_score_l3.append(scores[1]) #news
    
    combined_label_scores = f1_score(test_df['combined_labels'], preds_label, labels, average=None)
    f1_score_l4.append(combined_label_scores[4]) #disaster_and_travel
    f1_score_l5.append(combined_label_scores[0]) #newsand_travel
    
    
print(labels[2]+': ',max(f1_score_l1))
print(labels[3]+': ',max(f1_score_l2))
print(labels[1]+': ',max(f1_score_l3))
print("")
print(labels[2]+': ',threshold[np.argmax(f1_score_l1)])
print(labels[3]+': ',threshold[np.argmax(f1_score_l2)])
print(labels[1]+': ',threshold[np.argmax(f1_score_l3)])

print("")
print("################# Extras: Displaying multilabel classification #######################")
print(labels[4]+': ',max(f1_score_l4))
print(labels[0]+': ',max(f1_score_l5))
print(labels[4]+': ',threshold[np.argmax(f1_score_l4)])
print(labels[0]+': ',threshold[np.argmax(f1_score_l5)])

prediction_df
prediction_df[prediction_df['combined_labels']=='News and Economy_Travel & Transportation']

Travel & Transportation:  0.7459459459459459
Disaster and Accident:  0.7962085308056872
News and Economy:  0.8938271604938272

Travel & Transportation:  0.5
Disaster and Accident:  0.5
News and Economy:  0.45

################# Extras: Displaying multilabel classification #######################
Disaster and Accident_Travel & Transportation:  0.1724137931034483
News and Economy_Travel & Transportation:  0.047619047619047616
Disaster and Accident_Travel & Transportation:  0.2
News and Economy_Travel & Transportation:  0.3


Unnamed: 0,combined_labels,0.05,0.1,0.15000000000000002,0.2,0.25,0.3,0.35000000000000003,0.4,0.45,0.5,0.55,0.6000000000000001,0.6500000000000001,0.7000000000000001,0.7500000000000001,0.8,0.8500000000000001,0.9000000000000001
452,News and Economy_Travel & Transportation,News and Economy_Travel & Transportation,News and Economy_Travel & Transportation,News and Economy_Travel & Transportation,News and Economy_Travel & Transportation,News and Economy_Travel & Transportation,News and Economy_Travel & Transportation,Travel & Transportation,Travel & Transportation,Travel & Transportation,Travel & Transportation,Travel & Transportation,Travel & Transportation,Travel & Transportation,Travel & Transportation,Travel & Transportation,Travel & Transportation,Travel & Transportation,Travel & Transportation


In [None]:
def overlapping_cluster(topic_assign, labels):
    final_thresh, f1 = None, None
    
    df = pd.DataFrame(data=labels, columns=['labels'])
    df['combined_labels'] = df['labels'].apply(lambda x: "_".join(x))
    df['single_label'] = df['labels'].apply(lambda x: x[0])

    unique_labels = list(set(df['combined_labels']))
    f1_score_l1 = [] #Travel&Transportation
    f1_score_l2 = [] #Disaster and Accident
    f1_score_l3 = [] #News and Economy
    f1_score_l4 = [] #Disaster and Accident_Travel & Transportation
    f1_score_l5 = [] #News and Economy_Travel & Transportation
    threshold = []
    prediction_df = pd.DataFrame(data=df['combined_labels'])

    for t in arange(0.05, 0.95, 0.05):
        preds = []
        preds_label = []
        threshold.append(t)
        for row in range(topic_assign.shape[0]):        
            labels_ = ""        
            for topic in range(3):
                if topic_assign[row][topic] > t:
                    labels_ += str(topic)            

            if labels_ == "":
                preds.append(str(np.argmax(topic_assign[row])))
            else:
                preds.append(labels_)

        df['topic_id'] = preds

        topic_0 = []
        topic_01 = []
        topic_02 = []
        topic_1 = []
        topic_12 = []
        topic_2 = []

        for i, row in df['combined_labels'].items():
    
            if '0' in df['topic_id'][i]:    
                topic_0.append((row))
            if '1' in df['topic_id'][i]:
                topic_1.append(row)
            if '2' in df['topic_id'][i]:   
                topic_2.append((row))

        topic_dict = { 
                        '0': nltk.FreqDist(topic_0).most_common()[0][0],
                        '01': nltk.FreqDist(topic_1).most_common()[0][0]+'_'+
                                nltk.FreqDist(topic_0).most_common()[0][0],
                        '02': nltk.FreqDist(topic_2).most_common()[0][0]+'_'+
                                nltk.FreqDist(topic_0).most_common()[0][0],
                        '1':  nltk.FreqDist(topic_1).most_common()[0][0],
                        '12': nltk.FreqDist(topic_2).most_common()[0][0]+'_'+
                                nltk.FreqDist(topic_1).most_common()[0][0],
                        '2':  nltk.FreqDist(topic_2).most_common()[0][0],
                        '012': nltk.FreqDist(topic_0).most_common()[0][0]+'_'+
                                nltk.FreqDist(topic_1).most_common()[0][0]+'_'+
                                nltk.FreqDist(topic_2).most_common()[0][0]}
        
        for i in preds:
            preds_label.append(topic_dict[i])

        prediction_df[t] = preds_label
        scores = f1_score(df['single_label'], preds_label, unique_labels, average=None)
        f1_score_l1.append(scores[2]) #travel
        f1_score_l2.append(scores[3]) #disaster
        f1_score_l3.append(scores[1]) #news

        combined_label_scores = f1_score(df['combined_labels'], preds_label, unique_labels, average=None)
        f1_score_l4.append(combined_label_scores[4]) #disaster_and_travel
        f1_score_l5.append(combined_label_scores[0]) #news_and_travel

    print(labels[2]+': ',max(f1_score_l1))
    print(labels[3]+': ',max(f1_score_l2))
    print(labels[1]+': ',max(f1_score_l3))
    print("")
    print(labels[2]+': ',threshold[np.argmax(f1_score_l1)])
    print(labels[3]+': ',threshold[np.argmax(f1_score_l2)])
    print(labels[1]+': ',threshold[np.argmax(f1_score_l3)])

#     print("")
#     print("################# Uncomment to display multilabel classification #######################")
#     print(labels[4]+': ',max(f1_score_l4))
#     print(labels[0]+': ',max(f1_score_l5))
#     print(labels[4]+': ',threshold[np.argmax(f1_score_l4)])
#     print(labels[0]+': ',threshold[np.argmax(f1_score_l5)])

#     prediction_df[prediction_df['combined_labels']=='News and Economy_Travel & Transportation']
#     prediction_df[prediction_df['combined_labels']=='Disaster and Accident_Travel & Transportation']
    
    return final_thresh, f1

In [None]:
if __name__ == "__main__":
    # Due to randomness, you won't get the exact result
    # as shown here, but your result should be close
    # if you tune the parameters carefully
    print('### Q1 ###')
    # Q1
    cluster_kmean('train_text.json', 'test_text.json')
    print("\n### Q2 ###")
    # Q2
    topic_assign, labels =cluster_lda('train_text.json', 'test_text.json')
    print("\n### Q3 ###")
    # Q2
    threshold, f1 = overlapping_cluster(topic_assign, labels)
#     print(threshold)
#     print(f1)

### Q1 ###
