In [185]:
# Import Libraries
import os
import re 
import csv
import ast
import glob
import nltk
import string 
import pandas as pd
import numpy as np
import sklearn as sk
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [186]:
# Create path to files 
files_path = "/Users/queene/Desktop/Pattern Recognition/Abstract_Data"
read_files = glob.glob(os.path.join(files_path, "*.txt"))

# Put data into array 
np_array_values = []
for files in read_files: 
    abstract_data = pd.read_csv(files, sep='\t', header=None)
    np_array_values.append(abstract_data)
    
# View data in an array 
np_array_values

[                                                   0
 0  Music Genre Classification is one of the inter...,
                                                    0
 0  This paper examines the opinion of student can...,
                                                    0
 0  Instagram (IG) is a web-based and mobile socia...,
                                                    0
 0  E-Government basically comprises the use of el...,
                                                    0
 0  Hate speech develops along with the rapid deve...,
                                                    0
 0  The using of Twitter by selebrities has become...,
                                                    0
 0  Social media has changed the people mindset to...,
                                                    0
 0  Batik is the most popular tradisional cloth ma...,
                                                    0
 0  Movie has unique characteristics. When someone...,
                   

In [198]:
# Combined all text files into a single dataframe
merge_values = np.vstack(np_array_values)
abstract_data = pd.DataFrame(merge_values)

abstract_data[0].to_csv("Single_DataFrane.csv")
abstract_data.head(10)

Unnamed: 0,0
0,Music Genre Classification is one of the inter...
1,This paper examines the opinion of student can...
2,Instagram (IG) is a web-based and mobile socia...
3,E-Government basically comprises the use of el...
4,Hate speech develops along with the rapid deve...
5,The using of Twitter by selebrities has become...
6,Social media has changed the people mindset to...
7,Batik is the most popular tradisional cloth ma...
8,Movie has unique characteristics. When someone...
9,The promotion of goods or services is now faci...


In [188]:
# Lower case all characters 
abstract_data[0] = abstract_data[0].str.lower()

# Punctuation
def punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

abstract_data[0] = abstract_data[0].apply(punctuation)

# Whitepsaces
def whitespace(text):
    return text.strip()

abstract_data[0] = abstract_data[0].apply(whitespace)

# Multiple Whitespaces
def whitespace_multiple(text):
    return re.sub('\s+',' ',text)

abstract_data[0] = abstract_data[0].apply(whitespace_multiple)

# Single Char
def single_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

abstract_data[0] = abstract_data[0].apply(single_char)

# Number 
def number(text):
    return  re.sub(r"\d+", "", text)

abstract_data[0] = abstract_data[0].apply(number)

abstract_data

Unnamed: 0,0
0,music genre classification is one of the inter...
1,this paper examines the opinion of student can...
2,instagram ig is webbased and mobile social me...
3,egovernment basically comprises the use of ele...
4,hate speech develops along with the rapid deve...
5,the using of twitter by selebrities has become...
6,social media has changed the people mindset to...
7,batik is the most popular tradisional cloth ma...
8,movie has unique characteristics when someone ...
9,the promotion of goods or services is now faci...


In [199]:
# Tokenization 
def tokenize(text):
    return word_tokenize(text)

abstract_data[0] = abstract_data[0].apply(tokenize)

abstract_data[0].to_csv("Tokenize.csv")
abstract_data

Unnamed: 0,0
0,"[Music, Genre, Classification, is, one, of, th..."
1,"[This, paper, examines, the, opinion, of, stud..."
2,"[Instagram, (, IG, ), is, a, web-based, and, m..."
3,"[E-Government, basically, comprises, the, use,..."
4,"[Hate, speech, develops, along, with, the, rap..."
5,"[The, using, of, Twitter, by, selebrities, has..."
6,"[Social, media, has, changed, the, people, min..."
7,"[Batik, is, the, most, popular, tradisional, c..."
8,"[Movie, has, unique, characteristics, ., When,..."
9,"[The, promotion, of, goods, or, services, is, ..."


In [200]:
# Common Words
def freqDist_wrapper(text):
    return FreqDist(text)

abstract_data[0] = abstract_data[0].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
print(abstract_data[0].head(10).apply(lambda x : x.most_common()))

Frequency Tokens : 

0    [(the, 15), (music, 12), (and, 9), (genre, 9),...
1    [(,, 15), (to, 11), (degree, 10), (., 10), (of...
2    [(the, 13), (., 10), (%, 8), (of, 7), (spam, 7...
3    [(the, 13), (of, 11), (., 9), (as, 7), (and, 7...
4    [(the, 9), (to, 9), (of, 8), (a, 8), (., 7), (...
5    [(,, 10), (sentiment, 10), (., 9), (of, 7), (a...
6    [(the, 12), (., 8), (to, 5), (of, 5), (is, 5),...
7    [(the, 13), (is, 11), (., 9), (,, 7), (of, 7),...
8    [(,, 15), (the, 12), (%, 8), (movie, 6), (of, ...
9    [(the, 13), (of, 12), (., 12), (followers, 8),...
Name: 0, dtype: object


In [201]:
# Term Frequency
def calc_TF(document):
    
    TF_dict = {}
    for term in document:
        if term in TF_dict:
            TF_dict[term] += 1
        else:
            TF_dict[term] = 1
            
    for term in TF_dict:
        TF_dict[term] = TF_dict[term] / len(document)
    return TF_dict

abstract_data[0].head(10)

0    {'Music': 1, 'Genre': 2, 'Classification': 1, ...
1    {'This': 1, 'paper': 2, 'examines': 1, 'the': ...
2    {'Instagram': 2, '(': 3, 'IG': 1, ')': 3, 'is'...
3    {'E-Government': 1, 'basically': 1, 'comprises...
4    {'Hate': 2, 'speech': 4, 'develops': 1, 'along...
5    {'The': 2, 'using': 4, 'of': 7, 'Twitter': 1, ...
6    {'Social': 1, 'media': 2, 'has': 3, 'changed':...
7    {'Batik': 2, 'is': 11, 'the': 13, 'most': 2, '...
8    {'Movie': 1, 'has': 1, 'unique': 1, 'character...
9    {'The': 7, 'promotion': 3, 'of': 12, 'goods': ...
Name: 0, dtype: object

In [202]:
index = 0

print('%20s' % "term", "\t", "TF\n")
for key in abstract_data[0][index]:
    print('%20s' % key, "\t", abstract_data[0][index][key])

index = 1

print("\n"'%20s' % "term", "\t", "TF\n")
for key in abstract_data[0][index]:
    print('%20s' % key, "\t", abstract_data[0][index][key])

index = 2

print("\n"'%20s' % "term", "\t", "TF\n")
for key in abstract_data[0][index]:
    print('%20s' % key, "\t", abstract_data[0][index][key])
    
index = 3

print("\n"'%20s' % "term", "\t", "TF\n")
for key in abstract_data[0][index]:
    print('%20s' % key, "\t", abstract_data[0][index][key])

index = 4

print("\n"'%20s' % "term", "\t", "TF\n")
for key in abstract_data[0][index]:
    print('%20s' % key, "\t", abstract_data[0][index][key])
    
index = 5

print("\n"'%20s' % "term", "\t", "TF\n")
for key in abstract_data[0][index]:
    print('%20s' % key, "\t", abstract_data[0][index][key])

index = 6

print("\n"'%20s' % "term", "\t", "TF\n")
for key in abstract_data[0][index]:
    print('%20s' % key, "\t", abstract_data[0][index][key])

index = 7

print("\n"'%20s' % "term", "\t", "TF\n")
for key in abstract_data[0][index]:
    print('%20s' % key, "\t", abstract_data[0][index][key])
    
index = 8

print("\n"'%20s' % "term", "\t", "TF\n")
for key in abstract_data[0][index]:
    print('%20s' % key, "\t", abstract_data[0][index][key])

index = 9

print("\n"'%20s' % "term", "\t", "TF\n")
for key in abstract_data[0][index]:
    print('%20s' % key, "\t", abstract_data[0][index][key])

                term 	 TF

                 the 	 15
               music 	 12
                 and 	 9
               genre 	 9
                  of 	 8
                   . 	 8
                  to 	 8
                  is 	 7
                   , 	 6
                  in 	 5
      classification 	 5
                that 	 4
               files 	 4
                  by 	 4
             process 	 4
                 The 	 3
                 can 	 3
               Genre 	 2
                   a 	 2
                this 	 2
                  on 	 2
                folk 	 2
               songs 	 2
             dangdut 	 2
             problem 	 2
            features 	 2
             provide 	 2
              result 	 2
              genres 	 2
            classify 	 2
                  be 	 2
             Support 	 2
              Vector 	 2
             Machine 	 2
                   ( 	 2
                 SVM 	 2
                   ) 	 2
                song 	 2
               Music 

              public 	 3
            services 	 3
                  is 	 3
                 web 	 3
               based 	 3
          management 	 3
                that 	 3
                 The 	 3
           developed 	 3
                  in 	 2
              access 	 2
                  In 	 2
                   ’ 	 2
          complaints 	 2
                  an 	 2
        e-Government 	 2
               Model 	 2
                this 	 2
           complaint 	 2
                  at 	 2
            District 	 2
             Gihosha 	 2
                 has 	 2
                been 	 2
                 can 	 2
            enhances 	 2
        E-Government 	 1
           basically 	 1
           comprises 	 1
                 use 	 1
          electronic 	 1
      communications 	 1
        technologies 	 1
                such 	 1
            internet 	 1
           enhancing 	 1
           advancing 	 1
                most 	 1
          developing 	 1
           countries 	 1


               learn 	 2
                from 	 2
                time 	 2
                  In 	 2
                this 	 2
            research 	 2
             feature 	 2
          extraction 	 2
                with 	 2
         supervision 	 2
                 The 	 2
              Social 	 1
             changed 	 1
              people 	 1
             mindset 	 1
             express 	 1
            thoughts 	 1
               moods 	 1
                  As 	 1
            activity 	 1
              social 	 1
               users 	 1
           increases 	 1
                does 	 1
                rule 	 1
                 out 	 1
         possibility 	 1
              crimes 	 1
           spreading 	 1
                 can 	 1
              spread 	 1
             quickly 	 1
              widely 	 1
                  So 	 1
            possible 	 1
              detect 	 1
            manually 	 1
                 one 	 1
                deep 	 1
            learning 	 1


           determine 	 1
           potential 	 1
                 for 	 1
               using 	 1
              social 	 1
               media 	 1
               study 	 1
                uses 	 1
                   2 	 1
             popular 	 1
               first 	 1
                  an 	 1
              artist 	 1
            username 	 1
              second 	 1
          Infounjaya 	 1
            official 	 1
            Jenderal 	 1
              Achmad 	 1
                Yani 	 1
          University 	 1
          Yogyakarta 	 1
            grouping 	 1
              divide 	 1
           different 	 1
        interactions 	 1
               basic 	 1
          difference 	 1
             between 	 1
              number 	 1
          infounjaya 	 1
            analysis 	 1
              showed 	 1
               4,906 	 1
               3,211 	 1
               1,695 	 1
                 who 	 1
                like 	 1
             comment 	 1
                  on 	 1


In [205]:
# Inverse Document Frequency 
def calc_DF(tfDict):
    count_DF = {}
    
    for document in tfDict:
        for term in document:
            if term in count_DF:
                count_DF[term] += 1
            else:
                count_DF[term] = 1
    return count_DF

DF = calc_DF(abstract_data[0])

n_document = len(abstract_data)

def calc_IDF(__n_document, __DF):
    IDF_Dict = {}
    for term in __DF:
        IDF_Dict[term] = np.log(__n_document / (__DF[term] + 1))
    return IDF_Dict
  
IDF = calc_IDF(n_document, DF)
display(IDF)

{'the': -0.0953101798043249,
 'music': 1.6094379124341003,
 'and': -0.0953101798043249,
 'genre': 1.6094379124341003,
 'of': -0.0953101798043249,
 '.': -0.0953101798043249,
 'to': -0.0953101798043249,
 'is': -0.0953101798043249,
 ',': -0.0953101798043249,
 'in': -0.0953101798043249,
 'classification': 1.2039728043259361,
 'that': 0.0,
 'files': 1.6094379124341003,
 'by': 0.3566749439387324,
 'process': 1.2039728043259361,
 'The': 0.0,
 'can': 0.22314355131420976,
 'Genre': 1.6094379124341003,
 'a': 0.10536051565782635,
 'this': 0.22314355131420976,
 'on': 0.22314355131420976,
 'folk': 1.6094379124341003,
 'songs': 1.6094379124341003,
 'dangdut': 1.6094379124341003,
 'problem': 1.2039728043259361,
 'features': 1.2039728043259361,
 'provide': 1.2039728043259361,
 'result': 0.9162907318741551,
 'genres': 1.6094379124341003,
 'classify': 0.9162907318741551,
 'be': 0.22314355131420976,
 'Support': 1.2039728043259361,
 'Vector': 1.2039728043259361,
 'Machine': 1.2039728043259361,
 '(': 0.693

In [206]:
# TF-IDF
def calc_TF_IDF(TF):
    TF_IDF_Dict = {}   
  
    for key in TF:
        TF_IDF_Dict[key] = TF[key] * IDF[key]
    return TF_IDF_Dict

abstract_data[0] = abstract_data[0].apply(calc_TF_IDF)

abstract_data[0].to_csv("TF-IDF.csv")
abstract_data[0].head(10)

0    {'the': -1.4296526970648733, 'music': 19.31325...
1    {',': -1.4296526970648733, 'to': -1.0484119778...
2    {'the': -1.2390323374562235, '.': -0.953101798...
3    {'the': -1.2390323374562235, 'of': -1.04841197...
4    {'the': -0.857791618238924, 'to': -0.857791618...
5    {',': -0.9531017980432489, 'sentiment': 9.1629...
6    {'the': -1.1437221576518988, '.': -0.762481438...
7    {'the': -1.2390323374562235, 'is': -1.04841197...
8    {',': -1.4296526970648733, 'the': -1.143722157...
9    {'the': -1.2390323374562235, 'of': -1.14372215...
Name: 0, dtype: object

#### 

In [195]:
# Define the documents
doc1_2016 = "The using of Twitter by selebrities has become a new trend of impression management strategy. Mining public reaction in social media is a good strategy to obtain feedbacks, but extracting it are not trivial matter. Reads hundred of tweets while determine their sentiment polarity are time consuming. Extractive sentiment summarization machine are needed to address this issue. Previous research generally do not include sentiment information contained in a tweet as weight factor, as a results only general topics of discussion are extracted. This research aimed to do an extractive sentiment summarization on both positive and negative sentiment mentioning Indonesian selebrity, Agnes Monica, by combining SentiStrength, Hybrid TF-IDF, and Cosine Similarity. SentiStrength is used to obtain sentiment strength score and classify tweet as a positive, negative or neutral. The summarization of posisitve and negative sentiment can be done by rank tweets using Hybrid TF-IDF summarization and sentiment strength score as additional weight then removing similar tweet by using Cosine Similarity.The test results showed that the combination of SentiStrength, Hybrid TF-IDF, and Cosine Similarity perform better than using Hybrid TF-IDF only, given an average 60% accuracy and 62% f-measure. This is due to the addition of sentiment score as a weight factor in sentiment summ­ari­zation."
doc2_2017 = "E-Government basically comprises the use of electronic communications technologies such as the internet, in enhancing and advancing the citizens access to public services. In most developing countries including Burundi, citizens are facing many difficulties for accessing public services. One of the identified problems is the poor quality of service in managing citizens’ complaints. This study proposes an SMS and web based e-Government Model as a solution. In this study, a case study of a complaint management system at District of Gihosha has been used as a reference to prove that SMS and Web based e-Government Model can enhances the access of public services. The objective of this study is the development of an SMS and web-based system that can enhances the process and the management of citizens’ complaints at District of Gihosha. The system has been developed using PHP as front end, Apache as web server, MySQL as Database and Gammu as SMS gateway. The obtained results after testing the system shows that all the functionalities of the developed system worked properly. Thus, the SMS and web based complaint management system developed is considered to be effective."
doc3_2018 = "Movie has unique characteristics. When someone writes an opinions about a movie, not only the story in the movie itself is written, but also the people involved in the movie are also written. Opinion ordinary movie written in social media primarily  twitter.To get a tendency of opinion on the movie, whether opinion is likely  positive, negative or neutral, it takes a sentiment analysis. This study aims to classify the sentiment is positive, negative and neutral from opinions Indonesian language movie and look for the accuracy, precission, recall and f-meausre of the method used is Dynamic Convolutional Neural Network. The test results on a system that is built to show that Dynamic Convolutional Neural Network algorithm provides accuracy results better than Naive Bayes method, the value of accuracy of 80,99%, the value of precission 81,00%, recall 81,00%, f-measure 79,00%   while the value of the resulting accuracy Naive Bayes amounted to 76,21%, precission 78,00%, recall 76,00%, f-measure 75,00%."
doc4_2018 = "Batik is the most popular tradisional cloth made using the wax-resist dyeing technique. The fabric is found in various city in Indonesia, one of them is Lasem which popular with hand-drawn batik is called Batik Tulis Lasem. Natural dye selection is one of the most important priority for the batik tulis craftsmen. Natural dyes made from leaves and flowers. Proper selection of natural dye will impact on color, motif, and brightness on batik tulis fabric. AHP and TOPSIS methods can be used together to selecting natural dye especially the batik tulis lasem. AHP method is used in determining the weights of the criteria, and then TOPSIS method is needed for determining the best alternative on natural dye of batik tulis. According to the result of research, TOPSIS method is used to determine the priority of alternative on natural dye. Based on calculation with TOPSIS method , the fourth alternative (A4 is kayu secang) get priority value is 0.8478, so kayu secang is recommended to the craftsmen that will used this  material as the natural dye."
doc5_2019 = "Social media has changed the people mindset to express thoughts and moods. As the activity of social media users increases, it does not rule out the possibility of crimes of spreading hate speech can spread quickly and widely. So that it is not possible to detect hate speech manually. GRU is one of the deep learning methods that has the ability to learn information relations from the previous time to the present time. In this research feature extraction used is word2vec, because it has the ability to learn semantics between words. In this research the GRU performance will be compared with other supervision methods such as support vector machine, naive bayes, decision tree and logistic regression. The results obtained show that the best accuracy is 92.96% by the GRU model with word2vec feature extraction. The use of word2vec in the comparison supervision method is not good enough from tf and tf-idf."
doc6_2019 = "Instagram (IG) is a web-based and mobile social media application where users can share photos or videos with available features. Upload photos or videos with captions that contain an explanation of the photo or video that can reap spam comments. Comments on spam containing comments that are not relevant to the caption and photos. The problem that arises when identifying spam is non-spam comments are more dominant than spam comments so that it leads to the problem of the imbalanced dataset. A balanced dataset can influence the performance of a classification method. This is the focus of research related to the implementation of the CNB method in dealing with imbalance datasets for the detection of Instagram spam comments. The study used TF-IDF weighting with Support Vector Machine (SVM) as a comparison classification. Based on the test results with 2500 training data and 100 test data on the imbalanced dataset (25% spam and 75% non-spam), the CNB accuracy was 92%, precision 86% and f-measure 93%. Whereas SVM produces 87% accuracy, 79% precision, 88% f-measure. In conclusion, the CNB method is more suitable for detecting spam comments in cases of imbalanced datasets."
doc7_2020 = "The promotion of goods or services is now facilitated by the dissemination of information through Instagram. Dissemination of information is usually done by influencers or promotional accounts. The account used certainly has a lot of followers. Because of the large amount of follower data in that account, it can be grouped into the same characters. This is done to determine the potential for promotion using social media accounts. This study uses data from 2 popular accounts. The first account is an artist with the username ayutingting92. The second account is Infounjaya, the official promotion account from Jenderal Achmad Yani University, Yogyakarta. The results of grouping can divide follower data into two cluster groups with different interactions. The basic difference between the two groups is the number of likes and comments. The infounjaya account analysis results showed that of 4,906 followers, only 3,211 followers were actively involved in the interaction, 1,695 followers were passive followers who did not like or did not comment on the interaction. Meanwhile, the results of the ayutingting92 follower cluster show that out of 1 million sample data followers, only 13,591 followers were actively involved in the interaction of likes and comments, 986,409 were passive followers."
doc8_2020 = "This paper examines the opinion of student candidate about their plan to study further to master degree (S2) and doctoral degree (S3). There is lack of approach in finding public opinion about the interest of student candidate in continuing study to higher level such as master degree or doctoral degree. Through this paper, the Twitter’s user opinions are extracted using certain data mining technique to find out three sentiment types (negative, neutral, and positive) by taking the most dominant type of emotions (i.e., anger, anticipation, love, fear, joy, sadness, surprise, trust). The dataset is divided into two groups of Twitter’s users. Both datasets represent group A those opinion is about continuing study further to master degree versus group B whose continuing to doctoral degree. The groups are then divided into three types of sentiment statements about master degree versus doctoral degree. The first group is their sentiment about continuing study further to master degree with the result: (a) 109 negative tweets, 1683 neutral tweets and 131 positive tweets. For the second group (e.g., student’s sentiments about continuing to doctoral degree), it has results: (a) 421 negative tweets, 7666 neutral tweets and 1805 positive tweets. The data are tested to give accuracy value of 85%. The result of this sentiment analysis is useful as a reference for universities to understand the development of sentiments (opinion) from Twitter’s users and help the institutions to improve their reputation and quality."
doc9_2021 = "Music Genre Classification is one of the interesting digital music processing topics. Genre is a category of artistry, in this case, especially music, to characterize and categorize music is now available in various forms and sources. One of the applications is in determining the music genre classification on folk songs and dangdut songs. The main problem in the classification music genre is to find a combination of features and classifiers that can provide the best result in classifying music files into music genres. So we need to develop methods and algorithms that can classify genres appropriately. This problem can be solved by using the Support Vector Machine (SVM). The genre classification process begins by selecting the song file that will be classified by the genre, then the preprocessing process, the collection features by utilizing feature extraction, and the last process is Support Vector Machine (SVM) classification process to produce genre types from selected song files. The final result of this research is to classify Indonesian folk music genre and dangdut music genre along with the 83.3% accuracy values that indicate the level of system relevance to the results of music genre classification and to provide genre labels on music files as to facilitate the management and search of music files."
doc10_2021 = "Hate speech develops along with the rapid development of social media. Hate speech is often issued due to a lack of public awareness of the difference between criticism and statements that might contribute to this crime. Therefore, it is very important to do early detection of sentences that will be written before causing a criminal act due to public ignorance. In this paper, we use the advancement of deep neural networks to predict whether a sentence contains a hate speech and abusive tone. We demonstrate the robustness of different word and contextual embedding to represent the semantic of hate speech words. In addition, we use a document embedding representation via a recurrent neural networks with gated recurrent unit as the main architecture to provide richer representation. Compared to syntactic representation of the previous approach, the contextual embedding in our model proved to give a significant boost on the performance by a significant margin."

documents = [doc1_2016, doc2_2017, doc3_2018, doc4_2018, doc5_2019, doc6_2019, doc7_2020, doc8_2020, doc9_2021, doc10_2021]

In [196]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(documents)

doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['doc1_2016', 'doc2_2017', 'doc3_2018', 'doc4_2018', 'doc5_2019', 'doc6_2019', 'doc7_2020', 'doc8_2020', 'doc9_2021', 'doc10_2021'])
df

Unnamed: 0,00,100,109,13,131,1683,1805,21,211,25,2500,409,421,591,60,62,695,75,76,7666,78,79,80,81,83,8478,85,86,87,88,906,92,93,96,986,99,a4,ability,about,abusive,access,accessing,according,account,accounts,accuracy,achmad,act,actively,activity,addition,additional,address,advancement,advancing,after,agnes,ahp,aimed,aims,algorithm,algorithms,all,along,also,alternative,amount,amounted,an,analysis,and,anger,anticipation,apache,application,applications,approach,appropriately,architecture,are,ari,arises,artist,artistry,as,at,available,average,awareness,ayutingting92,balanced,based,basic,basically,batik,bayes,be,because,become,been,before,begins,best,better,between,boost,both,brightness,built,burundi,but,by,calculation,called,can,candidate,caption,captions,case,cases,categorize,category,causing,certain,certainly,changed,characteristics,characterize,characters,citizens,city,classification,classified,classifiers,classify,classifying,cloth,cluster,cnb,collection,color,combination,combining,comment,comments,communications,compared,comparison,complaint,complaints,comprises,conclusion,considered,consuming,contain,contained,containing,contains,contextual,continuing,contribute,convolutional,cosine,countries,craftsmen,crime,crimes,criminal,criteria,criticism,dangdut,data,database,dataset,datasets,dealing,decision,deep,degree,demonstrate,detect,detecting,detection,determine,determining,develop,developed,developing,development,develops,did,difference,different,difficulties,digital,discussion,dissemination,district,divide,divided,do,doctoral,document,does,dominant,done,drawn,due,dye,dyeing,dyes,dynamic,early,effective,electronic,embedding,emotions,end,enhances,enhancing,enough,especially,examines,explanation,express,extracted,extracting,extraction,extractive,fabric,facilitate,facilitated,facing,factor,fear,feature,features,feedbacks,file,files,final,find,finding,first,flowers,focus,folk,follower,followers,for,forms,found,fourth,from,front,functionalities,further,gammu,gated,gateway,general,generally,genre,genres,get,gihosha,give,given,good,goods,government,group,grouped,grouping,groups,gru,hand,has,hate,help,higher,hundred,hybrid,identified,identifying,idf,ig,ignorance,imbalance,imbalanced,impact,implementation,important,impression,improve,in,include,including,increases,indicate,indonesia,indonesian,influence,influencers,information,infounjaya,instagram,institutions,interaction,interactions,interest,interesting,internet,into,involved,is,issue,issued,it,itself,jenderal,joy,kayu,labels,lack,language,large,lasem,last,leads,learn,learning,leaves,level,like,likely,likes,logistic,look,lot,love,machine,made,main,management,managing,manually,many,margin,master,material,matter,meanwhile,measure,meausre,media,mentioning,method,methods,might,million,mindset,mining,mobile,model,monica,moods,more,most,motif,movie,music,mysql,naive,natural,need,needed,negative,network,networks,neural,neutral,new,non,not,now,number,objective,obtain,obtained,of,official,often,on,one,only,opinion,opinions,or,ordinary,other,our,out,paper,passive,people,perform,performance,photo,photos,php,plan,polarity,poor,popular,posisitve,positive,possibility,possible,potential,precision,precission,predict,preprocessing,present,previous,primarily,priority,problem,problems,process,processing,produce,produces,promotion,promotional,proper,properly,proposes,prove,proved,provide,provides,public,quality,quickly,rank,rapid,reaction,reads,reap,recall,recommended,recurrent,reference,regression,related,relations,relevance,relevant,removing,represent,representation,reputation,research,resist,result,resulting,results,richer,robustness,rule,s2,s3,sadness,same,sample,score,search,secang,second,selebrities,selebrity,selected,selecting,selection,semantic,semantics,sentence,sentences,sentiment,sentiments,sentistrength,server,service,services,share,show,showed,shows,significant,similar,similarity,sms,so,social,solution,solved,someone,song,songs,sources,spam,speech,spread,spreading,statements,story,strategy,strength,student,study,such,suitable,summ,summarization,supervision,support,surprise,svm,syntactic,system,takes,taking,technique,technologies,tendency,test,tested,testing,tf,than,that,the,their,them,then,there,therefore,this,those,thoughts,three,through,thus,time,to,together,tone,topics,topsis,tradisional,training,tree,trend,trivial,trust,tulis,tweet,tweets,twitter,two,type,types,understand,unique,unit,universities,university,upload,use,used,useful,user,username,users,uses,using,usually,utilizing,value,values,various,vector,versus,very,via,video,videos,was,wax,we,web,weight,weighting,weights,were,when,where,whereas,whether,which,while,who,whose,widely,will,with,word,word2vec,words,worked,writes,written,yani,yogyakarta,zation
doc1_2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,2,0,7,0,0,0,0,0,0,0,0,4,1,0,0,0,5,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,2,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,4,0,0,4,0,0,0,0,0,0,0,1,0,3,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,1,1,0,2,0,0,0,2,0,7,0,0,1,0,2,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,1,1,0,0,0,0,0,0,0,10,0,3,0,0,0,0,0,1,0,0,1,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,1,4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,4,1,1,5,1,0,1,0,0,3,0,0,0,0,0,1,5,0,0,1,0,0,0,0,1,1,0,0,3,2,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
doc2_2017,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,2,0,7,0,0,1,0,0,0,0,0,1,0,0,0,0,7,2,0,0,0,0,0,4,0,1,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,2,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,1,0,0,0,3,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,3,0,0,0,1,0,0,0,5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,0,0,0,0,0,0,0,0,6,0,0,0,1,0,0,0,1,0,0,3,16,0,0,0,0,0,3,0,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
doc3_2018,6,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,0,1,1,1,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,2,0,0,1,1,1,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,5,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,2,0,0,0,2,2,0,2,2,0,0,1,0,0,0,0,0,6,0,0,2,0,1,3,2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,0,0,1,2,13,0,0,0,0,0,1,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,3,0,0,0
doc4_2018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,7,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,6,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,2,0,0,0,0,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,4,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,5,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,14,0,1,1,0,0,1,0,0,0,0,0,0,4,1,0,0,4,1,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0
doc5_2019,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,0,3,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,0,0,3,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0,1,2,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,5,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,2,1,1,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,3,14,0,0,0,0,0,2,0,1,0,0,0,2,5,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,0,3,1,0,0,0,0,0,0
doc6_2019,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,0,0,0,1,0,0,0,0,2,0,1,0,0,1,0,1,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,3,0,0,0,0,0,7,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,3,2,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,3,0,1,0,0,0,3,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,4,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,3,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,7,0,0,3,0,0,0,0,3,0,0,0,0,0,0,0,0,1,1,3,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,2,0,0,0,0,0,0,0,2,0,0,1,1,5,15,0,0,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,2,1,0,0,1,0,1,0,0,1,1,1,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0
doc7_2020,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,6,3,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,1,1,0,0,0,2,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,8,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,1,2,2,1,0,3,1,0,0,0,2,2,6,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,0,0,0,12,1,0,1,0,2,0,0,3,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,20,0,0,0,0,0,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,1,1,0
doc8_2020,0,0,1,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,6,1,1,0,0,0,1,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,2,0,1,1,0,0,0,10,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,2,0,0,0,1,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,4,0,0,2,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,2,0,5,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,9,0,0,0,0,0,4,1,1,0,0,0,1,2,0,0,0,0,0,0,0,1,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,2,0,1,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,4,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,13,3,0,1,1,0,3,1,0,2,1,0,0,11,0,0,0,0,0,0,0,0,0,1,0,0,6,3,1,1,2,1,0,0,1,0,0,0,0,1,1,0,2,0,1,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
doc9_2021,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,9,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,1,0,0,0,0,0,0,0,0,4,0,0,3,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,6,1,1,2,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,2,0,1,4,1,1,0,0,0,0,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,11,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,7,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,8,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,4,1,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,1,0,0,0,0,0,0,0,0,0,0,4,18,0,0,1,0,0,3,0,0,0,0,0,0,8,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
doc10_2021,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,2,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,2,0,0,0,0,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,8,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,2,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,1,3,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,9,0,0,0,0,1,2,0,0,0,0,0,0,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,2,1,0,1,0,0,1,0,0,0


In [209]:
from sklearn.metrics.pairwise import cosine_similarity

df.to_csv("Cosine.csv")
print(cosine_similarity(df, df))

[[1.         0.44776131 0.42639724 0.34649287 0.45948781 0.38238644
  0.37618985 0.48691767 0.41139379 0.4180362 ]
 [0.44776131 1.         0.54449821 0.52467817 0.62995761 0.57604994
  0.61145794 0.52149855 0.57673071 0.560678  ]
 [0.42639724 0.54449821 1.         0.53610329 0.6170404  0.55976003
  0.58383533 0.54155332 0.52770015 0.51746572]
 [0.34649287 0.52467817 0.53610329 1.         0.59456174 0.53914792
  0.56082114 0.4564884  0.54304721 0.48697107]
 [0.45948781 0.62995761 0.6170404  0.59456174 1.         0.62912088
  0.65754186 0.53455476 0.63331462 0.64822216]
 [0.38238644 0.57604994 0.55976003 0.53914792 0.62912088 1.
  0.62103734 0.46909709 0.56706721 0.5091783 ]
 [0.37618985 0.61145794 0.58383533 0.56082114 0.65754186 0.62103734
  1.         0.50106811 0.56242252 0.53172012]
 [0.48691767 0.52149855 0.54155332 0.4564884  0.53455476 0.46909709
  0.50106811 1.         0.49870848 0.53982456]
 [0.41139379 0.57673071 0.52770015 0.54304721 0.63331462 0.56706721
  0.56242252 0.49870