In [35]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import math
from nltk.tokenize import word_tokenize

# Load the pickled raw-feature table that the next cell slices.
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# only load pickles produced by a trusted pipeline.
ori_df = pd.read_pickle("../dataset/features/raw_features.pickle")


In [36]:
# Keep only the 'Label' and 'strings' columns; 'strings' feeds both TF-IDF sections below.
# NOTE(review): `.compute()` is not a pandas DataFrame method; this call presumably
# relies on the pickle holding a lazy (e.g. dask) DataFrame — confirm.
strs_df = ori_df[['Label', 'strings']].compute()


### 1. TF_IDF Analysis on Tokens

In [None]:
from nltk import FreqDist

# English stopword list, fetched once at module level and read by remove_stopwords().
cache_stopwords = stopwords.words("english")

def tokens_gen(sen):
    '''Split one string into a list of tokens.

    :param sen: a single string
    :return: always a list: ``[]`` for an empty string, ``[sen]`` when the
             string contains no space, otherwise the tokens produced by
             ``word_tokenize``
    '''
    if ' ' not in sen:
        # Wrap the string in a list. Returning the bare string made consumers
        # that iterate the result (e.g. FreqDist in TF_Words) walk it character
        # by character, so a single-word string was counted as its individual
        # characters instead of as one token.
        return [sen] if sen else []
    return word_tokenize(sen)

def remove_stopwords(sen):
    '''Drop every whitespace-separated word of ``sen`` that appears in
    ``cache_stopwords`` and re-join the remaining words with single spaces.'''
    kept_words = []
    for candidate in sen.split():
        if candidate in cache_stopwords:
            continue
        kept_words.append(candidate)
    return ' '.join(kept_words)

def TF_Words(docs):
    '''
    Compute the corpus-wide relative frequency of every token.

    :param docs: the total lists of strings (one list of strings per document)
    :return: dict mapping each token to
             (occurrences of the token in the corpus) / (total token occurrences
             in the corpus), rounded to 5 decimals; empty dict for an empty corpus
    '''
    token_counts = {}
    total_tokens_num = 0
    for doc in docs:
        for sen in doc:
            # tokenize sentence
            freddist = FreqDist(tokens_gen(sen))
            # Count every occurrence, not just the distinct tokens of the
            # sentence: the numerator below accumulates occurrence counts, so a
            # distinct-token denominator lets the frequencies sum to more than 1.
            total_tokens_num += sum(freddist.values())

            for token, freq in freddist.items():
                token_counts[token] = token_counts.get(token, 0) + freq

    # token_counts is empty whenever total_tokens_num is 0, so no division by
    # zero can happen here.
    return {
        token: np.round(count/total_tokens_num, 5)
        for token, count in token_counts.items()
    }


def IDF_Words(tf_word_dict, docs):
    '''Compute the inverse document frequency of every word in ``tf_word_dict``.

    :param tf_word_dict: dict whose keys are the words to score (values are not read)
    :param docs: list of documents, each a list of strings
    :return: dict mapping each word to log10(num_docs / doc_num) rounded to 5
             decimals, where doc_num is the number of documents having at least
             one string that contains the word; 0.0 when no document contains it
    '''
    num_docs = len(docs)
    idf_word_dict = {}

    for word in tf_word_dict:
        # A document counts once, however many of its strings contain the word.
        # NOTE(review): `word in sen` is a substring test, not a token match, so
        # a short word is also counted for strings that merely contain it.
        doc_num = sum(1 for doc in docs if any(word in sen for sen in doc))

        if doc_num == 0:
            # Reached when a word is not a literal substring of any string (for
            # example if the tokenizer rewrote some characters) or when docs is
            # empty. The unguarded division raised ZeroDivisionError here; a
            # word with no document evidence gets a neutral weight instead.
            idf_word_dict[word] = 0.0
            continue

        idf_word_dict[word] = np.round(math.log10(num_docs/doc_num), 5)

    return idf_word_dict

def TF_IDF_Words(tf_word_dict, idf_word_dict):
    '''Multiply each word's TF by its IDF, rounded to 5 decimals.

    Every key of ``tf_word_dict`` must also be a key of ``idf_word_dict``;
    a missing key raises KeyError.
    '''
    return {
        term: np.round(tf_value * idf_word_dict[term], 5)
        for term, tf_value in tf_word_dict.items()
    }


In [None]:
# test on tf_idf_word
# Turn the 'strings' column into a plain list (one entry per row) and compute
# the corpus-wide token frequencies from it.
docs = strs_df['strings'].tolist()
tf_word_dict = TF_Words(docs)


In [None]:
# Inverse document frequency per token, then the per-token TF * IDF product.
idf_word_dict = IDF_Words(tf_word_dict, docs)
tf_idf_dict = TF_IDF_Words(tf_word_dict, idf_word_dict)

In [None]:
# Two-column table: one row per token with its TF-IDF score.
word_df = pd.DataFrame(
    list(tf_idf_dict.items()),
    columns=['word', 'value'],
)

In [None]:
# Print the tokens with a strictly positive score, highest score first.
print(
    word_df.loc[word_df['value'] > 0]
           .sort_values(by='value', ascending=False)
)


In [None]:
# word_df

In [None]:
# Collapse each row's list of strings into one space-separated document string.
strs_df['doc'] = strs_df['strings'].map(' '.join)

In [None]:
# Preview the first rows, which now include the joined 'doc' column.
strs_df.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a default TfidfVectorizer on the joined documents and transform the same
# corpus into the token TF-IDF matrix.
corpus = strs_df['doc'].tolist()
vectorizer = TfidfVectorizer()
vectorizer_model = vectorizer.fit(corpus)
X_word = vectorizer_model.transform(corpus)

In [None]:
# type(X_word)
# X_word.toarray()

In [None]:
# Persist the token TF-IDF matrix as a text file.
# NOTE(review): toarray() densifies the matrix before writing; the reload further
# down reports shape (321, 125326), i.e. roughly 40 million values written as text.
np.savetxt('../dataset/matrix/X_tf_idf_token.csv', X_word.toarray())

In [None]:
# Peek at the last ten vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new accessor when it exists and fall
# back to the old one on older installations.
if hasattr(vectorizer_model, 'get_feature_names_out'):
    feature_names = vectorizer_model.get_feature_names_out()
else:
    feature_names = vectorizer_model.get_feature_names()
feature_names[-10:]

In [52]:
# Sanity check: reload the saved token matrix and display its shape.
np.loadtxt('../dataset/matrix/X_tf_idf_token.csv').shape

(321, 125326)

### 2. TF_IDF Analysis on Sentences

In [37]:
def TF_Sen(docs):
    ''' relative frequency of every distinct sentence across the corpus
    :param docs: the lists of lists of strings
    :return: dict mapping each sentence to
             (its number of occurrences) / (total number of sentences),
             rounded to 7 decimals, keyed in order of first appearance
    '''
    occurrences = {}
    for sentences in docs:
        for sentence in sentences:
            occurrences[sentence] = occurrences.get(sentence, 0) + 1

    # Every sentence seen was counted exactly once above, so the grand total is
    # the sum of the per-sentence counts.
    sentence_total = sum(occurrences.values())

    return {
        sentence: np.round(count/sentence_total, 7)
        for sentence, count in occurrences.items()
    }


def IDF_Sen(tf_sen_dict, docs):
    '''Compute the inverse document frequency of every sentence in ``tf_sen_dict``.

    :param tf_sen_dict: dict whose keys are the sentences to score (values are not read)
    :param docs: list of documents, each a list of (hashable) sentences
    :return: dict, in the key order of ``tf_sen_dict``, mapping each sentence to
             log10(num_docs / doc_num) rounded to 7 decimals, where doc_num is
             the number of documents containing the sentence; 0.0 when no
             document contains it
    '''
    num_docs = len(docs)

    # Build all document frequencies in a single pass over the corpus instead of
    # scanning every document list once per sentence key.
    doc_counts = dict.fromkeys(tf_sen_dict, 0)
    for doc in docs:
        # set() so a sentence repeated inside one document is counted once.
        for sen in set(doc):
            if sen in doc_counts:
                doc_counts[sen] += 1

    idf_sen_dict = {}
    for sen_key, doc_num in doc_counts.items():
        if doc_num == 0:
            # The sentence occurs in no document (or docs is empty); the
            # unguarded division raised ZeroDivisionError in this case.
            idf_sen_dict[sen_key] = 0.0
        else:
            idf_sen_dict[sen_key] = np.round(math.log10(num_docs/doc_num), 7)

    return idf_sen_dict


# calculate the TF_IDF for sentences
def TF_IDF_Sen(tf_sen_dict, idf_sen_dict):
    '''Multiply each sentence's TF by its IDF, rounded to 7 decimals.

    Every key of ``tf_sen_dict`` must also be present in ``idf_sen_dict``;
    a missing key raises KeyError.
    '''
    return {
        sentence: np.round(tf_value * idf_sen_dict[sentence], 7)
        for sentence, tf_value in tf_sen_dict.items()
    }

In [38]:
# test on tf_idf_sen
# Turn the 'strings' column into a plain list (one entry per row) and compute
# the corpus-wide sentence frequencies from it.
docs = strs_df['strings'].tolist()
tf_sen_dict = TF_Sen(docs)


In [39]:
# tf_sen_dict

In [40]:
# Inverse document frequency for every distinct sentence found by TF_Sen.
idf_sen_dict = IDF_Sen(tf_sen_dict, docs)

In [41]:
# idf_sen_dict

In [42]:
# Per-sentence TF * IDF score; this dict is the lookup table used by match_value().
tf_idf_sen_dict = TF_IDF_Sen(tf_sen_dict, idf_sen_dict)

# print(tf_idf_sen_dict)

In [43]:
# Two-column table: one row per distinct sentence with its TF-IDF score.
sen_df = pd.DataFrame(
    list(tf_idf_sen_dict.items()),
    columns=['sen', 'value'],
)

In [44]:
# Display the sentence scores from highest to lowest.
sen_df['value'].sort_values(ascending=False)

9798      0.025733
8449      0.016294
76867     0.003556
134088    0.001872
134093    0.001707
            ...   
132938    0.000008
132937    0.000008
132936    0.000008
132935    0.000008
100989    0.000008
Name: value, Length: 201980, dtype: float64

In [45]:
# sen_df[sen_df['value']>0.00003]

In [46]:
# sen_df[sen_df['value']>0.00001]

In [47]:
# filter_sen_df = sen_df[sen_df['value']>0.00002]

In [48]:
# filter_sen_df

In [49]:
# sen_df

In [50]:
def match_value(sen_list, tf_idf_dict):
    '''Look up the TF-IDF value of each sentence of ``sen_list``, in order.

    Sentences with no entry in ``tf_idf_dict`` are skipped, so the result can be
    shorter than ``sen_list``.
    '''
    return [
        tf_idf_dict[sentence]
        for sentence in sen_list
        if sentence in tf_idf_dict
    ]

In [53]:
# Replace each row's list of strings by the list of their sentence TF-IDF scores.
strs_df['vector'] = strs_df['strings'].map(
    lambda sentences: match_value(sentences, tf_idf_sen_dict)
)

In [79]:
# Inspect the raw strings of one sample from the first rows.
# NOTE(review): `[0]` selects by index label, so this assumes a row labelled 0
# exists among the first five rows — TODO confirm the index is the default one.
strs_df.head()['strings'][0]

['!This program cannot be run in DOS mode.',
 '.rdata',
 '`@.bss',
 '.edata',
 '0@.idata',
 '.reloc',
 '0B.gnu_deb',
 '\x1f<dVmT',
 '};\x1fTBT2pq',
 'P<)M,M',
 'PO@MA$6',
 'FOZM[$',
 "'W`ScS",
 'WgSVSc',
 'WtSCSt',
 '.M"OM/Z',
 'P7QJVu',
 'O*OIJBJ',
 '?"*~, ',
 '@QGizdL',
 'FLOTI}',
 'xAThFF',
 'c?(!".',
 "/'A(.9",
 '1ig8.QBE',
 '#+"p? ',
 'Vt9@ZpY',
 'cQri6lF]',
 'CwB8bq',
 'NiD:ht',
 '/@FHG4M',
 '__1NJK',
 '4sP\\.In',
 'pG9<dku',
 'X9JqN0',
 'krr&7>8>vZI9G',
 '\x1fAklw_E',
 'G$hG,F.',
 'X>\x1fJl\x1fwG',
 'K`\x1f+v/!m',
 '@A{C@K',
 '9Atc_fR',
 'GB;`!_\\',
 'VL8q\\O\x1f',
 'i6yvsxi',
 'pH9\x1f`\\',
 '_$>9Am',
 '`~^L>V',
 'G".,!y',
 'Bt&P":&y',
 "!!\\O^'>u",
 '&AY"!=',
 '"*KtR8',
 ')`"Kc2@u',
 ',($j"8(U',
 '<>9F:;5',
 '4aL):O',
 "aCD^')K",
 'Yyn~$q',
 'OABo."',
 ':AH?A_i',
 "v2Vn'K",
 'f?{J@oa',
 'L0O6Qh',
 'Q\x1fQ(Q*Q$[>Q',
 '@lT"?$',
 ',H4Gj8',
 'Kq2w9gw',
 '~R^AHe',
 "'Hbk\\_",
 'NR9K+9$B',
 ')\\\x1f(+`=',
 'FLH79!=',
 'IhL,[AL',
 'Bj(OdV',
 'KGCN,Ze',
 'LL?I9o',
 "*\\G9C'*.8",
 '7:?

In [83]:
# Right-pad every TF-IDF vector with zeros up to the length of the longest one so
# the rows can later be stacked into a rectangular matrix.
vector_lists = strs_df['vector'].tolist()
# default=0 keeps an empty frame from raising ValueError in max().
max_len = max((len(value_list) for value_list in vector_lists), default=0)

num_array = []
for value_list in vector_lists:
    # Copy into a fresh zero-filled array instead of extend()-ing value_list:
    # tolist() hands back the very list objects stored in strs_df['vector'], so
    # padding them in place silently rewrote that column as well.
    padded = np.zeros(max_len)
    padded[:len(value_list)] = value_list
    num_array.append(padded)

In [84]:
# Display the padded vectors (one array per row of strs_df).
num_array

[array([2.457e-04, 9.310e-05, 1.430e-05, ..., 0.000e+00, 0.000e+00,
        0.000e+00]),
 array([5.710e-05, 5.710e-05, 2.457e-04, ..., 0.000e+00, 0.000e+00,
        0.000e+00]),
 array([2.457e-04, 4.550e-05, 4.170e-05, ..., 0.000e+00, 0.000e+00,
        0.000e+00]),
 array([0.0002457, 0.0002859, 0.000282 , ..., 0.       , 0.       ,
        0.       ]),
 array([2.457e-04, 4.550e-05, 4.170e-05, ..., 0.000e+00, 0.000e+00,
        0.000e+00]),
 array([5.25e-05, 5.25e-05, 5.25e-05, ..., 0.00e+00, 0.00e+00, 0.00e+00]),
 array([2.457e-04, 4.900e-05, 4.900e-05, ..., 0.000e+00, 0.000e+00,
        0.000e+00]),
 array([2.457e-04, 8.300e-06, 2.859e-04, ..., 0.000e+00, 0.000e+00,
        0.000e+00]),
 array([2.457e-04, 1.990e-05, 1.990e-05, ..., 0.000e+00, 0.000e+00,
        0.000e+00]),
 array([2.457e-04, 1.127e-04, 1.430e-05, ..., 0.000e+00, 0.000e+00,
        0.000e+00]),
 array([2.457e-04, 1.990e-05, 1.430e-05, ..., 0.000e+00, 0.000e+00,
        0.000e+00]),
 array([2.457e-04, 4.900e-05, 4.900

In [85]:
# Stack the padded vectors into one 2-D array and persist it as a text file.
np.savetxt('../dataset/matrix/X_tf_idf_sen.csv', np.array(num_array))

In [86]:
import numpy as np

# Sanity check: reload the saved sentence matrix and display its shape.
np.loadtxt('../dataset/matrix/X_tf_idf_sen.csv').shape

(321, 2068)