In [None]:
import sys
sys.path.append('./../')

In [None]:
import os
import re
import math
import copy
import warnings
import itertools
from typing import List, Dict
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from definitions import *

warnings.filterwarnings('ignore')

In [None]:
# Build the CSV location from its path components under DATASET_DIR, then load it.
dataset_path_parts = ('cryptonews', 'color', 'dataset.csv')
dataset_csv_path = os.path.join(DATASET_DIR, *dataset_path_parts)
sdf = pd.read_csv(filepath_or_buffer=dataset_csv_path)

sdf

In [None]:
# Work on an independent frame holding only the two columns the statistics need,
# so the loaded frame `sdf` is never modified.
df = sdf.loc[:, ['text', 'label']].copy()

df

# Stats

In [None]:
def has_digit(inputString: str) -> bool:
    """Tell whether a string contains at least one digit character.

    Args:
        inputString: The text to scan.

    Returns:
        True as soon as any character reports ``str.isdigit()``; False when no
        character does (including for the empty string).
    """
    for character in inputString:
        if character.isdigit():
            return True
    return False

### Part A: Sample count

In [None]:
# Number of samples = number of rows in the working frame.
sample_count = df.shape[0]

sample_count

### Part B: Sentence count

In [None]:
# One row is counted as one sentence, so this equals the row count.
# NOTE(review): assumes every `text` entry is a single sentence - confirm against the data.
sentence_count = len(df.index)

sentence_count

### Part C: word count

In [None]:
# Whitespace-tokenise every text, then drop any token that contains a digit.
all_tokens = itertools.chain.from_iterable(text.split() for text in df.text)
words_list = list(itertools.filterfalse(has_digit, all_tokens))
word_count = len(words_list)

word_count

### Part D: unique word count

In [None]:
# Distinct digit-free tokens across the whole corpus (case-sensitive, as tokenised).
vocabulary = set(words_list)
unique_word_count = len(vocabulary)

unique_word_count

### Part D (continued): unique word count per label

In [None]:
# Map each digit-free token to the label of the texts it occurs in.
# A token seen under more than one label is marked with the sentinel value 2.
word2label = {}
for row_position in tqdm(range(len(df))):
    text, text_label = df.iloc[row_position]
    for token in text.split():
        if has_digit(token):
            continue
        # First sighting records the current label; later sightings return the stored one.
        recorded_label = word2label.setdefault(token, text_label)
        if recorded_label != text_label and recorded_label != 2:
            word2label[token] = 2

list(word2label.items())[:10]

In [None]:
# Split the vocabulary by the value stored in word2label:
# 2 -> seen under more than one label, 1 -> green only, 0 -> red only.
common_words_set = {token for token, tag in word2label.items() if tag == 2}
green_words_set = {token for token, tag in word2label.items() if tag == 1}
red_words_set = {token for token, tag in word2label.items() if tag == 0}

for caption, bucket in (
    ("Common word count", common_words_set),
    ("Green  word count", green_words_set),
    ("Red    word count", red_words_set),
):
    print(caption, len(bucket))

### Part E: 10 most frequent class-exclusive words of each class

In [None]:
# Tally how often each token occurs, then rebuild the dict ordered from most to
# least frequent (ties keep their first-seen order because the sort is stable).
word2count = {}
for token in words_list:
    word2count[token] = word2count.get(token, 0) + 1
word2count = dict(sorted(word2count.items(), key=lambda pair: pair[1], reverse=True))

list(word2count.items())[:10]

In [None]:
# Words that belong to exactly one of the two class vocabularies.
green_only_words = green_words_set - red_words_set
red_only_words = red_words_set - green_words_set

# Filtering word2count keeps its descending-frequency order, so the first ten
# entries of each filtered dict are that class's most frequent exclusive words.
green_word2count = {token: freq for token, freq in word2count.items() if token in green_only_words}
red_word2count = {token: freq for token, freq in word2count.items() if token in red_only_words}

green_repetitive_words_list = list(itertools.islice(green_word2count.items(), 10))
red_repetitive_words_list = list(itertools.islice(red_word2count.items(), 10))

for heading, top_words in (
    ("10 most repetitive green words:", green_repetitive_words_list),
    ("\n10 most repetitive red words:", red_repetitive_words_list),
):
    print(heading)
    for token, freq in top_words:
        print('\t-', token, freq)

### Part F: Relative Normalized Frequency

In [None]:
# Totals over class-exclusive words only. Their definition is unchanged because
# the TF-IDF cell reads `green_count` / `red_count` as its denominators.
green_count = sum(green_word2count.values())
red_count = sum(red_word2count.values())

# Relative Normalized Frequency of a shared word:
#     RNF(w) = (count of w in green texts / all green tokens)
#            / (count of w in red texts   / all red tokens)
# The corpus-wide total `word2count[w]` cannot be used on both sides: it cancels
# and leaves the same constant for every word. So occurrences are counted per
# class here (label 1 = green, label 0 = red, as used for the class word sets).
class2word2count = {1: {}, 0: {}}
for text, text_label in zip(df['text'], df['label']):
    per_class_counts = class2word2count.get(text_label)
    if per_class_counts is None:
        continue  # a label other than green/red contributes to neither side
    for token in text.split():
        if not has_digit(token):
            per_class_counts[token] = per_class_counts.get(token, 0) + 1

green_class_word2count = class2word2count[1]
red_class_word2count = class2word2count[0]
green_token_total = sum(green_class_word2count.values())
red_token_total = sum(red_class_word2count.values())

common_words2rnf = {}
for word in common_words_set:
    green_occurrences = green_class_word2count.get(word, 0)
    red_occurrences = red_class_word2count.get(word, 0)
    if not green_occurrences or not red_occurrences:
        # The ratio is undefined when the word is missing from one class;
        # skipping it also rules out a division by zero.
        continue
    common_words2rnf[word] = (green_occurrences / green_token_total) / (red_occurrences / red_token_total)

# Highest ratio first: words most over-represented in green relative to red.
common_words2rnf = {w: rnf for w, rnf in sorted(common_words2rnf.items(), key=lambda x: x[1], reverse=True)}
list(common_words2rnf.items())[:10]

### Part G: TF-IDF

In [None]:
# One entry per class: (class vocabulary, per-word counts, total count, display name).
items = [
    (green_words_set, green_word2count, green_count, 'green'),
    (red_words_set, red_word2count, red_count, 'red'),
]

for vocabulary, term_counts, term_total, class_name in items:
    scored_terms = []
    for term in vocabulary:
        term_frequency = term_counts[term] / term_total
        # The stored label doubles as "number of classes containing the term":
        # a stored 0 is replaced by 1, any other stored value is used as-is.
        classes_with_term = word2label[term] or 1
        inverse_doc_frequency = -np.log(classes_with_term / 2)
        scored_terms.append((term, term_frequency * inverse_doc_frequency))

    # Stable descending sort by score, then keep the result as an ordered dict.
    scored_terms.sort(key=lambda pair: pair[1], reverse=True)
    word2tfidf = dict(scored_terms)

    print("10 word with greatest tf-idf value in {}".format(class_name))
    for term, score in scored_terms[:10]:
        print("\t- {:<20}{:.4f}".format(term, score))
    print()

In [None]:
# Plot the most frequent tokens as a labelled bar chart.
# Passing the raw token list to `plt.hist` creates a categorical axis with one
# tick per unique token, which is unreadable; `plt.plot()` with no arguments
# draws nothing, so the figure is rendered with `plt.show()` instead.
top_words_to_plot = 20
top_word_counts = pd.Series(words_list, dtype=object).value_counts().head(top_words_to_plot)

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(top_word_counts.index.astype(str), top_word_counts.to_numpy())
ax.set(
    title="{} most frequent words".format(top_words_to_plot),
    xlabel="Word",
    ylabel="Occurrences",
)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()