In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils import *

import nltk
from nltk.corpus import stopwords

from tqdm import tqdm

In [2]:
# Load the notebook settings from config.yaml into `config`.
# NOTE(review): yaml_read is not defined in this notebook; presumably it is
# provided by `from utils import *` — confirm.
config = yaml_read('config.yaml')
# Bare expression: the notebook echoes the loaded configuration as cell output.
config

{'dataset': './dataset.csv',
 'model': {'max_seq_len': 75,
  'n_label': 13,
  'walk_len': 10,
  'sg': 1,
  'vector_size': 10,
  'min_count': 5,
  'window': 2,
  'workers': 2,
  'seed': 0}}

In [None]:
def remove_stopwords(text, stopwords=stopwords.words("english")):
    """Tokenize ``text`` and drop every token that is a stopword.

    Parameters
    ----------
    text : str
        Text handed to ``nltk.word_tokenize``.
    stopwords : iterable of str, optional
        Words to discard; each token is lower-cased before the comparison.
        Defaults to NLTK's English stopword list, which is evaluated once,
        when this function is defined. Inside the function body this
        parameter shadows the ``nltk.corpus.stopwords`` module.

    Returns
    -------
    list of str
        The lower-cased, stripped tokens that are not in ``stopwords``,
        in their original order.
    """
    # Build a set once per call so every membership test is a hash lookup
    # rather than a linear scan of the stopword list for each token.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)

    # Lower-case each token a single time and reuse it for both the filter
    # and the returned value.
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))
    return [token.strip() for token in lowered_tokens if token not in stopword_set]


def clean_content(text):
    """Normalize one raw text string and return its non-stopword tokens.

    Steps, in order: collapse whitespace runs to single spaces and trim,
    lower-case, delete URL-like substrings, delete ``@mentions`` and ``#``
    characters, discard every non-ASCII character (this is how emojis are
    removed), then tokenize and filter through ``remove_stopwords``.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    list of str
        Whatever ``remove_stopwords`` returns for the cleaned text.
    """
    collapsed = re.sub(r'\s+', ' ', text).strip()
    lowered = collapsed.lower()
    # URLs are removed after whitespace collapsing, exactly as before.
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    # Mentions are dropped entirely; for hashtags only the '#' symbol goes.
    without_tags = re.sub(r'\@\w+|\#', '', without_urls)
    # Encoding with errors='ignore' silently drops anything outside ASCII.
    ascii_only = without_tags.encode('ascii', 'ignore').decode('ascii')
    return remove_stopwords(ascii_only)


In [None]:
# Read the dataset CSV whose location is stored under the 'dataset' config key.
# NOTE: despite its name, data_dir is passed straight to pd.read_csv, i.e. it
# is used as a file path (the config output above shows './dataset.csv').
data_dir = config['dataset']
df = pd.read_csv(data_dir)
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Show how many rows carry each sentiment label (most frequent first).
df.sentiment.value_counts()

In [None]:
# Count the rows per sentiment once and reuse the result for both the tick
# labels and the bar heights (previously value_counts() ran twice).
sentiment_counts = df.sentiment.value_counts()
sentiment_lb = sentiment_counts.index.to_list()
sentiment_vl = sentiment_counts.to_list()

# One bar per sentiment, placed at consecutive integer positions.
bar_positions = range(len(sentiment_vl))

plt.figure(figsize=(12, 8))
plt.bar(bar_positions, sentiment_vl)
plt.xticks(bar_positions, sentiment_lb, rotation=60)
plt.xlabel('Sentiments')
# The bar heights are raw value_counts() totals, not proportions, so the
# axis is labelled as a count.
plt.ylabel('Number of comments per sentiment')

In [None]:
# Distribution of comment lengths, measured in tokens left after cleaning.
# `content` ends up holding one token list per row of df.content.
content = df.content.apply(clean_content)

# Maps token count -> number of comments with that many tokens.
word_len_prop = {}

for row in tqdm(content, desc='In Progress...'):
    word_len = len(row)
    # dict.get supplies 0 for a length not seen yet, so no try/except is
    # needed. The previous bare `except:` also caught unrelated errors and
    # KeyboardInterrupt, silently resetting the count to 1.
    word_len_prop[word_len] = word_len_prop.get(word_len, 0) + 1

In [None]:
# Rebuild the length -> frequency mapping with keys in ascending order so the
# bars run from the shortest comments to the longest.
sorted_word_dict = {length: word_len_prop[length] for length in sorted(word_len_prop)}

# Materialize labels and heights once; bars sit at consecutive integer slots
# (one per distinct length), and the real lengths are shown as tick labels.
length_labels = list(sorted_word_dict.keys())
length_counts = list(sorted_word_dict.values())
bar_slots = range(len(length_counts))

plt.figure(figsize=(12, 8))
plt.bar(bar_slots, length_counts)
plt.xticks(bar_slots, length_labels, rotation=60)
plt.xlabel('Length of each comment')
plt.ylabel('Number of comment\'s length')