In [1]:
import psycopg2
import nltk
import pandas as pd
import numpy as np
# import pyodbc
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def execQuery(query):
    """Run ``query`` against the local PostgreSQL database and return all rows.

    A new connection is opened for every call. The statement is executed and
    the whole result set is fetched, so ``query`` must produce rows.

    Parameters
    ----------
    query : str
        SQL text handed unchanged to ``cursor.execute``.

    Returns
    -------
    The value of ``cursor.fetchall()`` on success, or ``None`` when connecting
    or executing failed (the error is printed, not re-raised).
    """
    # Pre-bind both handles so the ``finally`` clause can tell how far we got
    # without touching an unbound name.
    connection = None
    cursor = None
    succeeded = False
    try:
        # NOTE(review): credentials are hard-coded here; consider reading them
        # from the environment before sharing this notebook.
        connection = psycopg2.connect(user = "postgres",
                                      password = "root",
                                      host = "localhost",
                                      port = "5432",
                                      database = "postgres")
        cursor = connection.cursor()
        cursor.execute(query)
        record = cursor.fetchall()
        succeeded = True
        return record
    except (Exception, psycopg2.Error) as error :
        print ("Error while connecting to PostgreSQL", error)
    finally:
        # Release whatever was opened, even when the query itself failed;
        # previously a failed query left the connection open.
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()
        if succeeded:
            print("Executed query and closed connection.")

In [None]:
# Fetch the `content` column of every row in the `article` table.
# execQuery returns the fetched rows, or None if the query failed.
articlesContent = execQuery("""Select content
from article""")

In [3]:
# Usage: set the CSV location, the sample size and the random seed below.
# NOTE(review): `filepath` is an absolute path on one machine; switch to the
# relative alternative on the next line when running elsewhere.
filepath = '/home/daniel/OneDrive/KUuni/DataScience/Python/small.csv'
#filepath = 'news_sample.csv' # <- overwrite for setup
s = 250                    # number of rows requested from DataFrame.sample(n=s)
seed = 1                     # random_state passed to DataFrame.sample, for reproducible sampling

# Read the CSV using its first column as the index, then draw the random sample.
df = pd.read_csv(filepath, index_col = [0]).sample(n=s, random_state=seed)

In [64]:
import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# https://medium.com/biaslyai/beginners-guide-to-text-preprocessing-in-python-2cbeafbf5f44
# https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk
# https://towardsdatascience.com/nlp-for-beginners-cleaning-preprocessing-text-data-ae8e306bef0f
# https://towardsdatascience.com/an-introduction-to-tweettokenizer-for-processing-tweets-9879389f8fe7

# COMPARE TOKENIZERS
# https://miro.medium.com/max/1400/1*FLVWAVL1pkAOpN9CoVBehA.png

# series_of_str -> [ nltk.word_tokenize(string) for string in words ]

# Two toy documents with an overlapping vocabulary, used to sanity-check
# the tokenizer and the TF-IDF code on a result that is easy to verify by hand.
df_1 = pd.Series(
    ['the sky is blue', 'blue not is sky the'],
    name='content',
).to_frame()

# Tokenization of strings - not fast when used through apply; a faster apply
# library or a list comprehension may be worth trying.
def str_tokenizer(str_words):
    """Split a string into word tokens and lemmatize each token.

    Parameters
    ----------
    str_words : str
        Raw text of one document.

    Returns
    -------
    list
        The lemmatized tokens, in the order produced by ``word_tokenize``.
    """
    tokens = word_tokenize(str_words)

    # Lemmatization is used here; stemming with SnowballStemmer would be the alternative.
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

# choose df-column to tokenize
# Tokenize and lemmatize every document of the chosen column
series_1 = df_1['content'].apply(str_tokenizer)

# Plain NLTK tokenization (no lemmatization) of the same column,
# displayed as the cell output for comparison
list(map(nltk.word_tokenize, df_1['content']))

[nltk_data] Downloading package wordnet to /home/daniel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[['the', 'sky', 'is', 'blue'], ['blue', 'not', 'is', 'sky', 'the']]

In [63]:
def tf_idf(series_of_lists_of_tokens):
    """Return the TF-IDF matrix for a Series whose rows are lists of tokens.

    Definitions used (as given on http://www.tfidf.com/):

        TF(t, d) = (occurrences of term t in document d) / (number of terms in d)
        IDF(t)   = log_e(number of documents / number of documents containing t)

    Parameters
    ----------
    series_of_lists_of_tokens : pandas.Series
        One list of tokens per row; the index label identifies the document.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per distinct token, holding
        TF * IDF.
    """
    # Count vectors.
    #   explode  : flatten to one token per row (the document label is repeated)
    #   groupby  : group the tokens of each document (index level 0)
    #   unstack  : pivot tokens into columns -> rows are documents
    # Tokens absent from a document come out as NaN and are set to 0.
    word_vec = (
        series_of_lists_of_tokens
        .explode()
        .groupby(level=0)
        .value_counts()
        .unstack(level=1)
        .fillna(0)
    )

    # Term frequencies: divide every row by that document's token total
    tf = word_vec.divide(word_vec.sum(axis='columns'), axis='index')

    # Inverse document frequencies: one value per token (column)
    idf = np.log(len(tf) / (word_vec > 0).sum())

    # Scale every column of tf by its token's idf. A Series aligned on the
    # column labels is used on purpose: multiplying by a one-row DataFrame via
    # a NumPy ufunc aligns on the row index in current pandas and turns every
    # row except the first into NaN.
    return tf.multiply(idf, axis='columns')

# Demo: TF-IDF matrix of the sample documents; series_1 must already have been
# created by an earlier cell.
tf_idf(series_1)

sent,blue,is,not,sky,the
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.138629,0.0,0.0


In [None]:
### BELOW IS DEBUG CODE and scratchpad

In [56]:
# Scratchpad: check that the simple and the fast term-frequency computations
# agree (column order may differ).
# NOTE: this cell rebinds df_1 and series_1.
df_1 = pd.DataFrame({'sent': [
                     'the sky is blue',
                     'blue not is sky the']})
series_1 = df_1.sent.str.split()

# Simple version: count the tokens of each document, then divide by the
# document length. Series.value_counts is used because the top-level
# pd.value_counts function is deprecated.
word_vec = pd.DataFrame(
    [pd.Series(tokens).value_counts() for tokens in series_1],
    index=series_1.index,
).fillna(0)
tf1 = word_vec.divide(word_vec.sum(axis='columns'), axis='index')

print(tf1.T, "\n")

# Complex version, but fast on large datasets
tf2 = series_1.explode().groupby(level=0).value_counts(normalize=True).unstack(level=1).fillna(0)

print(tf2.T, "\n")

0    1
the   0.25  0.2
blue  0.25  0.2
is    0.25  0.2
sky   0.25  0.2
not   0.00  0.2 

         0    1
sent           
blue  0.25  0.2
is    0.25  0.2
not   0.00  0.2
sky   0.25  0.2
the   0.25  0.2 



In [18]:
# Term frequencies via an explicit long-format frame: one row per
# (document, token) pair, grouped by the document label.
# series_1 is used because `series_of_lists_of_tokens` exists only as a
# parameter of tf_idf and is undefined at notebook level on a fresh kernel.
df_tokens = series_1.explode().to_frame().reset_index().rename(columns={'index':'content_index'})
tf2 = df_tokens.groupby('content_index')[df_tokens.columns[1]].value_counts(normalize=True).unstack(level=1).fillna(0)
tf2

sent,blue,is,not,sky,the
content_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.25,0.25,0.0,0.25,0.25
1,0.2,0.2,0.2,0.2,0.2


In [60]:
# Raw token counts: one row per document, one column per token
word_vec = (
    series_1.explode()
    .groupby(level=0)
    .value_counts()
    .unstack(level=1)
    .fillna(0)
)
print(word_vec)

# Divide each row by its total to turn the counts into term frequencies
print(word_vec.div(word_vec.sum(axis=1), axis=0))

sent  blue   is  not  sky  the
0      1.0  1.0  0.0  1.0  1.0
1      1.0  1.0  1.0  1.0  1.0
sent  blue    is  not   sky   the
0     0.25  0.25  0.0  0.25  0.25
1     0.20  0.20  0.2  0.20  0.20


In [19]:
%%timeit
series_explode = series_of_lists_of_tokens.explode()

tf2 = series_explode.groupby(level=0).value_counts(normalize=True).unstack(level=1).fillna(0)
tf2

2.04 ms ± 105 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
# Term frequencies per document, computed from the exploded token series.
# series_1 is used because `series_of_lists_of_tokens` exists only as a
# parameter of tf_idf and is undefined at notebook level on a fresh kernel.
series_explode = series_1.explode()

# Reuse the exploded series instead of exploding a second time
tf2 = series_explode.groupby(level=0).value_counts(normalize=True).unstack(level=1).fillna(0)
tf2

sent,blue,is,not,sky,the
0,0.25,0.25,0.0,0.25,0.25
1,0.2,0.2,0.2,0.2,0.2


In [70]:
# Term frequencies per document, computed from the exploded token series.
# series_1 is used because `series_of_lists_of_tokens` exists only as a
# parameter of tf_idf and is undefined at notebook level on a fresh kernel.
series_explode = series_1.explode()

# Reuse the exploded series instead of exploding a second time
tf2 = series_explode.groupby(level=0).value_counts(normalize=True).unstack(level=1).fillna(0)
tf2

   sent
0  blue    0.25
   is      0.25
   sky     0.25
   the     0.25
1  blue    0.20
   is      0.20
   not     0.20
   sky     0.20
   the     0.20
Name: sent, dtype: float64