In [7]:
import pandas as pd


from sklearn.feature_extraction.text import  CountVectorizer
import re
import numpy as np
import scipy.stats as stats
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [14]:
def find_counts(data, text_col = "tweet_text", min_df=2, ngrams=(1,1), num_words_to_print=25):
    """Print corpus summary statistics and build a term-document matrix.

    The function works in two passes:

    1. A rough whitespace-split count over the raw documents, printed as a
       quick overview (documents, distinct terms, tokens, rare-term shares).
    2. A ``CountVectorizer`` pass (lowercased, English stop words removed,
       ``@`` and ``#`` kept inside tokens) from which the returned tables
       are built.

    Parameters
    ----------
    data : pandas.Series
        One document per element; must support the ``.str`` accessor.
    text_col : str, optional
        Not used by this function. Kept only so that existing callers that
        pass it keep working.
    min_df : int or float, optional
        Forwarded to ``CountVectorizer(min_df=...)``.
    ngrams : tuple of (int, int), optional
        Forwarded to ``CountVectorizer(ngram_range=...)``.
    num_words_to_print : int, optional
        Number of rows printed from the top and from the bottom of the
        sorted counts table. This used to be read from a notebook-level
        global, which made the function fail on a fresh kernel.

    Returns
    -------
    tdm : pandas.DataFrame
        Dense term-document matrix: one row per document (default integer
        index, not the index of ``data``), one column per vocabulary term.
    vocab : list of str
        Vocabulary terms in the column order of ``tdm``.
    counts : pandas.DataFrame
        One row per term (indexed by the term) with the columns ``Phrase``,
        ``Characters``, ``Terms``, ``Count``, ``Count Pct``, ``Docs`` and
        ``Docs Pct``, sorted by ``Count`` in descending order.

    Raises
    ------
    ValueError
        If ``data`` has no documents or no whitespace-separated tokens.
    """
    n_docs = data.shape[0]
    if n_docs == 0:
        raise ValueError('find_counts() needs at least one document')

    # Quick and dirty counter of terms and tokens (before we whittle down later)
    results = Counter()
    for tokens in data.str.split():
        results.update(tokens)

    n_terms = len(results)
    if n_terms == 0:
        # Without this guard the percentage computations below divide by zero.
        raise ValueError('find_counts() found no tokens in the documents')
    n_tokens = sum(results.values())

    print('Number of documents: {}'.format(n_docs))
    print('Number of word forms (terms): {}'.format(n_terms))
    print('Number of words (tokens): {}'.format(n_tokens))
    print('Mean words per document: {:.1f}'.format(n_tokens / n_docs))
    print('Mean term occurance: {:.1f}'.format(np.mean(list(results.values()))))
    for m in [1, 5, 10, 100]:
        n_rare = sum(1 for v in results.values() if v <= m)
        print('Number (Pct) of terms occuring <= {}: {} ({:.1f})'.format(m, n_rare, 100*n_rare/n_terms))

    # We override the token_pattern in order to keep @signs and #hashtags
    vec = CountVectorizer(token_pattern = '[a-zA-Z0-9@#]+',
                          stop_words="english",
                          lowercase=True,
                          min_df=min_df,
                          ngram_range=ngrams,
                          max_features=10000)

    bow = vec.fit_transform(data)
    # get_feature_names() is absent from newer scikit-learn releases; prefer the
    # replacement when it exists and keep returning a plain list either way.
    if hasattr(vec, 'get_feature_names_out'):
        vocab = vec.get_feature_names_out().tolist()
    else:
        vocab = vec.get_feature_names()
    tdm = pd.DataFrame(bow.toarray(), columns=vocab)

    # Per-term totals, computed once and reused for the derived columns.
    term_totals = tdm.sum()
    doc_freq = tdm.astype(bool).sum()
    n_tokens = term_totals.sum()
    n_docs = tdm.shape[0]
    phrases = list(tdm.columns)
    counts = pd.DataFrame(data={'Phrase': phrases,
                                'Characters': [len(x) for x in phrases],
                                'Terms': [x.count(' ')+1 for x in phrases],
                                'Count': term_totals,
                                'Count Pct': term_totals / n_tokens,
                                'Docs': doc_freq,
                                'Docs Pct': doc_freq / n_docs,
                          })

    counts = counts.sort_values(by=['Count'], ascending=False)

    print('Top {} words:'.format(num_words_to_print))
    print(counts.head(num_words_to_print))
    print('\nBottom {} words:'.format(num_words_to_print))
    print(counts.tail(num_words_to_print))

    return tdm, vocab, counts

In [15]:
# Load 'clean.csv' and profile the vocabulary of its 'text' column with
# find_counts, using 1- to 3-word n-grams and passing min_df=10 through to
# the vectorizer.
df = pd.read_csv('clean.csv')
tdm, vocab, counts = find_counts(df['text'], min_df=10, ngrams=(1,3))

Number of documents: 10003
Number of word forms (terms): 4518
Number of words (tokens): 119530
Mean words per document: 11.9
Mean term occurance: 26.5
Number (Pct) of terms occuring <= 1: 1940 (42.9)
Number (Pct) of terms occuring <= 5: 3200 (70.8)
Number (Pct) of terms occuring <= 10: 3623 (80.2)
Number (Pct) of terms occuring <= 100: 4341 (96.1)
Top 25 words:
                  Phrase  Characters  Terms  Count  Count Pct  Docs  Docs Pct
card                card           4      1   2682   0.044147  2578  0.257723
t                      t           1      1   1582   0.026041  1521  0.152054
account          account           7      1   1352   0.022255  1288  0.128761
money              money           5      1   1133   0.018650  1068  0.106768
transfer        transfer           8      1   1084   0.017843  1025  0.102469
payment          payment           7      1    751   0.012362   709  0.070879
need                need           4      1    698   0.011490   675  0.067480
cash        

In [16]:
# Run the same vocabulary profile (min_df=10, 1- to 3-word n-grams) on the
# 'text' column of 'Banking77_trimmed_updatedLabels_load.csv'.
# NOTE: this rebinds tdm, vocab and counts, so any values those names held
# before this cell ran are no longer reachable afterwards.
df2 = pd.read_csv('Banking77_trimmed_updatedLabels_load.csv')
tdm, vocab, counts = find_counts(df2['text'], min_df=10, ngrams=(1,3))

Number of documents: 8575
Number of word forms (terms): 4230
Number of words (tokens): 103776
Mean words per document: 12.1
Mean term occurance: 24.5
Number (Pct) of terms occuring <= 1: 1775 (42.0)
Number (Pct) of terms occuring <= 5: 3005 (71.0)
Number (Pct) of terms occuring <= 10: 3389 (80.1)
Number (Pct) of terms occuring <= 100: 4067 (96.1)
Top 25 words:
                  Phrase  Characters  Terms  Count  Count Pct  Docs  Docs Pct
card                card           4      1   2227   0.043298  2133  0.248746
t                      t           1      1   1300   0.025275  1251  0.145889
account          account           7      1   1198   0.023292  1137  0.132595
money              money           5      1    956   0.018587   895  0.104373
transfer        transfer           8      1    880   0.017109   828  0.096560
cash                cash           4      1    630   0.012249   615  0.071720
need                need           4      1    617   0.011996   597  0.069621
payment      