In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.decomposition import PCA, NMF, TruncatedSVD
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
import functions as f
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
import pickle

#### functions

In [4]:
def get_wordnet_pos(treebank_tag):
    """Translate a treebank POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [5]:
def to_lemma(text):
    """Scrub, tokenize and lemmatize a piece of text.

    Parameters
    ----------
    text : object
        Text to process; anything that is not a ``str`` is converted with
        ``str()`` first.

    Returns
    -------
    str
        The lemmas concatenated as ``" lemma,"`` chunks (leading space and
        trailing comma on each), or ``""`` when no token survives filtering.
    """
    if not isinstance(text, str):
        text = str(text)
    # remove nonsense (scrub_words comes from the project-local `functions` module)
    scrubbed = f.scrub_words(text)
    # lower case the tokens for better frequency / tf-idf statistics
    tokens_lower = [word.lower() for word in word_tokenize(scrubbed)]
    # remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_desc = [w for w in tokens_lower if w not in stop_words]
    # the lemmatizer needs a part of speech for every token
    tagged = nltk.pos_tag(filtered_desc)
    lem = WordNetLemmatizer()
    # join builds the result in one pass instead of repeated string concatenation
    return "".join(
        f" {lem.lemmatize(word, get_wordnet_pos(tag))}," for word, tag in tagged
    )

#### load data / start here

In [3]:
# Load the previously saved lemmatized text from disk rather than recomputing
# it, because that took a loooong time.
df_lem = pd.read_csv('data/S_lemmed.csv')

In [133]:
# Pull out the lemmatized-text column and replace missing values with empty
# strings so every entry is a str for the string handling further down.
# The assignment has to come first: the sanity check below needs `lemmed`
# to exist when the notebook is run on a fresh kernel.
lemmed = df_lem["lemmed"].fillna("")
len(lemmed), type(lemmed)

In [134]:
# Sanity check: the text and the labels should have the same number of rows.
# NOTE(review): `target` is not created in any cell visible here — confirm
# where it is loaded so the notebook survives Restart & Run All.
len(lemmed), len(target)

(14337, 14337)

In [135]:
# Blank out entries whose entire value is the string " nan," (Series.replace
# with a scalar matches whole values, not substrings), modifying `lemmed` in place.
# NOTE(review): presumably these rows come from lemmatizing missing text — confirm.
lemmed.replace(" nan,", "", inplace=True)

In [136]:
# Rebuild every document keeping only the comma-separated chunks longer than
# three characters; each kept chunk gets its trailing comma back.
# Chunks are not stripped, so a leading space counts toward the length.
lem = list(lemmed)
new_lemmed = [
    "".join(f"{chunk}," for chunk in doc.split(",") if len(chunk) > 3)
    for doc in lem
]
            

In [361]:
# Quick check of the container type holding the filtered documents.
type(new_lemmed)

list

In [82]:
# frequency distribution of individual words across all documents.
# FreqDist counts the items it is handed, so it has to be fed tokens: passing
# the list of documents would count whole description strings instead of words.
fdist = FreqDist(
    token.strip()
    for doc in new_lemmed
    for token in doc.split(",")
    if token.strip()
)
# most common words
#fdist.most_common(10)

### Feature Generation

In [18]:
# Bag-of-words document-term matrix; the vectorizer really needs an iterable
# of documents such as a Series.
cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    max_features=5000,
)
text_counts = cv.fit_transform(lemmed)
text_counts

<45x43 sparse matrix of type '<class 'numpy.int64'>'
	with 44 stored elements in Compressed Sparse Row format>

In [191]:
# TF-IDF weighting: a normalized version of the document-term matrix,
# capped at the 10000 most frequent terms.
tf = TfidfVectorizer(
    stop_words='english',
    analyzer="word",
    max_features=10000,
)
text_tf = tf.fit_transform(new_lemmed)
text_tf

<14337x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 955062 stored elements in Compressed Sparse Row format>

### Dimensionality Reduction

In [203]:
# NMF with 20 components, randomly initialised with a fixed seed.
# Alternatives still to try: TruncatedSVD, PCA, or an SVD via np.linalg.
model = NMF(
    n_components=20,
    init='random',
    random_state=42,
)

In [204]:
# Factorize the transposed TF-IDF matrix (terms x documents), so W has one
# row per term and H (the learned components) has one column per document.
W = model.fit_transform(text_tf.T)
H = model.components_

In [205]:
# X = text_tf.T (terms x docs) -> X[:, j] is the jth doc
# W (terms x topics)           -> W[:, k] is the kth topic
# H (topics x docs)            -> H[k, j] is the importance of the kth topic to the jth doc
W.shape, H.shape

((10000, 20), (20, 14337))

In [206]:
# Transpose H so each row is a document and each column a topic weight,
# then wrap the array in a DataFrame.
descriptions = H.T
des = pd.DataFrame(descriptions)

In [207]:

# Dominant topic per document: idxmax(axis=1) returns, for every row, the
# column label holding that row's largest weight. (axis=0 would instead give
# one row label per topic column, which only fills the first few rows.)
# Bookkeeping columns are excluded so re-running the cell gives the same result.
topic_weights = des.drop(columns=["max_feat", "target"], errors="ignore")
des["max_feat"] = topic_weights.idxmax(axis=1)

In [208]:
# Attach the label column next to the per-document topic weights.
# NOTE(review): the assignment aligns on index — this assumes target["fraud"]
# shares des's default 0..n-1 index; confirm, otherwise rows become NaN.
des["target"] = target["fraud"]


In [209]:
# Feature matrix for the classifier: the topic weights only, without the
# "max_feat" and "target" helper columns.
descrips = des.drop(columns=["max_feat", "target"])

In [210]:
# Hold out 30% of the documents for evaluation; the split is seeded so it is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    descrips,
    target['fraud'],
    test_size=0.3,
    random_state=42,
)

In [211]:
# Fit a logistic regression on the NMF topic weights of the training split.
lr = LogisticRegression(random_state=42, verbose=1)
lr.fit(X_train, y_train);

[LibLinear]



In [212]:
# Predicted labels for the held-out documents.
yhats = lr.predict(X_test)

In [213]:
# Accuracy on the held-out set: the fraction of predictions matching the labels.
(y_test == yhats).mean()

0.907717340771734

In [214]:
# Fitted coefficients of the logistic regression, one per topic-weight column.
lr.coef_

array([[-0.58050531, -0.97416613,  0.82823368, -3.29661609, -2.50583237,
         2.28757118, -1.59572779, -0.4543123 , -7.04081796, -3.19072525,
        -1.93208437, -2.88844962, -1.18285352, -2.22946043, -2.16803206,
        -2.14209495,  4.88235658, -2.98468976, -1.18357188, -2.00726382]])

In [215]:
# Label each row of W (one row per vocabulary term) with the term itself.
# get_feature_names() was removed from scikit-learn; prefer its replacement
# get_feature_names_out() and fall back only on old releases.
if hasattr(tf, "get_feature_names_out"):
    feats = tf.get_feature_names_out()
else:
    feats = tf.get_feature_names()
# Build the frame with the terms as a "names" index in one step (no inplace).
features = pd.DataFrame(W, index=pd.Index(feats, name="names"))

In [358]:
# Scale every term's topic loadings by the classifier coefficient of the
# matching topic, then sum across topics into a single score per term.
weighted = features * lr.coef_
coeffs = weighted.assign(total=weighted.sum(axis=1))
words_coeffs_dict = coeffs["total"].to_dict()

In [247]:
# Term -> total score mapping as a plain dict.
# NOTE(review): this repeats the last statement of the preceding cell; one of
# the two can go.
words_coeffs_dict = coeffs["total"].to_dict()

In [298]:

# Persist the term -> score mapping so it can be reused without refitting.
# NOTE(review): this writes 'data/words_coeffs_dict' (no extension), whereas
# words_coeff() reads 'data/words_coeffs_dict.txt' — confirm which file is meant.
with open('data/words_coeffs_dict', 'wb') as sweet:
    pickle.dump(words_coeffs_dict, sweet)

In [333]:
# Read the pickled term -> score mapping back in.
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open('data/words_coeffs_dict', 'rb') as sweet:
        wcd = pickle.load(sweet)

In [347]:
def words_coeff(list_words, dict_path='data/words_coeffs_dict.txt', wcd=None):
    """Score a comma-separated string of words with the stored word coefficients.

    Parameters
    ----------
    list_words : str
        Comma-separated words, e.g. ``" party, free,"``. ``None`` and NaN are
        treated as text without words.
    dict_path : str, optional
        Pickle file holding the word -> coefficient dict. Only read when
        ``wcd`` is not supplied.
    wcd : dict, optional
        Already-loaded word -> coefficient mapping. Pass it when scoring many
        texts so the pickle is not re-read on every call.

    Returns
    -------
    number
        Sum over the distinct words of ``coefficient * occurrences``. Words
        missing from the mapping contribute nothing; the result is ``0`` when
        no word matches.
    """
    from collections import Counter  # stdlib; kept local so the cell is self-contained

    # Missing text (None / NaN) has no words to score.
    if list_words is None or (isinstance(list_words, float) and list_words != list_words):
        return 0
    if wcd is None:
        # bring in dictionary
        # NOTE: pickle.load can execute arbitrary code - only use trusted files.
        with open(dict_path, 'rb') as handle:
            wcd = pickle.load(handle)
    total = 0
    # count number of each word, then weight each count by the word's coefficient
    for raw_word, count in Counter(list_words.split(",")).items():
        word = raw_word.strip()
        if word in wcd:
            total += wcd[word] * count
    return total
        
    
    

In [364]:
# Spot check: score the entry labelled 10 in `lemmed` and show its type.
words_coeff(lemmed[10]), type(lemmed[10])

(-124.96758509416024, str)

In [299]:
# Load the pickled word -> coefficient mapping from the '.txt' file.
# NOTE(review): no visible cell writes 'data/words_coeffs_dict.txt' (the dump
# cell writes 'data/words_coeffs_dict') — confirm this file is current.
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open('data/words_coeffs_dict.txt', 'rb') as handle:
        wcd = pickle.load(handle)

In [336]:
# Look up the stored coefficient for a single word.
wcd['party']

2.330430140808307

## TEST with words_coeff + rest of df ModelLR

In [365]:
# Load a previously pickled model from the website folder.
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open('website/modelX2.pkl', 'rb') as handle:
        rff = pickle.load(handle)

In [367]:
# Inspect the loaded model's oob_score_ attribute.
# NOTE(review): presumably an out-of-bag estimate — the model class is not
# visible in this notebook; confirm.
rff.oob_score_

0.9741926483922717