In [11]:
import pandas as pd
import pickle
import spacy
import re
import unicodedata
import sys
import csv
from spacy import displacy
from tqdm import tqdm

In [23]:
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import CountVectorizer
import scipy as sp
import numpy as np
from collections import Counter

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [24]:
from sklearn.feature_extraction import DictVectorizer

## Pull sentences from txt files

In [12]:
# cDF = pd.DataFrame()
# cDF = pd.read_csv('informalSentences.txt', sep="\n", header=None, quoting=csv.QUOTE_NONE)
# cDF.columns = ["text"]
# cDF["isFormal"] = [0 for i in range(len(cDF["text"]))] #Label
# cDF["text"] = [str(i) if str(i)[0] != " " else str(i)[1::] for i in cDF["text"]] #Clean up prepended spaces

In [13]:
# cDF.head()

In [14]:
# fDF = pd.DataFrame()
# fDF = pd.read_csv('formalSentences.txt', sep="\n", header=None, quoting=csv.QUOTE_NONE)
# fDF.columns = ["text"]
# fDF["isFormal"] = [1 for i in range(len(fDF["text"]))] #Label
# fDF["text"] = [str(i) if str(i)[0] != " " else str(i)[1::] for i in fDF["text"]] #Clean up prepended spaces

In [15]:
# fDF.head()

In [16]:
# df = fDF.append(cDF)

Shuffle dataframe

In [17]:
# df = df.sample(frac=1)
# len(df)

In [18]:
# df.head()

In [20]:
# Load spaCy's small English pipeline once; vectorize() runs it on every sentence to get POS tags.
nlp = spacy.load('en_core_web_sm')

In [21]:
# with open("df.pkl","wb") as f:
#     pickle.dump(df, f)

We pickle the combined, shuffled DataFrame so that the text files do not have to be re-read and re-labelled on every run.

In [22]:
# Restore the cached DataFrame; the cells below read its "text" and "isFormal" columns.
# NOTE(review): pickle.load can execute arbitrary code - only load df.pkl from a trusted source.
with open("df.pkl", "rb") as f:
    df = pickle.load(f)

## Processing (Feature extraction)

In [25]:
# Restore the two artifacts that fix the feature layout:
#   column  - converted to the list `cols`, the feature column names used by vectorize()
#   lexicon - the fixed vocabulary handed to CountVectorizer
# NOTE(review): pickle.load can execute arbitrary code - only load these files from a trusted source.
with open("cols.pkl","rb") as f:
    column = pickle.load(f)
with open("lexicon.pkl","rb") as f:
    lexicon = pickle.load(f)

In [26]:
# Feature column names as a plain Python list; vectorize() uses them to fix the layout of the feature vector.
cols = column.tolist()


def counts(doc):
    """Return the relative frequency of every POS tag found in ``doc``.

    ``doc`` is any iterable of tokens exposing a ``pos_`` attribute. The
    result always contains the 17 tags listed below (0 when a tag does not
    occur); any other tag present in ``doc`` is appended after them. Each
    non-zero value is the tag's count divided by the total token count.
    """
    tag_totals = Counter(token.pos_ for token in doc)
    n_tokens = sum(tag_totals.values())

    # Seed the known tags with 0 so they are always present, in a fixed order.
    freqs = dict.fromkeys(
        ("ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN", "NUM",
         "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"),
        0,
    )
    # An empty doc never enters this loop, so n_tokens == 0 is never divided by.
    for tag in tag_totals:
        freqs[tag] = tag_totals[tag] / n_tokens
    return freqs

# NOTE(review): colsS is not referenced anywhere else in the visible notebook.
colsS = set(cols)

# Bag-of-words counter restricted to the fixed lexicon (about 1200 words chosen during
# the exploratory process), so the vocabulary does not depend on the text being vectorized.
cV = CountVectorizer(vocabulary=lexicon)
def vectorize(sentence):
    """Convert one sentence into a fixed-layout ``uint8`` feature vector.

    Features, in output order:
      * every name in the module-level ``cols`` list (0 unless set below);
      * POS-tag ratios from ``counts()``, scaled by 250 so they fit in 8 bits;
      * ``numWords`` (whitespace-separated word count) and ``wordLength``
        (mean characters per word, 0 for an empty sentence);
      * one bag-of-words count per ``lexicon`` entry.
    A feature whose name is already in ``cols`` keeps that position; any
    other feature is appended in the order listed above. The ``SPACE`` tag
    feature is always removed.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``uint8``. Values are saturated to 0-255 before
        the cast instead of wrapping around (e.g. more than 255 words).
    """
    doc = nlp(sentence)
    words = sentence.split()
    numWords = len(words)
    wordLength = sum(len(w) for w in words) / max(numWords, 1)

    # The vocabulary is fixed, so nothing has to be (re)fitted per sentence.
    bow = cV.transform([sentence]).toarray()[0]

    # An insertion-ordered dict stands in for the former one-row DataFrame:
    # existing keys keep their position, new keys are appended - the same
    # layout rules as DataFrame column assignment, without the per-column
    # insertion cost. Assumes the names in ``cols`` are unique.
    features = dict.fromkeys(cols, 0)
    for tag, ratio in counts(doc).items():
        features[tag] = ratio * 250  # ratio is in [0, 1] -> 0-250
    features["numWords"] = numWords
    features["wordLength"] = wordLength

    # The amount of whitespace should not influence the model (this reduced
    # overfitting in practice); tolerate the feature being absent.
    features.pop("SPACE", None)

    for idx, hits in enumerate(bow):
        features[lexicon[idx]] = hits

    values = np.array(list(features.values()), dtype=np.float64)
    # Saturate before narrowing: an out-of-range cast to uint8 would wrap or overflow.
    return np.clip(values, 0, 255).astype(np.uint8)  # 8 bits per feature instead of 64
    

In [27]:
vectorize("Hello Hello world world world world")

array([6, 5, 0, ..., 0, 0, 0], dtype=uint8)

For now we sample only 3% of the DataFrame, so that feature extraction takes a few hours in the background instead of days. This also greatly reduces the size of the resulting array.

In [29]:
# Fixed seed so the 3% subsample - and everything trained on it - is the same after a kernel restart.
tiny = df.sample(frac=0.03, random_state=1)

In [30]:
# Vectorize every sampled sentence. This is the expensive step: the recorded run below
# took a little over 3 hours for 26920 sentences.
train = [vectorize(i) for i in tqdm(tiny["text"])]

100%|██████████████████████████████████████████████████████████████████████████| 26920/26920 [3:10:19<00:00,  2.36it/s]


In [31]:
# Sanity check: "<number of samples>*<features per sample>".
f"{len(train)}*{len(train[0])}"

'26920*1256'

In [33]:
# Labels for the sampled rows as uint8, in the same row order as `train` (both come from `tiny`).
isFormal = tiny["isFormal"].to_numpy().astype(np.uint8)

In [34]:
len(isFormal)

26920

In [36]:
import scipy.sparse as sparse
# Cache the extracted features so the slow vectorization step need not be repeated.
# The feature matrix is written as a CSR sparse matrix; the labels are written as-is.
with open("train.pkl","wb") as f:
    pickle.dump(sparse.csr_matrix(train),f)
with open("isFormal.pkl","wb") as f:
    pickle.dump(isFormal,f)

In [37]:
# Hold out 30% of the vectors for evaluation; random_state pins the split.
XTrain, XTest, yTrain, yTest = train_test_split(train, isFormal, test_size=0.3, random_state=1)

In [38]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Model

In [39]:
# SVC on standardized features. probability=True is required so this member
# can supply class probabilities to the soft-voting ensemble.
clfP = make_pipeline(
    StandardScaler(),
    SVC(probability=True, random_state=0),
)

In [40]:
clfP

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(probability=True, random_state=0))])

In [41]:
# Linear SVM on standardized features.
# NOTE(review): clfL is built here but is not one of the estimators passed to the VotingClassifier.
clfL = make_pipeline(StandardScaler(), LinearSVC(random_state=0))

In [42]:
clfL

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc', LinearSVC(random_state=0))])

In [43]:
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB
# Complement Naive Bayes member of the ensemble, with default hyperparameters.
clfNB = ComplementNB()

In [44]:
# Multinomial Naive Bayes member of the ensemble, with default hyperparameters.
clfMNB = MultinomialNB()

In [47]:
from sklearn.tree import DecisionTreeClassifier
# Decision-tree member of the ensemble; random_state makes the fitted tree repeatable.
clfDT = DecisionTreeClassifier(random_state=0)
from sklearn.ensemble import VotingClassifier

In [48]:
# Soft-voting ensemble over the SVC pipeline, both Naive Bayes models and the decision tree.
# The estimator names are kept exactly as written: they are the keys under which the
# fitted members are stored in the pickled model.
voterS = VotingClassifier(
    voting='soft',
    estimators=[
        ("SVC", clfP),
        ("Naive Bayes", clfNB),
        ("Multinomial: ", clfMNB),
        ("Decision Tree: ", clfDT),
    ],
)

Soft voting is chosen specifically so that the ensemble can output class probabilities (via `predict_proba`) rather than only hard 0/1 labels.

In [49]:
# Fit every ensemble member on the training split.
voterS.fit(XTrain,yTrain)

VotingClassifier(estimators=[('SVC',
                              Pipeline(steps=[('standardscaler',
                                               StandardScaler()),
                                              ('svc',
                                               SVC(probability=True,
                                                   random_state=0))])),
                             ('Naive Bayes', ComplementNB()),
                             ('Multinomial: ', MultinomialNB()),
                             ('Decision Tree: ',
                              DecisionTreeClassifier(random_state=0))],
                 voting='soft')

In [50]:
# Persist the fitted ensemble for deployment. Whoever loads it must build inputs with the
# same vectorize() feature layout that was used for training.
with open("voterS.pkl", "wb") as f:
    pickle.dump(voterS, f)

## Evaluating

In [51]:
# Hard 0/1 predictions for the held-out split.
yPred = voterS.predict(XTest)

In [52]:
# Per-class precision / recall / F1 on the held-out split, followed by the
# confusion matrix (rows: true label, columns: predicted label).
print(classification_report(yTest,yPred))
print("---")
print(confusion_matrix(yTest,yPred))

              precision    recall  f1-score   support

           0       0.97      0.76      0.86      5337
           1       0.68      0.96      0.79      2739

    accuracy                           0.83      8076
   macro avg       0.82      0.86      0.82      8076
weighted avg       0.87      0.83      0.83      8076

---
[[4081 1256]
 [ 115 2624]]


That's pretty good (83% accuracy on the held-out split)! The commented-out code below was previously used to evaluate `predict_proba`. I found that when P(informal) > 0.9, the accuracy was very high.

In [53]:
#tpConf = [max(voterS.predict_proba([i])[0]) for i in tX]
#tyPred = voterS.predict(tX)
#tpSucc = ["Yes" if tyPred[i] == sY[i] else "No" for i in range(len(sY))]
#s = 0
#for i in range(len(tpSucc)):
#    print("Confidence: " + str(int(tpConf[i] * 1000)/10) + "%. Was correct: " + str(tpSucc[i]))

In [54]:
voterS.predict_proba([vectorize("Hey, how are you?")])

array([[9.99828863e-01, 1.71136694e-04]])

In [55]:
voterS.predict_proba([vectorize("Abraham lincoln was born on the 5th of May.")])

array([[0.03933179, 0.96066821]])

In [64]:
voterS.predict_proba([vectorize("He was born in May.")])

array([[0.87641197, 0.12358803]])

In [66]:
voterS.predict_proba([vectorize("They were born in May.")])

array([[0.61691521, 0.38308479]])