This notebook is the pre construct of the ML pipeline of the main database

Importing the libraries needed and the dataframes

In [21]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
import sqlite3

def load_data():
    '''loading the messages database'''
    conn = sqlite3.connect('Messages.db')
    df = pd.read_sql('SELECT * FROM Messages', conn)
    df = df.drop(columns=['index'])
    X = df['message'].values
    y= df[df.columns.difference(['message','genre_news','genre_social'])]
    return X,y;

X, y = load_data()
X.shape

(19883,)

In [22]:
y.shape

(19883, 34)

At a first glance it seems better to treat each message as a document and build a document-term matrix, we may however end up with a matrix with too many columns, but we'll evaluate this later on. But first we'll clean the text: Normalize followed by tokenize then removing stop words and finally lemmatize

In [23]:
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

def tokenize(text):
    # normalize case, remove punctuation and numbers
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize and remove stop words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    #lemmatize verbs
    tokens = [lemmatizer.lemmatize(word, pos='v') for word in tokens]
    
    #lemmatize adjectives
    tokens = [lemmatizer.lemmatize(word, pos='a') for word in tokens]
    
    #lemmatize adverbs
    tokens = [lemmatizer.lemmatize(word, pos='r') for word in tokens]
    
    

    return tokens

In [26]:
def model_pipeline():
    pipeline = Pipeline([
        ('vect',CountVectorizer(tokenizer=tokenize)),
        ('tfidf',TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))
    ])
    
    return pipeline

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

random_state=42
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=random_state)

model = model_pipeline()
model.fit(X_train, y_train)
#y_pred = model.predict(X_test)


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

In [28]:
y_pred = model.predict(X_test)

In [81]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

auc = []
for i in range (0,y_test.shape[1]):
    auc.append(roc_auc_score(y_test.iloc[:,i],y_pred[:,i]))
    

import statistics
print('Mean AUC: ',statistics.mean(auc),'Max AUC:', max(auc),'Min AUC:', min (auc))

Mean AUC:  0.6069162910334058 Max AUC: 0.897529718541132 Min AUC: 0.49933704587642536


In [54]:
cm_y1 = confusion_matrix(y_test.iloc[:,0],y_pred[:,0])
cm_y2 = confusion_matrix(y_test.iloc[:,1],y_pred[:,1])
#X = document_term_matrix(df['message'])
print(cm_y1)
print(cm_y2)

[[3917    1]
 [  59    0]]
[[1206  672]
 [ 417 1682]]


In [56]:

cr_y1 = classification_report(y_test.iloc[:,0],y_pred[:,0])
cr_y2 = classification_report(y_test.iloc[:,1],y_pred[:,1])

print(cr_y1)
print(cr_y2)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3918
           1       0.00      0.00      0.00        59

    accuracy                           0.98      3977
   macro avg       0.49      0.50      0.50      3977
weighted avg       0.97      0.98      0.98      3977

              precision    recall  f1-score   support

           0       0.74      0.64      0.69      1878
           1       0.71      0.80      0.76      2099

    accuracy                           0.73      3977
   macro avg       0.73      0.72      0.72      3977
weighted avg       0.73      0.73      0.72      3977



In [None]:
#X.shape


In [None]:
#adding the genre dummies columns to the document term matrix we have just created
#X_train = hstack((X,np.array(df['genre_news'])[:,None]))
#X_train = hstack((X_train,np.array(df['genre_social'])[:,None]))
#X_train.shape#old shape 19930,23371

In [None]:
#creating our target dataframe
#y =  df.drop(columns=['genre_news','genre_social','message'])
#y.shape