In [20]:
import sys
sys.path.append('../..')
import pickle
import numpy as np
import pandas as pd
import json
from src.preprocessing import Preprocessor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

In [21]:
df = pd.read_csv('../../data/prepared/prepared.csv', index_col=0)
df['corpus'] = df['corpus'].apply(lambda x: x[1:-1].replace("'", "").split(', '))

In [22]:
df_feature = pd.read_csv('../../data/prepared/new_features.csv', index_col=0)

In [25]:
df_feature['keywords'] = df_feature['keywords'].apply(lambda x: x[1:-1].replace("'", "").split(', '))

In [27]:
df['corpus'] = df['corpus'].apply(lambda x: ' '.join(x))
df_feature['keywords'] = df_feature['keywords'].apply(lambda x: ' '.join(x))

X_train, X_test, y_train, y_test = train_test_split(df['corpus'], df['class'].values, test_size=0.2, random_state=0)

In [28]:
vectorizer = TfidfVectorizer(min_df=4, lowercase=False)
X_train_vec = vectorizer.fit_transform(X_train)

In [29]:
df_feature_train = df_feature[df_feature.index.isin(pd.DataFrame(X_train).index)]
df_feature_test = df_feature[df_feature.index.isin(pd.DataFrame(X_test).index)]

In [27]:
clf_naive_bayes = MultinomialNB(alpha=0.1)
        
cross = cross_val_score(clf_naive_bayes, X_train_vec, y_train, cv=5).mean()
print(f'Cross validation Naive Bayes without additional features: {cross}')

Cross validation Naive Bayes without additional features: 0.915045945234293


In [13]:
clf_logistic_regression = LogisticRegression(
            C=3, max_iter=100,
            solver='sag',
            random_state=0,
            multi_class='multinomial')

cross = cross_val_score(clf_logistic_regression, X_train_vec, y_train, cv=5).mean()
print(f'Cross validation Logistic Regression without additional features: {cross}')

Cross validation Logistic Regression without additional features: 0.939957324457756


Additional features

In [31]:
sentiment_train = np.array(df_feature_train['compound'].values.reshape(-1, 1))+np.abs(df_feature_train['compound'].min())
length_train = np.array(df_feature_train['length'].values.reshape(-1, 1))/df_feature_train['length'].max()

length_sparse_train = csr_matrix(length_train)
sentiments_sparse_train = csr_matrix(sentiment_train)
key_words_sparse_train = vectorizer.transform(df_feature_train['keywords'])

X_with_sentiments = hstack([sentiments_sparse_train, X_train_vec])
X_with_length = hstack([length_sparse_train, X_train_vec])
X_with_keywords = hstack([key_words_sparse_train, X_train_vec])
X_with_sentiments_length = hstack([sentiments_sparse_train, length_sparse_train, X_train_vec])
X_with_sentiments_keywords = hstack([sentiments_sparse_train, key_words_sparse_train, X_train_vec])
X_with_all = hstack([sentiments_sparse_train, length_sparse_train, key_words_sparse_train, X_train_vec])

In [48]:
datas = [X_train_vec, X_with_sentiments, X_with_length, X_with_keywords, X_with_sentiments_length, X_with_sentiments_keywords, X_with_all]
names = ['none','sentiments', 'length', 'keywords', 'sentiments-length', 'sentiments-keywords', 'all']

In [49]:
for data, name in zip(datas, names):
    cross = cross_val_score(clf_naive_bayes, data, Y, cv=5).mean()
    print(f'Cross validation Naive Bayes with {name}: {cross}')

Cross validation Naive Bayes with none: 0.915045945234293
Cross validation Naive Bayes with sentiments: 0.9158756086760598
Cross validation Naive Bayes with length: 0.9152075664749318
Cross validation Naive Bayes with keywords: 0.9063021903298528
Cross validation Naive Bayes with sentiments-length: 0.9159833559763235
Cross validation Naive Bayes with sentiments-keywords: 0.9068247656503983
Cross validation Naive Bayes with all: 0.9068463157780107


In [50]:
for data, name in zip(datas, names):
    cross = cross_val_score(clf_logistic_regression, data, Y, cv=5).mean()
    print(f'Cross validation Logistic Regression with {name}: {cross}')

Cross validation Logistic Regression with none: 0.939957324457756
Cross validation Logistic Regression with sentiments: 0.9399788757463412
Cross validation Logistic Regression with length: 0.9399411621160096
Cross validation Logistic Regression with keywords: 0.9320647684654098
Cross validation Logistic Regression with sentiments-length: 0.9399788757463414
Cross validation Logistic Regression with sentiments-keywords: 0.9321240306270158
Cross validation Logistic Regression with all: 0.9321671300115104


In [57]:
def adjust_sentiment(text, sentiment, help_words):
    if any(word in (text) for word in help_words): 
        return sentiment*0.5
    return sentiment

In [58]:
df_adjusted = pd.concat(
    [X_train,
    df_feature_train['compound']],
    axis=1
)

In [59]:
help_words = [
'help','suicide','plz', 'cyanide','ibuprofen','charcoal','euthanasia','survivor','please',
'unimportant', 'insulin','support', 'urgent', 'emergency']

adjusted_compounds = df_adjusted.apply(lambda row: adjust_sentiment(row['corpus'], row['compound'], help_words), axis=1)

sentiment_train = np.array(adjusted_compounds.values.reshape(-1, 1))+np.abs(adjusted_compounds.min())
sentiments_sparse_train = csr_matrix(sentiment_train)
X_with_sentiments_train = hstack([sentiments_sparse_train, X_train_vec])

cross = cross_val_score(clf_logistic_regression, X_with_sentiments_train, Y, cv=5).mean()
print(f'Cross validation Logistic Regression with: {cross}')

Cross validation Logistic Regression with: 0.9401943706371121


In [18]:
a=cross_val_score(clf_logistic_regression, X_with_sentiments_train, Y, cv=10)
a, a.mean()

(array([0.94348669, 0.93966167, 0.94046978, 0.94046978, 0.94197824,
        0.94068527, 0.93826096, 0.94046978, 0.93885028, 0.94283713]),
 0.9407169576953403)

In [60]:
length_test = np.array(df_feature_test['length'].values.reshape(-1, 1))/df_feature_test['length'].max()
length_sparse_test = csr_matrix(length_test)

In [236]:
X_with_sentiments_train = hstack([sentiments_sparse_train, length_sparse_train, X_train_vec])

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

adjusted = pd.concat(
    [X_test,
    df_feature_test['compound']],
    axis=1
)

adjusted_compounds = adjusted.apply(lambda row: adjust_sentiment(row['corpus'], row['compound'], help_words), axis=1)

sentiment_test = np.array(adjusted_compounds.values.reshape(-1, 1))+np.abs(adjusted_compounds.min())
sentiments_sparse_test = csr_matrix(sentiment_test)

X_with_sentiments_test = hstack([sentiments_sparse_test, length_sparse_test, X_test_vec])

In [259]:
clf_logistic_regression.fit(X_with_sentiments_train, y_train)
clf_logistic_regression.score(X_with_sentiments_test, y_test)

0.9418597133929534

In [64]:
with open('../../models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('../../models/clf_logistic_regression.pkl', 'wb') as f:
    pickle.dump(clf_logistic_regression, f)

In [65]:
y_pred = clf_logistic_regression.predict(X_with_sentiments_test)

In [66]:
confusion_matrix(y_test, y_pred)

array([[22134,  1167],
       [ 1531, 21573]], dtype=int64)

In [67]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.95      0.94     23301
           1       0.95      0.93      0.94     23104

    accuracy                           0.94     46405
   macro avg       0.94      0.94      0.94     46405
weighted avg       0.94      0.94      0.94     46405

