In [1]:
import sys
sys.path.append('../..')
import pandas as pd
from src.preprocessing import Preprocessor
from src.feature_engineering import FeatureCreator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load only the two columns this notebook uses from the raw dataset.
raw_csv_path = '../../data/raw/Suicide_Detection.csv'
df = pd.read_csv(raw_csv_path, usecols=['text', 'class'])

# Encode the string labels as integers (suicide -> 1, non-suicide -> 0).
# Series.map leaves NaN for any label that is not a key of this mapping.
label_to_id = {'suicide': 1, 'non-suicide': 0}
df['class'] = df['class'].map(label_to_id)

In [3]:
# Features computed on the RAW text, before any cleaning is applied.
procedure_preclean = ["get_length", "get_exclamation_count"]

feature_creator_preclean = FeatureCreator(procedure=procedure_preclean)

# One fit() result per raw document, in the same order as df['text'].
length_exclamation = list(map(feature_creator_preclean.fit, df['text']))

# Each result is unpacked as a pair: the first element is collected into
# `length`, the second into `exclamation`.
# NOTE(review): presumably the pair follows the order of procedure_preclean
# — confirm against FeatureCreator.fit.
length, exclamation = [], []
for doc_length, doc_exclamation in length_exclamation:
    length.append(doc_length)
    exclamation.append(doc_exclamation)

In [4]:
# Bundle the pre-clean feature lists under the names 'length' and 'exclamation'.
additional_feature_preclean = dict(length=length, exclamation=exclamation)

In [5]:
# Candidate preprocessing pipelines, keyed by a variant id ("1".."7").
# Every variant shares the same head and tokenisation steps; they differ in
# the emoji step and in how many trailing steps are applied.

# Steps applied first in every variant.
_common_head = ['lower', 'remove_punctuation', 'remove_links', 'remove_numbers']
# Steps that follow the emoji step in every variant.
_tokenize_steps = ['tokenize', 'remove_stopwords']
# Trailing steps; variants "2".."5" use progressively shorter prefixes.
_tail_steps = ['stem', 'lemmatize', 'remove_short_words', 'remove_long_words',
               'shorten_text']

preprocessing_config = {
    # "1" is the only variant that removes emoji instead of translating them.
    "1": _common_head + ['remove_emoji'] + _tokenize_steps + _tail_steps,
    "2": _common_head + ['translate_emoji'] + _tokenize_steps + _tail_steps,
    "3": _common_head + ['translate_emoji'] + _tokenize_steps + _tail_steps[:4],
    "4": _common_head + ['translate_emoji'] + _tokenize_steps + _tail_steps[:3],
    "5": _common_head + ['translate_emoji'] + _tokenize_steps + _tail_steps[:2],
    # "6" and "7" isolate stemming and lemmatisation respectively.
    "6": _common_head + ['translate_emoji'] + _tokenize_steps + ['stem'],
    "7": _common_head + ['translate_emoji'] + _tokenize_steps + ['lemmatize'],
}

In [6]:
# Feature-set variants, keyed "1".."3": both features, then each one alone.

# Features derived from the raw text.
# NOTE(review): the names here ('length', 'exclamation_count') differ from the
# procedure names ("get_length", "get_exclamation_count") and from the
# 'exclamation' key used in earlier cells — confirm which spelling the
# consumer of this config expects.
feature_engineering_preclean_config = {
    str(variant): features
    for variant, features in enumerate(
        (['length', 'exclamation_count'], ['length'], ['exclamation_count']),
        start=1,
    )
}

# Features derived from the cleaned text (FeatureCreator procedure names).
feature_engineering_postclean_config = {
    str(variant): features
    for variant, features in enumerate(
        (['get_sentiment', 'get_keyword'], ['get_sentiment'], ['get_keyword']),
        start=1,
    )
}

In [27]:
# Candidate classifiers, keyed by the display name printed during evaluation.
models_config = {
    # MultinomialNB was disabled in the original notebook; the reason is not
    # recorded here.
    # "naive_bayes": MultinomialNB(),
    "svm": SVC(),
    "knn": KNeighborsClassifier(n_neighbors=3),
    # Seeded so the forest's bootstrap/feature sampling is repeatable; without
    # a seed its scores change on every run. 42 matches the seed used for the
    # train/test split.
    "RandomForest": RandomForestClassifier(random_state=42),
}

In [9]:
# --- Text cleaning -----------------------------------------------------------
# Run pipeline variant "1" over every raw document.
preprocessing = Preprocessor(procedure=preprocessing_config['1'])
corpus = df['text'].apply(preprocessing.fit)

# Keep only documents whose cleaned result is truthy (non-empty), and keep the
# labels aligned with them by applying the same boolean mask.
has_content = corpus.astype(bool)
y = df.loc[has_content, 'class']
corpus = corpus.loc[has_content]
print('Preprocessing done!')

# --- Features computed on the cleaned text -----------------------------------
feature_creator_postclean = FeatureCreator(procedure=feature_engineering_postclean_config['1'])
# NOTE(review): presumably fits the vectorizer that the keyword feature relies
# on — confirm against FeatureCreator.fit_vectorizer.
feature_creator_postclean.fit_vectorizer(corpus)
print('Vectorizer ready!')

# One fit() result per cleaned document; each is unpacked as a pair whose
# first element is collected into `sentiment` and second into `keyword`.
sentiment_keyword = list(map(feature_creator_postclean.fit, corpus))
sentiment, keyword = [], []
for doc_sentiment, doc_keyword in sentiment_keyword:
    sentiment.append(doc_sentiment)
    keyword.append(doc_keyword)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing done!
Vectorizer ready!


In [28]:
# NOTE(review): `df_x` is not defined in any visible cell of this notebook; it
# must come from a cell that was deleted or run out of order. Restore its
# construction so the notebook survives Restart & Run All.

# Separate the predictors from the label once, so the hold-out split and the
# cross-validation below are guaranteed to use the same feature matrix.
features = df_x.drop('class', axis=1)
target = df_x['class']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

for model_name, model in models_config.items():
    print(f"Model: {model_name}")
    model.fit(X_train, y_train)
    print(f"Training score: {model.score(X_train, y_train)}")
    print(f"Testing score: {model.score(X_test, y_test)}")
    # Cross-validate on the predictors only. Passing the full `df_x` here would
    # include the 'class' column among the features and leak the target,
    # inflating the reported scores.
    print(f"Cross validation score: {cross_val_score(model, features, target, cv=5)}")
    print("\n")

Model: svm
