## Requirements


In [1]:
# Unable warnings
import os
import warnings

warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


#### Directory adjustment


In [2]:
from pathlib import Path
import sys
import os
# Back to main folder
path = os.path.dirname(os.getcwd())+"/"
os.chdir(path)
sys.path.append(path)


#### Data Processing


In [3]:
# ETL
import numpy as np
import pandas as pd
from pandas import MultiIndex, Int64Index
# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold


#### Natural language processing


In [4]:
import gensim
import spacy
import re 

# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors


#### Models


In [5]:
# Pipe
from sklearn.pipeline import Pipeline
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from src.ModelAnalysis import ranking_recall

from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score


## Set and split train and test data


In [6]:
# Get data
df = pd.read_csv('data/corpus/augmented_corpus_fortuna.csv')

# Set target and features
target = 'label'
features = 'text_stop'

# Break apart dataset
X = df[features].values.astype('U')
y = df[target]


# Split train abd test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

# Set k-fold criteria
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)


# Class weights
pos = len(df.query('label==1'))
neg = len(df.query('label==0'))
weight_for_0 = (1 / neg) * (len(df) / 2.0)
weight_for_1 = (1 / pos) * (len(df) / 2.0)*1.1
class_weight = {0: weight_for_0, 1: weight_for_1}


## Model otimzation 

#### Base model


In [7]:
# Vectorizer
vectorizer = TfidfVectorizer(lowercase=False,
                             analyzer="word",
                             norm='l2',
                             ngram_range=(1, 2),
                             max_features=1500,
                             min_df=5)

classifier = LinearSVC(penalty='l2',
                       loss='squared_hinge',
                       dual=True,
                       tol=1e-6, C=1.1,
                       multi_class='crammer_singer',
                       fit_intercept=True,
                       intercept_scaling=1,
                       class_weight=class_weight,
                       random_state=42,
                       max_iter=1000)

# Pipe
ml_pipe = Pipeline([('vectorizer', vectorizer),
                    ('classifier', classifier)])

# Train
ml_pipe.fit(X_train, y_train)

# Predict
y_predict = ml_pipe.predict(X_test)

# Evaluate
report = classification_report(y_test, y_predict, output_dict=True)
pd.DataFrame(report)


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.838415,0.525105,0.706349,0.68176,0.73978
recall,0.707851,0.703081,0.706349,0.705466,0.706349
f1-score,0.76762,0.601198,0.706349,0.684409,0.715228
support,777.0,357.0,0.706349,1134.0,1134.0


In [8]:
print('precision', precision_score(y_test, y_predict))
print('accuracy', accuracy_score(y_test, y_predict))
print('recall', recall_score(y_test, y_predict))
print('auc', roc_auc_score(y_test, y_predict))
print('f1', f1_score(y_test, y_predict))

precision 0.5251046025104602
accuracy 0.7063492063492064
recall 0.7030812324929971
auc 0.7054659701718526
f1 0.6011976047904192


In [9]:
# from sklearn.ensemble import VotingClassifier

# # Vectorizer
# vectorizer = TfidfVectorizer(lowercase=False,
#                              analyzer="word",
#                              norm='l2',
#                              ngram_range=(1, 2),
#                              max_features=1500)

# # Models
# classifier_1 = LinearSVC()

# classifier_2 = DecisionTreeClassifier(
#     random_state=42, class_weight=class_weight)


# classifier_3 = RandomForestClassifier(
#     random_state=42, class_weight=class_weight)
# # Vote

# models_vote = VotingClassifier(
#     estimators=[('M1', classifier_1),
#                 ('M2', classifier_2),
#                 ('M3', classifier_3)],
#     voting='hard')

# # Pipe
# ml_pipe = Pipeline([('vectorizer', vectorizer),
#                     ('classifier', models_vote)])

# # Train
# ml_pipe.fit(X_train, y_train)

# # Predict
# y_predict = ml_pipe.predict(X_test)

# # Evaluate
# report = classification_report(y_test, y_predict, output_dict=True)
# pd.DataFrame(report)


In [10]:
# from sklearn.experimental import enable_halving_search_cv
# from sklearn.model_selection import HalvingGridSearchCV
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import cross_val_score
# from sklearn import metrics
# from sklearn.metrics import recall_score
# from sklearn.utils.fixes import loguniform

# # Set k-fold criteria
# k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
# scores = cross_val_score(ml_pipe, X, y, cv=k_fold)
# print(f"{scores.mean()} accuracy with a standard deviation of {scores.std()}")


In [11]:
# grid_params = {'classifier__C': [0.5, 1.0, 1.5, 2.0, 2.5, 3],
#                'classifier__penalty': ['l2'],
#                'classifier__loss': ['hinge', 'squared_hinge'],
#                'classifier__multi_class': ['ovr', 'crammer_singer'],
#                'classifier__class_weight': [{1: 1, 0: 1}, class_weight]}


In [12]:
# # Parameters search
# grid = HalvingGridSearchCV(ml_pipe, grid_params, cv=k_fold)
# grid.fit(X_train, y_train)
# print("Best Score:  ", grid.best_score_)
# # Pipe
# # scores = cross_val_score(grid, X, y, cv=k_fold)
# # print(f"{scores.mean()} accuracy with a standard deviation of {scores.std()}")


In [13]:
# # Predict
# y_predict = grid.predict(X_test)

# # Evaluate
# report = classification_report(y_test, y_predict, output_dict=True)
# pd.DataFrame(report)