In [3]:
import warnings

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics as mt
from sklearn.metrics import f1_score as f1
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import validation_curve
from sklearn.svm import SVC
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np

## Import dataset

In [21]:
# Training feature tables, one CSV per feature family, read in the listed order.
df_etd_training, df_ps_training, df_ngram_training, df_pos_training = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/chat_pattern/training/etd_pattern_training.csv",
        "../Data/chat_pattern/training/ps_pattern_training.csv",
        "../Data/chat_pattern/training/ngram_training.csv",
        "../Data/chat_pattern/training/pos_training.csv",
    )
]

# Testing feature tables, same four families as above.
df_etd_testing, df_ps_testing, df_ngram_testing, df_pos_testing = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/chat_pattern/testing/etd_pattern_testing.csv",
        "../Data/chat_pattern/testing/ps_pattern_testing.csv",
        "../Data/chat_pattern/testing/ngram_testing.csv",
        "../Data/chat_pattern/testing/pos_testing.csv",
    )
]

# Label tables (the y_ETD / y_PS columns are read from these further down).
df_label_training, df_label_testing = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/chat_pattern/chat_annotation_1000_pos.csv",
        "../Data/chat_pattern/chat_testing_200.csv",
    )
]

In [None]:
# Raw training matrices, one per feature family.
X_etd_train, X_ps_train, X_ngram_train, X_pos_train = [
    frame.values
    for frame in (df_etd_training, df_ps_training, df_ngram_training, df_pos_training)
]

# Two-family combinations: columns of the first family followed by the second.
X_etd_ngram_train = np.hstack([X_etd_train, X_ngram_train])
X_ps_ngram_train = np.hstack([X_ps_train, X_ngram_train])
X_etd_pos_train = np.hstack([X_etd_train, X_pos_train])
X_ps_pos_train = np.hstack([X_ps_train, X_pos_train])
X_ngram_pos_train = np.hstack([X_ngram_train, X_pos_train])

# Three-family combinations: pattern + n-gram + pos, in that column order.
X_all_etd_train = np.hstack([X_etd_train, X_ngram_train, X_pos_train])
X_all_ps_train = np.hstack([X_ps_train, X_ngram_train, X_pos_train])

In [83]:
# Raw test matrices, one per feature family.
X_etd_test = df_etd_testing.values
X_ps_test = df_ps_testing.values
X_ngram_test = df_ngram_testing.values
X_pos_test = df_pos_testing.values

# Two-family combinations, stacked column-wise in the same order as the
# corresponding training matrices so train/test columns line up.
X_etd_ngram_test = np.hstack((X_etd_test, X_ngram_test))
X_etd_pos_test = np.hstack((X_etd_test, X_pos_test))
X_ngram_pos_test = np.hstack((X_ngram_test, X_pos_test))
X_ps_ngram_test = np.hstack((X_ps_test, X_ngram_test))
X_ps_pos_test = np.hstack((X_ps_test, X_pos_test))

# Three-family combinations: pattern + n-gram + pos.
X_all_etd_test = np.hstack((X_etd_ngram_test, X_pos_test))
# Must start from the PS pattern + n-gram stack. Starting from n-gram + pos
# would drop the PS pattern columns and duplicate pos, which no longer matches
# the layout of X_all_ps_train.
X_all_ps_test = np.hstack((X_ps_ngram_test, X_pos_test))

In [65]:
# Pull the two target columns out of each label table as NumPy arrays.
y_etd_train, y_ps_train = [
    df_label_training[target].values for target in ("y_ETD", "y_PS")
]
y_etd_test, y_ps_test = [
    df_label_testing[target].values for target in ("y_ETD", "y_PS")
]

## Model

In [131]:
def train_evaluate(X_train, y_train, X_test, y_test):
    """Grid-search an SVC's ``C`` on the training data and score it on the test data.

    Runs a 5-fold ``GridSearchCV`` over 20 evenly spaced ``C`` values in
    [0.001, 100], prints the best cross-validation score and parameters,
    predicts on ``X_test`` and prints precision, recall and F1 against
    ``y_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    X_test, y_test
        Held-out features and labels used for the reported metrics.

    Returns
    -------
    tuple
        ``(precision, recall, f1, y_test_model, yhat_model)``; the last two
        are lists of the true and the predicted test labels respectively.

    Notes
    -----
    * No ``scoring`` is passed to the grid search, so the printed
      "CV Train score" is the estimator's default score (accuracy for a
      classifier per the scikit-learn docs), not F1.
    * The metric functions are called with their defaults; this presumably
      expects binary labels -- TODO confirm against the label files.
    """
    param_grid = {'C': np.linspace(0.001, 100, 20)}

    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    print('CV Train score: {:.2f}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

    predictions = grid_search.predict(X_test)
    precision = mt.precision_score(y_test, predictions)
    recall = mt.recall_score(y_test, predictions)
    score = mt.f1_score(y_test, predictions)

    print("precision:",round(precision,3),"recall:",round(recall,3),"F1:",round(score,3))

    # Plain lists of true / predicted labels for downstream inspection;
    # list() keeps the element types that iterating the inputs would yield.
    y_test_model = list(y_test)
    yhat_model = list(predictions)

    return precision, recall, score, y_test_model, yhat_model

## Training

In [132]:
# Experiment 1 -- ETD labels, pattern features only
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_etd_train,
    y_train=y_etd_train,
    X_test=X_etd_test,
    y_test=y_etd_test,
)

CV Train score: 0.90
Best parameters: {'C': 5.264105263157894}
precision: 0.887 recall: 0.894 F1: 0.891


In [133]:
# Experiment 2 -- ETD labels, pos features only
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_pos_train,
    y_train=y_etd_train,
    X_test=X_pos_test,
    y_test=y_etd_test,
)

CV Train score: 0.69
Best parameters: {'C': 21.053421052631577}
precision: 0.782 recall: 0.735 F1: 0.758


In [134]:
# Experiment 3 -- ETD labels, n-gram features only
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_ngram_train,
    y_train=y_etd_train,
    X_test=X_ngram_test,
    y_test=y_etd_test,
)

CV Train score: 0.71
Best parameters: {'C': 5.264105263157894}
precision: 0.769 recall: 0.909 F1: 0.833


In [135]:
# Experiment 4 -- ETD labels, pattern + pos features
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_etd_pos_train,
    y_train=y_etd_train,
    X_test=X_etd_pos_test,
    y_test=y_etd_test,
)

CV Train score: 0.78
Best parameters: {'C': 10.527210526315788}
precision: 0.855 recall: 0.848 F1: 0.852


In [136]:
# Experiment 5 -- ETD labels, pattern + n-gram features
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_etd_ngram_train,
    y_train=y_etd_train,
    X_test=X_etd_ngram_test,
    y_test=y_etd_test,
)

CV Train score: 0.77
Best parameters: {'C': 5.264105263157894}
precision: 0.812 recall: 0.917 F1: 0.861


In [137]:
# Experiment 6 -- ETD labels, n-gram + pos features
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_ngram_pos_train,
    y_train=y_etd_train,
    X_test=X_ngram_pos_test,
    y_test=y_etd_test,
)

CV Train score: 0.73
Best parameters: {'C': 5.264105263157894}
precision: 0.787 recall: 0.841 F1: 0.813


In [138]:
# Experiment 7 -- ETD labels, pattern + n-gram + pos features
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_all_etd_train,
    y_train=y_etd_train,
    X_test=X_all_etd_test,
    y_test=y_etd_test,
)

CV Train score: 0.76
Best parameters: {'C': 5.264105263157894}
precision: 0.823 recall: 0.879 F1: 0.85


In [None]:
# Experiment 8 -- PS labels, pattern features only
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_ps_train,
    y_train=y_ps_train,
    X_test=X_ps_test,
    y_test=y_ps_test,
)

In [None]:
# Experiment 9 -- PS labels, pos features only
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_pos_train,
    y_train=y_ps_train,
    X_test=X_pos_test,
    y_test=y_ps_test,
)

In [None]:
# Experiment 10 -- PS labels, n-gram features only
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_ngram_train,
    y_train=y_ps_train,
    X_test=X_ngram_test,
    y_test=y_ps_test,
)

In [None]:
# Experiment 11 -- PS labels, pattern + pos features
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_ps_pos_train,
    y_train=y_ps_train,
    X_test=X_ps_pos_test,
    y_test=y_ps_test,
)

In [None]:
# Experiment 12 -- PS labels, pattern + n-gram features
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_ps_ngram_train,
    y_train=y_ps_train,
    X_test=X_ps_ngram_test,
    y_test=y_ps_test,
)

In [None]:
# Experiment 13 -- PS labels, n-gram + pos features
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_ngram_pos_train,
    y_train=y_ps_train,
    X_test=X_ngram_pos_test,
    y_test=y_ps_test,
)

In [None]:
# Experiment 14 -- PS labels, pattern + n-gram + pos features
(prec, recall, f1_scores, y_test_model, yhat_model) = train_evaluate(
    X_train=X_all_ps_train,
    y_train=y_ps_train,
    X_test=X_all_ps_test,
    y_test=y_ps_test,
)