In [30]:
import numpy as np
import pandas as pd
import sys
sys.path.append("../")
import spacy
from src.loader import TextLoader
from src.purifier import Purifier
from tqdm import tqdm

from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


In [3]:
# Column names of the discourse dataset, kept as constants for reuse below.
DISCOURSE_ID = "discourse_id"
DISCOURSE_START = "discourse_start"
DISCOURSE_END = "discourse_end"
DISCOURSE_TEXT = "discourse_text"
DISCOURSE_TYPE = "discourse_type"

# Read the training annotations, then cast the id and offset columns to int64.
df = pd.read_csv("../data/dataset/train.csv")
df = df.astype(
    dict.fromkeys((DISCOURSE_ID, DISCOURSE_START, DISCOURSE_END), np.int64)
)
df

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622627660524,8,229,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622627653021,230,312,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622627671020,313,401,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622627696365,402,758,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622627759780,759,886,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...,...
144288,4C471936CD75,1618153340639,2234,3203,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
144289,4C471936CD75,1618153383399,3221,4509,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
144290,4C471936CD75,1618024996127,4510,4570,it is better to seek multiple opinions instead...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
144291,4C471936CD75,1618025268756,4570,4922,The impact of asking people to help you make a...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [4]:
# Encoder used to turn the string discourse types into integer class labels
# (it is fitted in a later cell via `enc.fit_transform`).
enc = LabelEncoder()

In [12]:
# Raw text of every discourse segment. The column is addressed through the
# DISCOURSE_TEXT constant declared with the data-loading cell rather than a
# repeated string literal.
X = df[DISCOURSE_TEXT]

In [13]:
# Turn every discourse text into a feature vector: clean it with Purifier, run
# it through the spaCy pipeline and keep the resulting document vector.
nlp = spacy.load("en_core_web_lg")

# A fresh Purifier is created for each text, exactly as before.
# NOTE(review): Purifier is a project-local class whose internals are not
# visible here; if it is stateless a single shared instance would do — confirm.
purified_texts = (Purifier().purify(text) for text in X)

# nlp.pipe processes the texts as a batched stream instead of one nlp() call
# per text, and tqdm shows progress for this long-running step.
new_X = [doc.vector for doc in tqdm(nlp.pipe(purified_texts), total=len(X))]

# Every input text must have produced exactly one vector.
assert len(new_X) == len(X)
    

In [19]:
# Target labels: the discourse type of every segment, addressed through the
# DISCOURSE_TYPE constant declared with the data-loading cell.
y = df[DISCOURSE_TYPE]

In [20]:
# Fit the encoder on the discourse types and convert them to integer labels
# in a single step.
y_encoded = enc.fit_transform(y)

In [21]:
# Peek at the encoded labels.
y_encoded

array([4, 5, 3, ..., 5, 3, 1])

In [23]:
# Hold out 30% of the samples for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    new_X,
    y_encoded,
    test_size=0.3,
    random_state=42,
)

In [24]:
# Stand-alone XGBoost classifier. The seed is given through `random_state`
# only: passing a second, different `seed=` keyword next to it hands XGBoost
# two seed values and makes the effective one ambiguous.
# NOTE(review): no later visible cell uses `clf`, and its colsample_bytree (0.6)
# differs from the DiscourseClassifier default (0.7) — confirm whether this
# cell is still needed.
clf = XGBClassifier(random_state=42, colsample_bytree=0.6, subsample=0.7)

In [25]:
class DiscourseClassifier:
    """Thin wrapper around an ``XGBClassifier`` with this notebook's settings.

    All arguments are optional; calling ``DiscourseClassifier()`` uses the
    defaults listed below. Each value is stored on the instance and forwarded
    unchanged to ``XGBClassifier`` by :meth:`create_model`.

    Parameters
    ----------
    subsample : float, default 0.7
        Forwarded as ``XGBClassifier(subsample=...)``.
    colsample_bytree : float, default 0.7
        Forwarded as ``XGBClassifier(colsample_bytree=...)``.
    gamma : float, default 0
        Forwarded as ``XGBClassifier(gamma=...)``.
    reg_lambda : float, default 0
        Forwarded as ``XGBClassifier(reg_lambda=...)``.
    random_state : int, default 42
        The single seed handed to ``XGBClassifier``.

    Attributes
    ----------
    model
        The ``XGBClassifier`` instance built by :meth:`create_model`.
    """

    def __init__(self, subsample=0.7, colsample_bytree=0.7, gamma=0,
                 reg_lambda=0, random_state=42):
        # hyperparameters
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.gamma = gamma
        self.reg_lambda = reg_lambda
        self.random_state = random_state

        self.model = self.create_model()

    def create_model(self):
        """Build and return an unfitted ``XGBClassifier`` from the stored settings.

        Only ``random_state`` is passed as the seed. Supplying a different
        ``seed=`` keyword alongside it would give XGBoost two seed values and
        leave the effective one ambiguous.
        """
        return XGBClassifier(
            random_state=self.random_state,
            colsample_bytree=self.colsample_bytree,
            subsample=self.subsample,
            gamma=self.gamma,
            reg_lambda=self.reg_lambda,
        )

    def fit(self, X_train, y_train):
        """Train the wrapped model on ``X_train`` / ``y_train``; returns ``None``."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped model's predictions for ``X_test``."""
        return self.model.predict(X_test)

In [26]:
# Build the wrapper with its default hyperparameters, train it on the training
# split and predict labels for the held-out split.
# NOTE(review): "discource" is a misspelling of "discourse"; the name is kept
# because cells outside this view may reference it.
discource_clf = DiscourseClassifier()

discource_clf.fit(X_train, y_train)

y_pred = discource_clf.predict(X_test)





In [31]:
# Share of held-out samples whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.69725