## **Preprocessing**

In [10]:
#**Preprocessing**

import json
import pandas as pd

def load_domain_from_json(path):
    """Load a JSON-lines file of labelled instances into a DataFrame.

    Every non-blank line of the file is parsed as one JSON object that must
    provide the keys ``"id"``, ``"text"`` and ``"label"``. The ``"text"``
    value is iterated, each element is converted with ``str`` and the pieces
    are joined with single spaces, so every document becomes one
    space-separated string.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON-lines file.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``text`` and ``label``; one row per parsed line, in
        file order.

    Raises
    ------
    KeyError
        If a record lacks one of the three expected keys.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Explicit encoding keeps parsing independent of the platform default;
    # blank lines are skipped because json.loads rejects empty input.
    with open(path, "r", encoding="utf-8") as file:
        domain = [json.loads(line) for line in file if line.strip()]

    return pd.DataFrame({
        "id": [instance["id"] for instance in domain],
        "text": [" ".join(map(str, instance["text"])) for instance in domain],
        "label": [instance["label"] for instance in domain],
    })

# Read both labelled training domains, then preview the first rows of each
# frame (one print call; the newline separator keeps the output layout).
domain1 = load_domain_from_json("data/domain1_train_data.json")
domain2 = load_domain_from_json("data/domain2_train_data.json")
print(domain1.head(), domain2.head(), sep="\n")

# Load the unlabelled test set: one JSON object per line with keys "id" and
# "text". Explicit encoding keeps parsing independent of the platform default,
# and blank lines are skipped because json.loads rejects empty input.
with open("data/test_data.json", "r", encoding="utf-8") as file:
    test_data = [json.loads(line) for line in file if line.strip()]

# Build the columns inline instead of through module-level `id` / `text`
# lists: a global named `id` would shadow the builtin for every later cell.
test_data_df = pd.DataFrame({
    "id": [instance["id"] for instance in test_data],
    # Each token is stringified and the tokens are joined with single spaces.
    "text": [" ".join(map(str, instance["text"])) for instance in test_data],
})
print(len(test_data_df))


   id                                               text  label
0   0  16 231 543 5 15 43 8282 94 231 1129 31 34 32 9...      1
1   1  16 4046 138 10 2 1809 2007 3763 14 40113 13 90...      1
2   2  1108 16550 3 6168 3 160 284 19 49 464 5333 8 4...      1
3   3  1802 27 16 25 48 451 632 3 2 2164 25 2380 34 7...      1
4   4  16 19 302 93 97 43 952 118 1 16 528 2 26528 10...      1
     id                                               text  label
0  5000  12 920 7 1266 28 9884 1640 116 11 1342 1533 28...      1
1  5001  783 397 253 5797 9379 22 793 11838 10 607 6324...      1
2  5002  888 14851 323 9 27 1377 584 195 3 137 10 2732 ...      1
3  5003  228 1161 5815 379 9 941 10 2 316 4 2693 594 87...      1
4  5004  736 19 37 813 45 6723 27 626 8 2 3446 4 564 34...      1
4000


## **Baseline (BOW + Naive Bayes)**

## **Undersampling**

In [None]:


# **Undersampling + BOW**

from sklearn.utils import resample

# Balance domain2 by randomly dropping rows of its most frequent label until
# it matches the size of its least frequent label.
label_counts = domain2['label'].value_counts()
majority_label = label_counts.idxmax()
minority_label = label_counts.idxmin()

domain2_majority = domain2[domain2['label'] == majority_label]
domain2_minority = domain2[domain2['label'] == minority_label]

if majority_label == minority_label:
    # Every label has the same count (or there is only one label), so both
    # lookups resolve to the same label. Sampling here would duplicate that
    # one class and drop the other, so the frame is kept as it is.
    domain2_undersampled = domain2.copy()
else:
    # Sample without replacement so no majority row is duplicated; the fixed
    # seed keeps the selected subset reproducible.
    domain2_majority_undersampled = resample(domain2_majority,
                                             replace=False,
                                             n_samples=len(domain2_minority),
                                             random_state=42)
    domain2_undersampled = pd.concat([domain2_majority_undersampled, domain2_minority])

print(domain2_undersampled['label'].value_counts())

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Baseline: bag-of-words counts + multinomial Naive Bayes on domain1 plus the
# undersampled domain2.
combined_data = pd.concat([domain1, domain2_undersampled], ignore_index=True)

X = combined_data['text']
y = combined_data['label']

# Split the raw text BEFORE fitting the vectorizer so the vocabulary is learned
# from the training rows only; fitting on all rows leaks hold-out information
# into the features (the oversampling cells already split first).
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): the default token pattern keeps only tokens of two or more
# characters, so single-character token ids are ignored -- confirm intended.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

mnb_classifier = MultinomialNB()
mnb_classifier.fit(X_train, y_train)

y_pred = mnb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

## **Undersampling + TFIDF**

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF + multinomial Naive Bayes on the same combined, undersampled data.
X = combined_data['text']
y = combined_data['label']

# Split the raw text BEFORE fitting the vectorizer: both the vocabulary and the
# IDF weights are learned by fit, so fitting on all rows would leak hold-out
# statistics into the training features.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# more models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier


def _fit_and_score(name, classifier, X_fit, y_fit, X_eval, y_eval):
    """Fit ``classifier``, score it on the evaluation split and report it.

    Parameters
    ----------
    name : str
        Label printed in front of ``"Accuracy:"``.
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_fit, y_fit
        Training features and labels passed to ``fit``.
    X_eval, y_eval
        Evaluation features passed to ``predict`` and the labels the
        predictions are compared against.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` for the evaluation split.
    """
    classifier.fit(X_fit, y_fit)
    predictions = classifier.predict(X_eval)
    score = accuracy_score(y_eval, predictions)
    print(f"{name} Accuracy:", score)
    return predictions, score


# Each model is trained and scored on the split currently held in
# X_train / X_test / y_train / y_test.

#SVM
svm_classifier = SVC()
y_pred_svm, accuracy_svm = _fit_and_score(
    "SVM", svm_classifier, X_train, y_train, X_test, y_test)

#LR
lr_classifier = LogisticRegression()
y_pred_lr, accuracy_lr = _fit_and_score(
    "Logistic Regression", lr_classifier, X_train, y_train, X_test, y_test)

#XGB
xgb_classifier = XGBClassifier()
y_pred_xgb, accuracy_xgb = _fit_and_score(
    "XGBoost", xgb_classifier, X_train, y_train, X_test, y_test)

#RF
# Seeded so the reported accuracy is the same on every run.
rf_classifier = RandomForestClassifier(random_state=42)
y_pred_rf, accuracy_rf = _fit_and_score(
    "Random Forest", rf_classifier, X_train, y_train, X_test, y_test)

## **Oversampling**

In [None]:


### SMOTE

from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.naive_bayes import BernoulliNB

# Oversampling experiments use the full (imbalanced) domain2 rather than the
# undersampled copy.
combined_domain = pd.concat([domain1, domain2], ignore_index=True)
# Accuracy table filled in cell by cell: rows = oversampler, columns = features.
accuracy_df = pd.DataFrame(index=['SMOTE', 'ADASYN'], columns=['BOW', 'TFIDF'])

########### BOW
X = combined_domain['text']
y_BOW = combined_domain['label']
# Stratified split on the raw text; the vectorizer is fitted on the training
# rows only and merely applied to the hold-out rows.
X_train, X_test, y_train_BOW, y_test_BOW = train_test_split(X, y_BOW, test_size=0.2, random_state=42, stratify=y_BOW)
vectorizer = CountVectorizer()
X_train_BOW = vectorizer.fit_transform(X_train)
X_test_BOW = vectorizer.transform(X_test)

# Only the training split is oversampled; the hold-out split stays untouched.
# `n_jobs` is no longer passed: it is deprecated for imbalanced-learn's
# over-samplers and None was its default, so the resampling is unchanged.
X_train_BOW_resampled_SMOTE, y_train_BOW_resampled_SMOTE = SMOTE(sampling_strategy='auto', random_state=35, k_neighbors=5).fit_resample(X_train_BOW, y_train_BOW)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_BOW_resampled_SMOTE, y_train_BOW_resampled_SMOTE)

y_pred_BOW_SMOTE = nb_classifier.predict(X_test_BOW)
accuracy = accuracy_score(y_test_BOW, y_pred_BOW_SMOTE)
accuracy_df.loc['SMOTE', 'BOW'] = accuracy

########## TFIDF
X = combined_domain['text']
y_TFIDF = combined_domain['label']
# Stratified split on the raw text; the TF-IDF vocabulary and IDF weights are
# fitted on the training rows only.
X_train, X_test, y_train_TFIDF, y_test_TFIDF = train_test_split(X, y_TFIDF, test_size=0.2, random_state=42, stratify=y_TFIDF)
vectorizer = TfidfVectorizer()
X_train_TFIDF = vectorizer.fit_transform(X_train)
X_test_TFIDF = vectorizer.transform(X_test)

# Project the unlabelled test set into the same TF-IDF space for the final
# submission.
x = test_data_df["text"]
assert len(x) == 4000, "expected 4000 test instances"
X_test_oversampling = vectorizer.transform(x)
print(X_test_oversampling.shape)

# Only the training split is oversampled; the hold-out split stays untouched.
# `n_jobs` is no longer passed: it is deprecated for imbalanced-learn's
# over-samplers and None was its default, so the resampling is unchanged.
X_train_TFIDF_resampled_SMOTE, y_train_TFIDF_resampled_SMOTE = SMOTE(sampling_strategy='auto', random_state=35, k_neighbors=5).fit_resample(X_train_TFIDF, y_train_TFIDF)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_TFIDF_resampled_SMOTE, y_train_TFIDF_resampled_SMOTE)

y_pred_TFIDF_SMOTE = nb_classifier.predict(X_test_TFIDF)
accuracy = accuracy_score(y_test_TFIDF, y_pred_TFIDF_SMOTE)
accuracy_df.loc['SMOTE', 'TFIDF'] = accuracy


### ADASYN


from imblearn.over_sampling import SMOTE, ADASYN

# ADASYN on the TF-IDF training split. `n_jobs` is no longer passed: it is
# deprecated for imbalanced-learn's over-samplers and None was its default,
# so the resampling is unchanged.
X_train_TFIDF_resampled_ADA, y_train_TFIDF_resampled_ADA = ADASYN(sampling_strategy='auto', random_state=35, n_neighbors=5).fit_resample(X_train_TFIDF, y_train_TFIDF)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_TFIDF_resampled_ADA, y_train_TFIDF_resampled_ADA)

y_pred_TFIDF_ADA = nb_classifier.predict(X_test_TFIDF)
accuracy = accuracy_score(y_test_TFIDF, y_pred_TFIDF_ADA)
accuracy_df.loc['ADASYN', 'TFIDF'] = accuracy


# ADASYN on the bag-of-words training split, same settings.
X_train_BOW_resampled_ADA, y_train_BOW_resampled_ADA = ADASYN(sampling_strategy='auto', random_state=35, n_neighbors=5).fit_resample(X_train_BOW, y_train_BOW)

nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_BOW_resampled_ADA, y_train_BOW_resampled_ADA)

y_pred_BOW_ADA = nb_classifier.predict(X_test_BOW)
accuracy = accuracy_score(y_test_BOW, y_pred_BOW_ADA)
accuracy_df.loc['ADASYN', 'BOW'] = accuracy

# All four oversampler / feature combinations are filled in at this point.
print(accuracy_df)

# Default-parameter SVC trained on the ADASYN-balanced TF-IDF training matrix
# and scored on the untouched TF-IDF hold-out split. This rebinds
# `svm_classifier`, which the submission cell uses afterwards.
svm_classifier = SVC().fit(X_train_TFIDF_resampled_ADA, y_train_TFIDF_resampled_ADA)
y_pred_svm_ADA = svm_classifier.predict(X_test_TFIDF)
accuracy_svm_ADA = accuracy_score(y_true=y_test_TFIDF, y_pred=y_pred_svm_ADA)
print("SVM Accuracy:", accuracy_svm_ADA)

## **Word2Vec**

## **DaNN**

## **Test Result**

In [29]:
def get_predict_csv(clfs, X_test, id):
    """Predict with a fitted classifier and tabulate the result per instance.

    Parameters
    ----------
    clfs : estimator
        Object exposing ``predict(X)``.
    X_test
        Feature rows handed to ``clfs.predict``.
    id
        Identifiers written to the ``id`` column, one per row of ``X_test``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``id`` and ``class`` (the predictions).
    """
    return pd.DataFrame({"id": id, "class": clfs.predict(X_test)})

# Predict the unlabelled test set with the current `svm_classifier`, show the
# table, and write it to disk without the index column.
test_results = get_predict_csv(
    clfs=svm_classifier,
    X_test=X_test_oversampling,
    id=test_data_df["id"],
)
print(test_results)
test_results.to_csv(path_or_buf='test_results.csv', index=False)

        id  class
0        0      1
1        1      0
2        2      0
3        3      0
4        4      0
...    ...    ...
3995  3995      0
3996  3996      0
3997  3997      0
3998  3998      0
3999  3999      1

[4000 rows x 2 columns]
