# Import modules

In [1]:
# basic import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import math

%matplotlib inline

In [2]:
import spacy
import re

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import classification_report, balanced_accuracy_score, confusion_matrix
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.metrics import make_scorer, balanced_accuracy_score

In [23]:
# Relative paths to the training and test CSV files read by the next cell.
link_train = "./dataset/clean.csv"
link_test = "./dataset/test_clean.csv"

In [24]:
# Load both comma-separated datasets into DataFrames.
train_df = pd.read_csv(link_train, sep=",")
test_df = pd.read_csv(link_test, sep=",")

In [25]:
# Keep the test-set identifiers; the export cell uses them as the 'IDText' column.
# NOTE(review): this name shadows Python's built-in id(). Renaming it requires
# updating the export cell that reads `id` at the same time.
id = test_df['IDText']

In [27]:
# Count missing values per column of the training data.
train_df.isna().sum()

text     5
label    0
dtype: int64

In [8]:
# Preview the first training rows (columns: text, label).
train_df.head()

Unnamed: 0,text,label
0,kunjung prabowo untuk resmi serah proyek bantu...,Sumber Daya Alam
1,anies tepuk tangan riah jadi rektor wajib mata...,Politik
2,benar dukung goblok dukung hanya saja pak ridw...,Demografi
3,waktu anies sikap kritis kerja pak prabowo ang...,Politik
4,anies baswedan harap asn masuk tni polri pegan...,Politik


In [9]:
# Remove training rows whose 'text' is missing (the null count above reports 5).
# This mutates train_df in place, so every later cell sees the filtered frame.
train_df.dropna(subset=['text'], inplace=True)

In [10]:
# Preview the first test rows (columns: IDText, text).
test_df.head()

Unnamed: 0,IDText,text
0,TXT0001,mau orang prodemokrasi negara bisa punya sempa...
1,TXT0002,prabowo tanya soal hutang luar negeri jawab hu...
2,TXT0003,kikidaliyo ganjar pranowo beliau sosok mengagu...
3,TXT0004,prabowo gibran bisa laku semua sejahtera rakyat
4,TXT0005,udazulhendra lah justru sambung junjung lu aom...


In [11]:
# Column dtypes and non-null counts for the test set.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   IDText  1000 non-null   object
 1   text    1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [12]:
# Summary of the test columns. The output shows 928 unique texts in 1000 rows,
# i.e. the test set contains repeated texts (the most frequent appears 17 times).
test_df.describe()

Unnamed: 0,IDText,text
count,1000,1000
unique,1000,928
top,TXT0001,pasang duet capres cawapres ganjar pranowo pro...
freq,1,17


# Predict

In [13]:
# Hyper-parameters applied to the pipeline below, keyed by bare parameter name.
base_parameter = {
    # TfidfVectorizer settings
    'max_df': 0.8981026022043065,
    'min_df': 2,
    'ngram_range': (1, 2),
    # ComplementNB settings
    'alpha': 7.09904501602026,
    'fit_prior': False,
}

In [14]:
# Three-step pipeline: TF-IDF features -> SMOTE oversampling -> Complement Naive Bayes.
# The step names matter: set_params addresses parameters as '<step name>__<param>'.
tfidf_step = ('vectorizer_tfid', TfidfVectorizer())
smote_step = ('smote', SMOTE(random_state=42))
classifier_step = ('Complement NB', ComplementNB())

best_clf = Pipeline(steps=[tfidf_step, smote_step, classifier_step])

In [15]:
# Prefix each bare parameter name with the pipeline step it belongs to, producing
# the '<step name>__<param>' keys that Pipeline.set_params expects.
vectorizer_keys = ('max_df', 'min_df', 'ngram_range')
classifier_keys = ('alpha', 'fit_prior')

updated_best_params = {
    **{f'vectorizer_tfid__{key}': base_parameter[key] for key in vectorizer_keys},
    **{f'Complement NB__{key}': base_parameter[key] for key in classifier_keys},
}

# Apply the parameters; the configured pipeline is the cell's displayed value.
best_clf.set_params(**updated_best_params)

In [16]:
# Copy the text column unchanged under the name the training cell reads.
train_df['stemmed_text'] = train_df['text']

# Class name -> integer id used as the training target.
encoding_dict = {
    'Politik':0,
    'Sosial Budaya':1,
    'Pertahanan dan Keamanan':2,
    'Ideologi':3,
    'Ekonomi':4,
    'Sumber Daya Alam':5,
    'Demografi':6,
    'Geografi':7,
}

# Series.map yields NaN for any value that is not a key of encoding_dict.
# Stop here and name the offending labels instead of handing NaN targets
# to the classifier.
encoded_label = train_df['label'].map(encoding_dict)
unmapped = encoded_label.isna()
if unmapped.any():
    unknown_labels = sorted(train_df.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(f"Labels missing from encoding_dict: {unknown_labels}")

# Store the integer-encoded 'label' column alongside the original.
train_df['encoded_label'] = encoded_label

In [17]:
# Training inputs (text) and integer-encoded targets.
X = train_df['stemmed_text']
y = train_df['encoded_label']

# fit() returns the fitted pipeline, so prediction on the test text chains directly.
test_predictions = best_clf.fit(X, y).predict(test_df['text'])

## Predicted class distribution

In [19]:
# Rebind encoding_dict to the reverse direction: integer id -> class name.
# Each name's id is its position in this list.
encoding_dict = dict(enumerate([
    'Politik',
    'Sosial Budaya',
    'Pertahanan dan Keamanan',
    'Ideologi',
    'Ekonomi',
    'Sumber Daya Alam',
    'Demografi',
    'Geografi',
]))

# Number of test rows predicted for each class name.
predicted_labels = pd.Series(test_predictions).map(encoding_dict)
predicted_labels.value_counts()

Politik                    196
Ekonomi                    186
Pertahanan dan Keamanan    182
Ideologi                   143
Sumber Daya Alam            95
Geografi                    70
Sosial Budaya               65
Demografi                   63
Name: count, dtype: int64

# Export results

In [22]:
# Read the IDs straight from test_df rather than from the notebook-level `id`
# variable. If the cell that assigns `id` has not been run, `id` is Python's
# built-in function and pandas would broadcast that object into every row.
submission1 = pd.DataFrame({
    'IDText': test_df['IDText'],
    'Kelas': test_predictions
})

# Translate the predicted integer ids back to class names.
submission1['Kelas'] = submission1['Kelas'].map(encoding_dict)

# map() leaves NaN for any id that is not a key of encoding_dict, which happens
# when encoding_dict currently holds the name -> id direction. Refuse to write
# a submission with missing classes.
if submission1['Kelas'].isna().any():
    raise ValueError(
        "Some predictions could not be mapped to a class name; "
        "re-run the cell that defines the id -> name encoding_dict."
    )

# Write the submission without the DataFrame index column.
submission1.to_csv('./dataset/result.csv', index=False)