In [46]:
import pandas as pd
import numpy as np
import glob
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from parsivar import Normalizer
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier


TRAIN_PATH = "Dataset/Train/"
TEST_PATH = "Dataset/Test/"

categories = [
    'Economics',
    'Sociology',
    'Sports',
    'Religions',
    'Tech',
    'Strategic',
    'Politics'
]

normalizer = Normalizer()

df_train = pd.DataFrame(columns=["text", "category"])
df_test = pd.DataFrame(columns=["text", "category"])

for category in categories:
    all_files = glob.glob(TRAIN_PATH + category + "/*.txt")
    for file in all_files:
        with open(file, 'r', encoding = 'utf-8') as file:
            text = file.read().replace('\n', ' ')
            text = normalizer.normalize(text)
        df_train.loc[len(df_train)] = [text, category]

for category in categories:
    all_files = glob.glob(TEST_PATH + category + "/*.txt")
    for file in all_files:
        with open(file, 'r', encoding = 'utf-8') as file:
            text = file.read().replace('\n', ' ')
            text = normalizer.normalize(text)
        df_test.loc[len(df_test)] = [text, category]

with open('persian-stopwords',  encoding = 'utf-8') as f:
    content = f.readlines()
stop_words = [x.strip() for x in content]

for i in range (len(df_train)):
    df_train.loc[i]['text'] = df_train.loc[i]['text'].replace('amp', ' ')
    df_train.loc[i]['text'] = df_train.loc[i]['text'].replace('nbsp', ' ')
    df_train.loc[i]['text'] = df_train.loc[i]['text'].replace('\u200c', ' ')
    df_train.loc[i]['text'] = ''.join([i for i in df_train.loc[i]['text'] if not i.isdigit()])

for i in range (len(df_test)):
    df_train.loc[i]['text'] = df_train.loc[i]['text'].replace('amp', ' ')
    df_train.loc[i]['text'] = df_train.loc[i]['text'].replace('nbsp', ' ')
    df_test.loc[i]['text'] = df_test.loc[i]['text'].replace('\u200c', ' ')
    df_test.loc[i]['text'] = ''.join([i for i in df_test.loc[i]['text'] if not i.isdigit()])

# print(df_train.to_string())
df_train.head()

Unnamed: 0,text,category
0,تا وقتی نیروی کار کشور زیر حداقل معیشت به سر م...,Economics
1,پانزده تن از استادان دانشگاه و کارشناسان اقتصا...,Economics
2,تن از استادان و مدرسان علم اقتصاد دانشگاه های...,Economics
3,دکتر محمد ستاری فر از امضاکنندگان نامه اقتصاد...,Economics
4,رییس کل بانک مرکزی متن نهائی بسته سیاستی نظارت...,Economics


In [43]:
X_train = df_train['text']
y_train = df_train['category']

X_test = df_test['text']
y_test = df_test['category']

count_vect = CountVectorizer(max_features=500, stop_words=stop_words)
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Naive Bayes
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# print(count_vect.get_feature_names())

result = clf.predict(count_vect.transform(X_test))
warnings.filterwarnings('ignore')
print("accuracy:  " + str(accuracy_score(y_test, result)))
print("precision: " + str(precision_score(y_test, result, average='macro')))
print("recall:    " + str(recall_score(y_test, result, average='macro')))
print("f-measure: " + str(f1_score(y_test, result, average='macro')))

accuracy:  0.9285714285714286
precision: 0.9523809523809524
recall:    0.9285714285714286
f-measure: 0.9238095238095239


In [45]:
print("~~~~~~~~~~~~ KNN With TF ~~~~~~~~~~~~")

iterations = [1, 5, 15]
for iteration in iterations:
    print("for K =", str(iteration))
    knn = KNeighborsClassifier(n_neighbors=iteration, metric='euclidean').fit(X_train_counts, y_train)
    result = knn.predict(count_vect.transform(X_test))
    print("accuracy: " + str(accuracy_score(y_test, result)))
    print("confusion matrix:")
    print(confusion_matrix(y_test, result))
    print("\n")
    

print("~~~~~~~~~~~~ KNN With TF-IDF ~~~~~~~~~~~~")

iterations = [1, 5, 15]
for iteration in iterations:
    print("for K =", str(iteration))
    knn = KNeighborsClassifier(n_neighbors=iteration, metric='euclidean').fit(X_train_tfidf, y_train)
    result = knn.predict(count_vect.transform(X_test))
    print("accuracy: " + str(accuracy_score(y_test, result)))
    print("confusion matrix:")
    print(confusion_matrix(y_test, result))
    print("\n")

~~~~~~~~~~~~ KNN With TF ~~~~~~~~~~~~
for K = 1
accuracy: 0.5714285714285714
confusion matrix:
[[0 0 0 2 0 0 0]
 [0 2 0 0 0 0 0]
 [0 0 1 1 0 0 0]
 [0 0 0 2 0 0 0]
 [0 0 0 0 2 0 0]
 [0 0 0 1 0 0 1]
 [0 0 0 1 0 0 1]]


for K = 5
accuracy: 0.2857142857142857
confusion matrix:
[[0 0 2 0 0 0 0]
 [0 1 1 0 0 0 0]
 [0 0 2 0 0 0 0]
 [0 0 2 0 0 0 0]
 [0 0 0 0 1 0 1]
 [0 0 2 0 0 0 0]
 [0 0 1 0 0 1 0]]


for K = 15
accuracy: 0.2857142857142857
confusion matrix:
[[0 0 0 0 0 0 2]
 [0 1 0 0 0 0 1]
 [0 0 0 0 0 0 2]
 [0 0 0 0 0 0 2]
 [0 0 0 0 1 0 1]
 [0 0 0 0 0 0 2]
 [0 0 0 0 0 0 2]]


~~~~~~~~~~~~ KNN With TF-IDF ~~~~~~~~~~~~
for K = 1
accuracy: 0.8571428571428571
confusion matrix:
[[1 1 0 0 0 0 0]
 [0 2 0 0 0 0 0]
 [0 0 2 0 0 0 0]
 [0 0 0 1 1 0 0]
 [0 0 0 0 2 0 0]
 [0 0 0 0 0 2 0]
 [0 0 0 0 0 0 2]]


for K = 5
accuracy: 0.7142857142857143
confusion matrix:
[[2 0 0 0 0 0 0]
 [1 1 0 0 0 0 0]
 [0 1 1 0 0 0 0]
 [0 0 0 1 1 0 0]
 [0 0 0 0 2 0 0]
 [0 0 0 0 0 2 0]
 [0 0 0 0 0 1 1]]


for K = 15
accuracy: 0.7