In [199]:
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from hazm import Normalizer
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier


TRAIN_PATH = "Dataset/Train/"
TEST_PATH = "Dataset/Test/"

categories = [
    'Economics',
    'Sociology',
    'Sports',
    'Religions',
    'Tech',
    'Strategic',
    'Politics'
]

normalizer = Normalizer()

df_train = pd.DataFrame(columns=["text", "category"])
df_test = pd.DataFrame(columns=["text", "category"])

for category in categories:
    all_files = glob.glob(TRAIN_PATH + category + "/*.txt")
    for file in all_files:
        with open(file, 'r', encoding = 'utf-8') as file:
            text = file.read().replace('\n', ' ')
            normalizer.normalize(text)
        df_train.loc[len(df_train)] = [text, category]

for category in categories:
    all_files = glob.glob(TEST_PATH + category + "/*.txt")
    for file in all_files:
        with open(file, 'r', encoding = 'utf-8') as file:
            text = file.read().replace('\n', ' ')
            normalizer.normalize(text)
        df_test.loc[len(df_test)] = [text, category]

with open('persian-stopwords',  encoding = 'utf-8') as f:
    content = f.readlines()
stop_words = [x.strip() for x in content]

df.head()

Unnamed: 0,text,category
0,مدير بانك خون بند ناف پژوهشگاه رويان جهاد دا...,Tech
1,شروين تقوي، دانشجوي دكتري مهندسي برق دانشگاه...,Tech
2,پانزده تن از استادان دانشگاه و كارشناسان اقت...,Economics
3,اشاره: آنچه كه در پي مي‌آيد &amp;nbsp;اولين ...,Strategic
4,دكتر سيد حسين نصر، &amp;nbsp;آسيب‌شناسي ديني...,Religions


In [208]:
X_train = df_train['text']
y_train = df_train['category']

X_test = df_test['text']
y_test = df_test['category']

count_vect = CountVectorizer(max_features=500, stop_words=stop_words)
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

result = clf.predict(count_vect.transform(X_test))
print("accuracy: " + str(accuracy_score(y_test, result)))
confusion_matrix(y_test, result)

accuracy: 0.8571428571428571




array([[1, 0, 0, 0, 0, 0, 1],
       [0, 2, 0, 0, 0, 0, 0],
       [0, 0, 2, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1],
       [0, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 2, 0],
       [0, 0, 0, 0, 0, 0, 2]], dtype=int64)