In [10]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection._split import train_test_split
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from hazm import word_tokenize
from hazm.Stemmer import Stemmer
from keras import models, layers
from keras.utils import to_categorical
from sklearn import preprocessing
from sklearn.feature_selection.univariate_selection import SelectPercentile
from sklearn.feature_selection import chi2
import numpy
from sklearn import svm
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics.classification import f1_score, precision_score, recall_score
from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.metrics import zero_one_loss
from sklearn.metrics import confusion_matrix

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Izad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Izad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
perisca_dataset = pd.read_csv("per.csv", encoding="UTF-8", header=0)
perisca_dataset.head()

Unnamed: 0,NewsID,Title,Body,Date,Time,Category,Category2
0,843656,\nوزير علوم درجمع استادان نمونه: سن بازنشستگي ...,\nوزير علوم در جمع استادان نمونه كشور گفت: از ...,\n138/5//09,\n0:9::18,\nآموزشي-,\nآموزشي
1,837144,\nگردهمايي دانش‌آموختگان موسسه آموزش عالي سوره...,\nبه گزارش سرويس صنفي آموزشي خبرگزاري دانشجويا...,\n138/5//09,\n1:4::11,\nآموزشي-,\nآموزشي
2,436862,\nنتايج آزمون دوره‌هاي فراگير دانشگاه پيام‌نور...,\nنتايج آزمون دوره‌هاي فراگير مقاطع كارشناسي و...,\n138/3//07,\n1:0::03,\nآموزشي-,\nآموزشي
3,227781,\nهمايش يكروزه آسيب شناسي مفهوم روابط عمومي در...,\n,\n138/2//02,\n1:3::42,\nاجتماعي-خانواده-,\nاجتماعي
4,174187,\nوضعيت اقتصادي و ميزان تحصيلات والدين از مهمت...,\nمحمدتقي علوي يزدي، مجري اين طرح پژوهشي در اي...,\n138/1//08,\n1:1::49,\nآموزشي-,\nآموزشي


In [12]:
#stopwords from: https://raw.githubusercontent.com/kharazi/persian-stopwords/master/persian
# NLTK : Natural Language Toolkit
with open('stopwords.txt', encoding="UTF-8") as stopwords_file:
    stopwords = stopwords_file.readlines()
stopwords = [str(line).replace('\n', '') for line in stopwords]

# insert NLTK English stopwords into nltk_stopwords
nltk_stopwords = nltk.corpus.stopwords.words('english')
# appent presian stopwords in nltk_stopwords
nltk_stopwords.extend(stopwords)

In [13]:
len(nltk_stopwords)

1495

In [14]:
stemmer = Stemmer()
dataset = pd.DataFrame(columns=('title_body', 'category'))

In [18]:
# We can make a dataframe with the concatination of series
for index, row in perisca_dataset.iterrows():
    title_body = row['Title'] + ' ' + row['Body']
    title_body_tokenized = word_tokenize(title_body)
    title_body_tokenized_filtered = [w for w in title_body_tokenized if not w in nltk_stopwords]
    title_body_tokenized_filtered_stemming = [stemmer.stem(w) for w in title_body_tokenized_filtered]
    dataset.loc[index] = {'title_body': ' '.join(title_body_tokenized_filtered_stemming), 'category': row['Category2']}

dataset.head()

Unnamed: 0,title_body,category
0,وزير علو درجمع استاد نمونه سن بازنشستگي استاد ...,\nآموزشي
1,گردهمايي دانش‌آموختگ موسسه آموز عالي سوره برگز...,\nآموزشي
2,نتايج آزمون دوره‌هاي فراگير دانشگاه پيام‌نور ن...,\nآموزشي
3,هماي يكروزه آسيب شناسي مفهو روابط عمومي بابلسر...,\nاجتماعي
4,وضعي اقتصادي ميز تحصيل والدين مهمترين عوامل مو...,\nآموزشي


In [20]:
# df = Document Frequency
# idf=>  inverted doument frequency => 1/df
# Tfidf =>  Tf * idf
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
X = vectorizer.fit(dataset['title_body']).transform(dataset['title_body']) # Bag of Words
#--------------------------------------------
# Why fit is separate from transform? 
# Fit only applys on training data, but transform would apply on both train and test data

In [21]:
le = preprocessing.LabelEncoder()
y = le.fit(dataset['category']).transform(dataset['category'])
numpy.unique(dataset['category'])


array(['\nآموزشي', '\nاجتماعي', '\nاقتصادي', '\nبهداشتي', '\nتاريخي',
       '\nسياسي', '\nعلمي', '\nفرهنگي', '\nفقه و حقوق', '\nمذهبي',
       '\nورزشي'], dtype=object)

In [22]:
numpy.shape(X)

(10999, 60798)

In [23]:
numpy.shape(y)

(10999,)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [25]:
ch2 = SelectPercentile(chi2, 80)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

In [29]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
score = sgd.score(X_test, y_test)
print('SDG score: ' + str(score))

SDG score: 0.8585454545454545


In [30]:
svmc = svm.SVC()
svmc.fit(X_train, y_train)
score = svmc.score(X_test, y_test)
print('svm score: ' + str(score))

svm score: 0.8472727272727273


In [31]:
svmlc = svm.SVC(kernel='linear')
svmlc.fit(X_train, y_train)
score = svmlc.score(X_test, y_test)
print('svm linear score: ' + str(score))

svm linear score: 0.854909090909091


In [32]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
score = knn.score(X_train, y_train)
print('knn score: ' + str(score))

knn score: 0.8468905321857195


In [22]:
knn2 = KNeighborsClassifier(n_neighbors=20, weights='distance')
knn2.fit(X_train, y_train)
score = knn2.score(X_train, y_train)
print('knn2 score: ' + str(score))

knn2 score: 1.0


In [23]:
mnnb = MultinomialNB()
mnnb.fit(X_train, y_train)
score = mnnb.score(X_train, y_train)
print('mnnb score: ' + str(score))

mnnb score: 0.8060370954055037


In [24]:
abc = AdaBoostClassifier()
abc.fit(X_train, y_train)
score = abc.score(X_train, y_train)
print('abc score: ' + str(score))

abc score: 0.5729179294459934


In [25]:
abc2 = AdaBoostClassifier(n_estimators=100)
abc2.fit(X_train, y_train)
score = abc2.score(X_train, y_train)
print('abc2 score: ' + str(score))

abc2 score: 0.6158322220875257


In [33]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
score = rfc.score(X_train, y_train)
print('rfc score: ' + str(score))

rfc score: 1.0


In [27]:
rfc2 = RandomForestClassifier(n_estimators=200)
rfc2.fit(X_train, y_train)
score = rfc2.score(X_train, y_train)
print('rfc2 score: ' + str(score))

rfc2 score: 1.0


In [54]:
vcls = VotingClassifier(estimators=[('randomforest', rfc2), ('naivebayes', mnnb), ('knn', knn), ('svm', svmlc)])
vcls.fit(X_train, y_train)
score = vcls.score(X_train, y_train)
print('vcls score: ' + str(score))

vcls score: 0.933567705176385


In [34]:
mlp = MLPClassifier(hidden_layer_sizes=(15,), random_state=1, max_iter=1, warm_start=True)
for i in range(10):
    mlp.fit(X_train, y_train)
score = mlp.score(X_train, y_train)
print('mlp score: ' + str(score))

mlp score: 0.946781428051885


In [67]:
vcls_predict = vcls.predict(X_test)
vcls_recall = recall_score(y_test, vcls_predict, average='macro')
vcls_precision = precision_score(y_test, vcls_predict, average='macro')
vcls_f1 = f1_score(y_test, vcls_predict, average='macro')
vcls_conf = confusion_matrix(y_test, vcls_predict)
vcls_zol = zero_one_loss(y_test, vcls_predict)
print("vcls recal: " + str(vcls_recall))
print("vcls precision: " + str(vcls_precision))
print("vcls f1: " + str(vcls_f1))
print("vcls confusion matrix: \n" + str(vcls_conf))
print("vcls zero one loss: " + str(vcls_zol))

vcls recal: 0.8117360851173391
vcls precision: 0.8288885138354627
vcls f1: 0.808676817701605
vcls confusion matrix: 
[[243   1   0   0   2   0   2   1   0   0   0]
 [ 30 134  29   9  13   4   6  10   7   8   2]
 [  6  15 193   1   6   5   4   5   2   2   1]
 [  9   0   1 243   1   1   0   0   1   0   0]
 [  6   1   1   0 259   0   0   0   0   0   0]
 [  9   3   6   5  56 146   1   2  11   1   2]
 [ 32   4  10  34   3   4 159   1   1   0   3]
 [  8   7   5   0  27   3   4 194   0   6   2]
 [  4   8   2   2  16   8   1   0 203   0   0]
 [  1   4   3   1  11   2   0   1   2 226   0]
 [  0   0   1   3   1   1   1   0   0   0 235]]
vcls zero one loss: 0.18727272727272726


In [35]:
mlp_predict = mlp.predict(X_test)
mlp_recall = recall_score(y_test, mlp_predict, average='macro')
mlp_precision = precision_score(y_test, mlp_predict, average='macro')
mlp_f1 = f1_score(y_test, mlp_predict, average='macro')
mlp_conf = confusion_matrix(y_test, mlp_predict)
mlp_zol = zero_one_loss(y_test, mlp_predict)
print("mlp recal: " + str(mlp_recall))
print("mlp precision: " + str(mlp_precision))
print("mlp f1: " + str(mlp_f1))
print("mlp confusion matrix: \n" + str(mlp_conf))
print("mlp zero one loss: " + str(mlp_zol))

mlp recal: 0.8273488350112288
mlp precision: 0.8225387422953303
mlp f1: 0.8233802868428518
mlp confusion matrix: 
[[211   4   3   3   1   2  17   1   1   2   0]
 [ 13 155  30  11   6  15   9  13   6   9   4]
 [  1  22 195   1   1  10   9   3   1   4   1]
 [  2   4   4 211   0   4   4   1   0   1   0]
 [  1   0   1   0 218   2   0   4   0   1   0]
 [  2   6  13   4  20 189   2   7  13   4   1]
 [ 17   5   7  19   1   4 189   5   1   1   0]
 [  1   9   3   2  10   4   7 199   0   4   2]
 [  1  17   2   1   8  11   7   2 211   4   1]
 [  0   6   1   0   1   2   2   1   1 245   0]
 [  1   4   0   2   1   0   0   2   0   1 242]]
mlp zero one loss: 0.1763636363636364


In [68]:
sgd_predict = sgd.predict(X_test)
sgd_recall = recall_score(y_test, sgd_predict, average='macro')
sgd_precision = precision_score(y_test, sgd_predict, average='macro')
sgd_f1 = f1_score(y_test, sgd_predict, average='macro')
sgd_conf = confusion_matrix(y_test, sgd_predict)
sgd_zol = zero_one_loss(y_test, sgd_predict)
print("SGD recal: " + str(sgd_recall))
print("SGD precision: " + str(sgd_precision))
print("SGD f1: " + str(sgd_f1))
print("SGD confusion matrix: \n" + str(sgd_conf))
print("SGD zero one loss: " + str(sgd_zol))

SGD recal: 0.8585117734456366
SGD precision: 0.8559277325517001
SGD f1: 0.8554769644173849
SGD confusion matrix: 
[[227   5   2   2   0   2   7   3   0   1   0]
 [ 11 149  25   9   2  10   7  16   9  10   4]
 [  1  18 188   1   2  11   8   4   2   3   2]
 [  1   1   0 250   0   2   2   0   0   0   0]
 [  1   1   0   0 254   7   0   2   1   1   0]
 [  3   5   5   5  10 189   4   3  13   2   3]
 [ 18   3   7  22   1   4 187   5   1   0   3]
 [  2   7   1   0   5   3   3 227   1   5   2]
 [  1   4   3   1   3  12   3   1 216   0   0]
 [  0   4   3   0   1   3   0   1   0 239   0]
 [  0   1   1   3   0   0   0   0   0   0 237]]
SGD zero one loss: 0.1407272727272727


In [36]:
svmc_predict = svmc.predict(X_test)
svmc_recall = recall_score(y_test, svmc_predict, average='macro')
svmc_precision = precision_score(y_test, svmc_predict, average='macro')
svmc_f1 = f1_score(y_test, svmc_predict, average='macro')
svmc_conf = confusion_matrix(y_test, svmc_predict)
svmc_zol = zero_one_loss(y_test, svmc_predict)
print("svmc recal: " + str(svmc_recall))
print("svmc precision: " + str(svmc_precision))
print("svmc f1: " + str(svmc_f1))
print("svmc confusion matrix: \n" + str(svmc_conf))
print("svmc zero one loss: " + str(svmc_zol))

svmc recal: 0.8506782054850516
svmc precision: 0.8463432486996034
svmc f1: 0.8471269563682576
svmc confusion matrix: 
[[223   5   0   3   0   2   7   1   2   2   0]
 [  8 172  25  11   6  13   6  13   4   9   4]
 [  2  20 197   0   1  11   8   3   1   4   1]
 [  1   4   1 223   0   0   2   0   0   0   0]
 [  1   1   1   0 215   3   1   4   1   0   0]
 [  4  11  15   5  11 181   4  10  16   3   1]
 [ 16  10   5  19   1   4 187   5   1   1   0]
 [  2   4   3   2   6   4   2 213   1   3   1]
 [  1  10   3   0   3   8   2   3 235   0   0]
 [  0   7   1   0   2   1   0   2   1 245   0]
 [  1   3   0   3   0   3   1   2   0   1 239]]
svmc zero one loss: 0.1527272727272727


In [70]:
svmlc_predict = svmlc.predict(X_test)
svmlc_recall = recall_score(y_test, svmlc_predict, average='macro')
svmlc_precision = precision_score(y_test, svmlc_predict, average='macro')
svmlc_f1 = f1_score(y_test, svmlc_predict, average='macro')
svmlc_conf = confusion_matrix(y_test, svmlc_predict)
svmlc_zol = zero_one_loss(y_test, svmlc_predict)
print("svmlc recal: " + str(svmlc_recall))
print("svmlc precision: " + str(svmlc_precision))
print("svmlc f1: " + str(svmlc_f1))
print("svmlc confusion matrix: \n" + str(svmlc_conf))
print("svmlc zero one loss: " + str(svmlc_zol))

svmlc recal: 0.8509185051143775
svmlc precision: 0.8504678195822618
svmlc f1: 0.8499912463562375
svmlc confusion matrix: 
[[224   7   0   1   0   1  13   3   0   0   0]
 [ 16 156  24   8   2  11   9  15   4   6   1]
 [  0  25 185   1   0   9   8   7   2   2   1]
 [  2   2   0 247   0   2   2   0   0   0   1]
 [  4   3   0   0 251   7   0   1   1   0   0]
 [  4   8   5   5  10 191   3   2  10   2   2]
 [ 17   5   7  19   0   5 193   2   1   0   2]
 [  5  13   2   0   3   5   4 217   0   6   1]
 [  0  10   2   1   3  14   3   1 210   0   0]
 [  0   5   3   1   0   5   0   1   2 234   0]
 [  0   1   1   3   0   2   1   0   0   0 234]]
svmlc zero one loss: 0.14836363636363636


In [71]:
knn_predict = knn.predict(X_test)
knn_recall = recall_score(y_test, knn_predict, average='macro')
knn_precision = precision_score(y_test, knn_predict, average='macro')
knn_f1 = f1_score(y_test, knn_predict, average='macro')
knn_conf = confusion_matrix(y_test, knn_predict)
knn_zol = zero_one_loss(y_test, knn_predict)
print("knn recal: " + str(knn_recall))
print("knn precision: " + str(knn_precision))
print("knn f1: " + str(knn_f1))
print("svmlc confusion matrix: \n" + str(knn_conf))
print("svmlc zero one loss: " + str(knn_zol))

knn recal: 0.7634691471480437
knn precision: 0.7802196882009302
knn f1: 0.7609107391421331
svmlc confusion matrix: 
[[229   5   1   3   4   1   4   1   0   0   1]
 [ 24 134  20   9  18  10   9   6   7  11   4]
 [  6  24 174   2  10   8   7   3   1   4   1]
 [ 13   5   4 227   2   0   1   2   2   0   0]
 [  6   2   1   0 254   2   0   0   2   0   0]
 [ 12  13   9   4  60 130   2   0   9   1   2]
 [ 32  11  17  31   5   8 141   3   1   0   2]
 [  6  10  14   1  27  10   5 171   0  10   2]
 [  5  12   3   5  20   9   3   1 186   0   0]
 [  1   2   6   0  11   1   0   0   1 229   0]
 [  2   0   2   2   4   3   0   1   0   0 228]]
svmlc zero one loss: 0.2352727272727273


In [72]:
knn2_predict = knn2.predict(X_test)
knn2_recall = recall_score(y_test, knn2_predict, average='macro')
knn2_precision = precision_score(y_test, knn2_predict, average='macro')
knn2_f1 = f1_score(y_test, knn2_predict, average='macro')
knn2_conf = confusion_matrix(y_test, knn2_predict)
knn2_zol = zero_one_loss(y_test, knn2_predict)
print("knn2 recal: " + str(knn2_recall))
print("knn2 precision: " + str(knn2_precision))
print("knn2 f1: " + str(knn2_f1))
print("knn2 confusion matrix: \n" + str(knn2_conf))
print("knn2 zero one loss: " + str(knn2_zol))

knn2 recal: 0.7810959710663937
knn2 precision: 0.8010217840054714
knn2 f1: 0.7754818983688608
knn2 confusion matrix: 
[[236   3   0   3   3   0   0   3   0   1   0]
 [ 26 108  25  13  28   6   9   6  13  13   5]
 [  5  15 177   3  10   7   8   3   3   8   1]
 [ 10   1   2 237   4   0   1   1   0   0   0]
 [  3   1   0   0 259   2   0   1   1   0   0]
 [  6   4   6   5  70 127   3   1  12   4   4]
 [ 32   6   9  34  12   3 151   1   1   0   2]
 [  6   5   2   1  26   5   3 193   1  12   2]
 [  4   6   4   5  20   6   2   1 195   1   0]
 [  1   0   3   0   9   0   0   1   0 237   0]
 [  0   0   1   3   3   1   0   1   1   0 232]]
knn2 zero one loss: 0.21745454545454546


In [73]:
mnnb_predict = mnnb.predict(X_test)
mnnb_recall = recall_score(y_test, mnnb_predict, average='macro')
mnnb_precision = precision_score(y_test, mnnb_predict, average='macro')
mnnb_f1 = f1_score(y_test, mnnb_predict, average='macro')
mnnb_conf = confusion_matrix(y_test, mnnb_predict)
mnnb_zol = zero_one_loss(y_test, mnnb_predict)
print("mnnb recal: " + str(mnnb_recall))
print("mnnb precision: " + str(mnnb_precision))
print("mnnb f1: " + str(mnnb_f1))
print("mnnb confusion matrix: \n" + str(mnnb_conf))
print("mnnb zero one loss: " + str(mnnb_zol))

mnnb recal: 0.7209294343471309
mnnb precision: 0.785554197645527
mnnb f1: 0.7172835633860047
mnnb confusion matrix: 
[[231   1   0   2  11   0   3   0   1   0   0]
 [ 32  64  54  10  50   5   6   7  15   9   0]
 [  7   7 181   2  23   4   3   2   3   7   1]
 [ 17   1   1 230   4   1   0   0   1   0   1]
 [  4   0   0   0 261   0   0   0   2   0   0]
 [  4   0   5   5  99 118   1   1   9   0   0]
 [ 36   3  15  35  13   3 138   1   3   0   4]
 [ 10   6   4   0  73   1   4 145   2   9   2]
 [  4   2   5   2  49   3   2   0 175   2   0]
 [  1   2   1   1  20   0   0   1   1 224   0]
 [  3   0   3   4  10   0   1   0   2   0 219]]
mnnb zero one loss: 0.27781818181818185


In [74]:
abc_predict = abc.predict(X_test)
abc_recall = recall_score(y_test, abc_predict, average='macro')
abc_precision = precision_score(y_test, abc_predict, average='macro')
abc_f1 = f1_score(y_test, abc_predict, average='macro')
abc_conf = confusion_matrix(y_test, abc_predict)
abc_zol = zero_one_loss(y_test, abc_predict)
print("abc recal: " + str(abc_recall))
print("abc precision: " + str(abc_precision))
print("abc f1: " + str(abc_f1))
print("abc confusion matrix: \n" + str(abc_conf))
print("abc zero one loss: " + str(abc_zol))

abc recal: 0.5574905902693778
abc precision: 0.5552256041942149
abc f1: 0.5370214080066794
abc confusion matrix: 
[[196   9   2   2   2   1  30   4   1   2   0]
 [ 18  26 127  11  10   8  29   3   7  10   3]
 [  1  20 171   7   4   4  11   4  12   4   2]
 [  3   8   7 209   1   1  25   0   0   1   1]
 [  3   1  13   0 194  16   9  18   9   4   0]
 [  4  14 125   7  13  18  16  12  22   7   4]
 [ 14   7  33  21   2   1 155  11   3   0   4]
 [  1  19 119   1  32   8  35  19   6  10   6]
 [  1   8  20   6  10  17   5   4 168   5   0]
 [  0   9  36   0  10   6   4  10   1 175   0]
 [  2   1  25   4   0   2   0   4   0   2 202]]
abc zero one loss: 0.4425454545454546


In [75]:
abc2_predict = abc2.predict(X_test)
abc2_recall = recall_score(y_test, abc2_predict, average='macro')
abc2_precision = precision_score(y_test, abc2_predict, average='macro')
abc2_f1 = f1_score(y_test, abc2_predict, average='macro')
abc2_conf = confusion_matrix(y_test, abc2_predict)
abc2_zol = zero_one_loss(y_test, abc2_predict)
print("abc2 recal: " + str(abc2_recall))
print("abc2 precision: " + str(abc2_precision))
print("abc2 f1: " + str(abc2_f1))
print("abc2 confusion matrix: \n" + str(abc2_conf))
print("abc2 zero one loss: " + str(abc2_zol))

abc2 recal: 0.5786762578583496
abc2 precision: 0.5979905340864783
abc2 f1: 0.5800551554649136
abc2 confusion matrix: 
[[196  10   1   8   0   1  26   4   2   1   0]
 [ 18  36  36   4  11  88  28  14   6   7   4]
 [  1  23  59   4   5 114  15   5  10   4   0]
 [  2   9  10 213   1   7  13   1   0   0   0]
 [  4   5   4   2 194  20   8  17   9   4   0]
 [  2  17  17   6  18 123  17   8  19   7   8]
 [ 17  15  18  20   2  15 148  10   3   1   2]
 [  3  22  12   0  16  74  25  79   5  14   6]
 [  1  11   6   6  11  25   4   5 171   4   0]
 [  0  11  11   0  11  30   2   9   2 175   0]
 [  2   8   4   0   0  12   5  10   0   1 200]]
abc2 zero one loss: 0.4203636363636364


In [76]:
rfc_predict = rfc.predict(X_test)
rfc_recall = recall_score(y_test, rfc_predict, average='macro')
rfc_precision = precision_score(y_test, rfc_predict, average='macro')
rfc_f1 = f1_score(y_test, rfc_predict, average='macro')
rfc_conf = confusion_matrix(y_test, rfc_predict)
rfc_zol = zero_one_loss(y_test, rfc_predict)
print("rfc recal: " + str(rfc_recall))
print("rfc precision: " + str(rfc_precision))
print("rfc f1: " + str(rfc_f1))
print("rfc confusion matrix: \n" + str(rfc_conf))
print("rfc zero one loss: " + str(rfc_zol))

rfc recal: 0.6935644529527795
rfc precision: 0.6949155294001851
rfc f1: 0.6904362568910011
rfc confusion matrix: 
[[207   6   1   8   1   0  22   2   1   1   0]
 [ 27  86  45  18   9  15  12  17  13   9   1]
 [ 11  37 141   8   4  13  11   9   3   2   1]
 [  6   2   6 232   0   2   6   0   1   0   1]
 [  4  10   3   3 217  16   2   4   3   5   0]
 [ 13  15  10   3  30 139   6   6  15   3   2]
 [ 35  14  20  31   1   5 135   8   0   0   2]
 [ 13  15   6   2  11  11  12 173   2   5   6]
 [  3  21   9   7  13  17   9   2 159   3   1]
 [  1  16   4   2  12   8   1   6   3 197   1]
 [  3   2   1   5   3   2   0   2   0   0 224]]
rfc zero one loss: 0.3054545454545454


In [37]:
rfc2_predict = rfc2.predict(X_test)
rfc2_recall = recall_score(y_test, rfc2_predict, average='macro')
rfc2_precision = precision_score(y_test, rfc2_predict, average='macro')
rfc2_f1 = f1_score(y_test, rfc2_predict, average='macro')
rfc2_conf = confusion_matrix(y_test, rfc2_predict)
rfc2_zol = zero_one_loss(y_test, rfc2_predict)
print("rfc2 recal: " + str(rfc2_recall))
print("rfc2 precision: " + str(rfc2_precision))
print("rfc2 f1: " + str(rfc2_f1))
print("rfc2 confusion matrix: \n" + str(rfc2_conf))
print("rfc2 zero one loss: " + str(rfc2_zol))

NameError: name 'rfc2' is not defined

In [41]:
KX_train, KX_test, Ky_train, Ky_test = train_test_split(X, to_categorical(y, 11))
model = models.Sequential()
model.add(layers.Dense(200, activation="relu", input_shape=(60798,)))
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(200, activation="relu"))
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(200, activation="relu"))
model.add(layers.Dense(11, activation="sigmoid"))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 200)               12159800  
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 200)               40200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_4 (Dense)              (None, 11)                2211      
Total params: 12,242,411
Trainable params: 12,242,411
Non-trainable params: 0
__________________________________________

In [42]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
results = model.fit(KX_train, Ky_train, epochs=5, batch_size=500, validation_data=(KX_test, Ky_test))
print("Deep acc: ", numpy.mean(results.history["val_accuracy"]))

Train on 8249 samples, validate on 2750 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


KeyError: 'val_acc'

In [44]:
KX_train, KX_test, Ky_train, Ky_test = train_test_split(X, to_categorical(y, 11))
model = models.Sequential()
model.add(layers.Dense(500, activation="relu", input_shape=(60798,)))
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(150, activation="relu"))
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(100, activation="relu"))
model.add(layers.Dense(11, activation="sigmoid"))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 500)               30399500  
_________________________________________________________________
dropout_3 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 150)               75150     
_________________________________________________________________
dropout_4 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 100)               15100     
_________________________________________________________________
dense_8 (Dense)              (None, 11)                1111      
Total params: 30,490,861
Trainable params: 30,490,861
Non-trainable params: 0
__________________________________________

In [45]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
results = model.fit(KX_train, Ky_train, epochs=5, batch_size=500, validation_data=(KX_test, Ky_test))
print("Deep acc: ", numpy.mean(results.history["val_accuracy"]))

Train on 8249 samples, validate on 2750 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Deep acc:  0.918439757823944
