In [96]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import tree
from sklearn.preprocessing import LabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
import pandas as pd

# Reading Dataset 

In [97]:
Corpus = pd.read_csv(r"aji-Arabic_corpus.csv")

Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text'],Corpus['targe'],test_size=0.2)

# convert into Tfidf Vectorizer

In [98]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

# Naive Bayes Classifier

In [99]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)


Naive Bayes Accuracy Score ->  97.33333333333334


In [100]:
target_names = ['Art', 'Economic', 'Politics', 'Science', 'Sport']
report=classification_report(predictions_NB, Test_Y, target_names=target_names, digits=4)
print(report)

              precision    recall  f1-score   support

         Art     0.9630    0.9811    0.9720        53
    Economic     0.9310    0.9818    0.9558        55
    Politics     0.9833    0.9077    0.9440        65
     Science     1.0000    1.0000    1.0000        58
       Sport     0.9857    1.0000    0.9928        69

    accuracy                         0.9733       300
   macro avg     0.9726    0.9741    0.9729       300
weighted avg     0.9739    0.9733    0.9731       300



# Rocchio classification

In [101]:
model = NearestCentroid()
model.fit(Train_X_Tfidf, Train_Y)
predictions_RC = model.predict(Test_X_Tfidf)

In [102]:
target_names = ['Art', 'Economic', 'Politics', 'Science', 'Sport']
report=classification_report(predictions_RC, Test_Y, target_names=target_names, digits=4)
print(report)

              precision    recall  f1-score   support

         Art     0.8704    0.9216    0.8952        51
    Economic     0.6724    1.0000    0.8041        39
    Politics     1.0000    0.6897    0.8163        87
     Science     1.0000    1.0000    1.0000        58
       Sport     0.9286    1.0000    0.9630        65

    accuracy                         0.8967       300
   macro avg     0.8943    0.9222    0.8957       300
weighted avg     0.9199    0.8967    0.8954       300



# Boosting and Bagging

In [103]:
model = GradientBoostingClassifier(n_estimators=100)
model.fit(Train_X_Tfidf, Train_Y)
predictions_BB = model.predict(Test_X_Tfidf)

In [104]:
target_names = ['Art', 'Economic', 'Politics', 'Science', 'Sport']
report=classification_report(predictions_BB, Test_Y, target_names=target_names, digits=4)
print(report)

              precision    recall  f1-score   support

         Art     0.9630    0.9286    0.9455        56
    Economic     0.8793    0.8947    0.8870        57
    Politics     0.9333    0.8750    0.9032        64
     Science     0.9310    1.0000    0.9643        54
       Sport     0.9571    0.9710    0.9640        69

    accuracy                         0.9333       300
   macro avg     0.9328    0.9339    0.9328       300
weighted avg     0.9337    0.9333    0.9330       300



# Boosting is a Ensemble learning meta-algorithm 

In [105]:
model=BaggingClassifier(KNeighborsClassifier())
model.fit(Train_X_Tfidf, Train_Y)
predictions_Bag = model.predict(Test_X_Tfidf)

In [106]:
target_names = ['Art', 'Economic', 'Politics', 'Science', 'Sport']
report=classification_report(predictions_Bag, Test_Y, target_names=target_names, digits=4)
print(report)

              precision    recall  f1-score   support

         Art     0.8704    0.9592    0.9126        49
    Economic     0.8276    0.9412    0.8807        51
    Politics     0.9333    0.7671    0.8421        73
     Science     1.0000    0.9831    0.9915        59
       Sport     0.9571    0.9853    0.9710        68

    accuracy                         0.9200       300
   macro avg     0.9177    0.9272    0.9196       300
weighted avg     0.9236    0.9200    0.9188       300



# K-nearest Neighbor

In [107]:
model = KNeighborsClassifier()
model.fit(Train_X_Tfidf, Train_Y)
predictions_KNN = model.predict(Test_X_Tfidf)

In [108]:
target_names = ['Art', 'Economic', 'Politics', 'Science', 'Sport']
report=classification_report(predictions_KNN, Test_Y, target_names=target_names, digits=4)
print(report)

              precision    recall  f1-score   support

         Art     0.8889    0.9231    0.9057        52
    Economic     0.8276    0.9231    0.8727        52
    Politics     0.9000    0.7826    0.8372        69
     Science     1.0000    0.9831    0.9915        59
       Sport     0.9571    0.9853    0.9710        68

    accuracy                         0.9167       300
   macro avg     0.9147    0.9194    0.9156       300
weighted avg     0.9181    0.9167    0.9159       300



# Support Vector Machine (SVM)

In [109]:
model = LinearSVC()
model.fit(Train_X_Tfidf, Train_Y)
predictions_SVM = model.predict(Test_X_Tfidf)

In [110]:
target_names = ['Art', 'Economic', 'Politics', 'Science', 'Sport']
report=classification_report(predictions_SVM, Test_Y, target_names=target_names, digits=4)
print(report)

              precision    recall  f1-score   support

         Art     0.9630    1.0000    0.9811        52
    Economic     0.9655    0.9825    0.9739        57
    Politics     0.9667    0.9355    0.9508        62
     Science     1.0000    1.0000    1.0000        58
       Sport     1.0000    0.9859    0.9929        71

    accuracy                         0.9800       300
   macro avg     0.9790    0.9808    0.9798       300
weighted avg     0.9801    0.9800    0.9799       300



# Decision Tree

In [111]:
model = tree.DecisionTreeClassifier()
model.fit(Train_X_Tfidf, Train_Y)
predictions_DT = model.predict(Test_X_Tfidf)

In [112]:
target_names = ['Art', 'Economic', 'Politics', 'Science', 'Sport']
report=classification_report(predictions_DT, Test_Y, target_names=target_names, digits=4)
print(report)

              precision    recall  f1-score   support

         Art     0.6852    0.6852    0.6852        54
    Economic     0.7586    0.7857    0.7719        56
    Politics     0.8167    0.6712    0.7368        73
     Science     0.7414    0.9348    0.8269        46
       Sport     0.9286    0.9155    0.9220        71

    accuracy                         0.7933       300
   macro avg     0.7861    0.7985    0.7886       300
weighted avg     0.7971    0.7933    0.7917       300



# Conditional Random Field (CRF)

In [113]:
model = tree.DecisionTreeClassifier()
model.fit(Train_X_Tfidf, Train_Y)
predictions_DT = model.predict(Test_X_Tfidf)

In [114]:
target_names = ['Art', 'Economic', 'Politics', 'Science', 'Sport']
report=classification_report(predictions_DT, Test_Y, target_names=target_names, digits=4)
print(report)

              precision    recall  f1-score   support

         Art     0.7407    0.6667    0.7018        60
    Economic     0.7759    0.7500    0.7627        60
    Politics     0.8167    0.7778    0.7967        63
     Science     0.7586    0.9565    0.8462        46
       Sport     0.9286    0.9155    0.9220        71

    accuracy                         0.8100       300
   macro avg     0.8041    0.8133    0.8059       300
weighted avg     0.8109    0.8100    0.8082       300

