# Books Classification

In [31]:
import os
import re
import string
from sklearn import svm
import seaborn as sns; 
import matplotlib.pyplot as plt 
import math
import sys
import argparse
import pandas as pd
import numpy as np
from sklearn.svm import SVC
import nltk
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix,accuracy_score
from nltk.tokenize.treebank import TreebankWordDetokenizer
import unicodedata as ud

### Load Dataset

In [2]:
dataset = pd.read_excel('datasets/ChildrenBooks300.xlsx', encoding ='utf-8-sig')
dataset=dataset.dropna()
dataset=dataset.reset_index(drop=True)
dataset.head()

Unnamed: 0,Book ID,Book Name,Brif,Label
0,1.0,كتابى الأول - الأعداد,أكثر الكتب نجاح ا مع الأطفال قبل عمر المدرسة ي...,A
1,2.0,حديقتي مدخل إلى لغتي,الغرض من هذه السلسلة أن يتعلّم الطالب ظبط الكل...,A
2,3.0,من أين تأتى الأشياء ؟,لقد صمم هذا الكتاب برسوماته الرقيقة وصوره الكب...,A
3,4.0,"كتب الفراشة - سلسلة التليتبز؛ هدايا لطيفة ""كتا...",في يوم ما، وفي قرية التليتبي، تبادل التليتبز ه...,A
4,5.0,سلسلة مهارات الطالب: تدريبات في القراءة العربية,تشكل قراءة الأحرف وتعلمها واستخداماتها المختلف...,A


### Remove colums

In [3]:
dataset = dataset.drop(columns = {'Book ID', 'Book Name'})
dataset.head()

Unnamed: 0,Brif,Label
0,أكثر الكتب نجاح ا مع الأطفال قبل عمر المدرسة ي...,A
1,الغرض من هذه السلسلة أن يتعلّم الطالب ظبط الكل...,A
2,لقد صمم هذا الكتاب برسوماته الرقيقة وصوره الكب...,A
3,في يوم ما، وفي قرية التليتبي، تبادل التليتبز ه...,A
4,تشكل قراءة الأحرف وتعلمها واستخداماتها المختلف...,A


### Define features and target

In [4]:
x=dataset.iloc[:,0]
y=dataset.iloc[:,1]

## Pre-processing

In [5]:
arabic_diacritics = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)

### Normalize Arabic Word

In [6]:
for n in range(len(x)):   
    x[n] = re.sub("[إأآا]", "ا", x[n])
    x[n] = re.sub("ى", "ي", x[n])
    x[n] = re.sub("ؤ", "ء", x[n])
    x[n] = re.sub("ئ", "ء", x[n])
    x[n] = re.sub("ة", "ه", x[n])
    x[n] = re.sub("گ", "ك", x[n])
    x[n] = re.sub(" ا " ,"", x[n])

### Remove Arabic Diacritics

In [7]:
for n in range(len(x)):  
    x[n] = re.sub(arabic_diacritics, '', x[n])

### Remove Punctuation

In [8]:
for n in range(len(x)): 
    x[n] = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', x[n])

### Remove Integers

In [9]:
for n in range(len(x)): 
    x[n] = re.sub('\d+', '', x[n])

### Remove Extra whitespace

In [10]:
for n in range(len(x)): 
    x[n] = re.sub('\s+', ' ', x[n])

### Tokenization and Remove Stop Words

In [11]:
stop_words = set(stopwords.words('Arabic'))
filtered_sentence = [] 
for n in range(len(x)):
    word_tokens = nltk.word_tokenize(dataset['Brif'][n])
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
    filtered_sentenceD = TreebankWordDetokenizer().detokenize(filtered_sentence)
    for c in filtered_sentenceD:
        if ud.category(c).startswith("Po"):
            filtered_sentenceD = filtered_sentenceD.replace(c, '')
    dataset['Brif'][n] = filtered_sentenceD

### Stemming the data

In [12]:
stemmer=ISRIStemmer()
X=x.to_dict()
X=[]
for d in range(len(x)):
    b= stemmer.stem(x[d])
    b=x[d].lower()
    X.append(b)

### Vectorization

In [13]:
count_vect=CountVectorizer()
a=count_vect.fit_transform(X)
a.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 7, 1, 1],
       [1, 1, 1, ..., 7, 1, 1],
       [1, 1, 1, ..., 7, 1, 1]])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [15]:
count_vect=CountVectorizer()
X_train_counts=count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.toarray()

array([[0.00577144, 0.00550721, 0.00321862, ..., 0.01418554, 0.        ,
        0.00289029],
       [0.00583129, 0.00556433, 0.00325199, ..., 0.01433265, 0.        ,
        0.00292026],
       [0.        , 0.        , 0.00461007, ..., 0.01219089, 0.        ,
        0.0041398 ],
       ...,
       [0.        , 0.        , 0.00577546, ..., 0.01527265, 0.        ,
        0.00518631],
       [0.        , 0.        , 0.        , ..., 0.00952385, 0.        ,
        0.00970237],
       [0.        , 0.        , 0.00451539, ..., 0.01194052, 0.        ,
        0.00405478]])

In [16]:
sm=RandomOverSampler()
X_train_res, y_train_res = sm.fit_sample(X_train_tfidf, y_train)

### Perform a grid search to find the optimal hyper-parameters and kernel function

In [17]:
# possible hyper-parameters and kernel
#C_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
#gamma_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
#kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']
# accurcy
#best_score = 0
#best_params = {'C': None , 'kernel': None, 'gamma': None}
#for C in C_values:
#    for kernel in kernel_values:
#        for gamma in gamma_values:
        
            # train the model for every hyper model
#            svc = svm.SVC(C=C, kernel=kernel, gamma=gamma)
#            svc.fit(X_train_res, y_train_res)
#            score = svc.score (X_train_res, y_train_res)
        
            # rate accurcy of the prarmeters
#            if score > best_score:
#                best_score = score
#                best_params['C'] = C
#                best_params['kernel'] = kernel
#                best_params['gamma'] = gamma
#print best score
#best_score, best_params

## Buliding Model

In [18]:
clf= svm.SVC(C=0.01, kernel = 'poly', gamma=7)
clf.fit(X_train_res, y_train_res)
clf.score(X_train_res, y_train_res)

1.0

In [19]:
X_test_tfidf=count_vect.transform(X_test)
y_pred=clf.predict(X_test_tfidf)

In [20]:
cm = confusion_matrix(y_test, y_pred)
Accuracy_Score = accuracy_score(y_test, y_pred)

In [21]:
print('Accuracy  = ', Accuracy_Score * 100)

Accuracy  =  100.0


### Plotting the dataset with design boundary

In [39]:
h = .01  # step size in the mesh
X_r = reduce_dim(X)
X_train_r = reduce_dim(X_train)
X_test_r = reduce_dim(X_test)

figure = pl.figure(figsize=(15, 5))

x_min, x_max = X_r[:, 0].min() - .5, X_r[:, 0].max() + .5
y_min, y_max = X_r[:, 1].min() - .5, X_r[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))

NameError: name 'reduce_dim' is not defined