## **Xing Yi Chan**
## **R00183768**


In [2]:
# install farasa
!pip install farasapy

Collecting farasapy
  Downloading https://files.pythonhosted.org/packages/c9/32/3647a6763dbd2cb4d5777a9a7b0f8443daa2924277518d7a9700617e82c4/farasapy-0.0.5-py3-none-any.whl
Installing collected packages: farasapy
Successfully installed farasapy-0.0.5


In [3]:
# import necessary libraries
import re
import pandas as pd
from io import StringIO
import nltk
from nltk.corpus import stopwords # import stopwords
from farasa.segmenter import FarasaSegmenter  # import farasa tokenizer / segmenter
from farasa.diacratizer import FarasaDiacritizer # import farasa diacritizer
from farasa.stemmer import FarasaStemmer # import farasa stemmer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

## **Text preprocessing**
Below is the list of text preprocessing methods that will be used to preprocess the data.
- Tokenization / segmentation
- Removing diacritics
- Removing punctuation
- Removing stopwords
- Stemming

In [4]:
# import training and testing data
training_data = pd.read_csv('/content/drive/My Drive/NLP/dataset/MADAR-Corpus-26-train.tsv', sep= '\t', 
                            names=['Sentence', 'City'])
testing_data = pd.read_csv('/content/drive/My Drive/NLP/dataset/MADAR-Corpus-26-dev.tsv', sep= '\t', 
                           names=['Sentence', 'City'])

# seperate training and testing data and target values
trainX = training_data['Sentence']
trainY = training_data['City']
testX = testing_data['Sentence']
testY = testing_data['City']

# perform lable encoding on both training and testing target values
enc = LabelEncoder()
trainY = enc.fit_transform(trainY)
testY = enc.fit_transform(testY)

In [82]:
# segmentation / tokenization
def tokenization(train, test):
    segmenter = FarasaSegmenter()
    train_output = train.apply(lambda x: segmenter.segment(x))
    test_output = test.apply(lambda x: segmenter.segment(x))
    print()
    print('Completed segmentation......')

    return train_output, test_output

trainX, testX = tokenization(trainX, testX)

perform system check...
check java version...
Your java version is 11.0 which is compatiple with Farasa 
check toolkit binaries...
Dependencies seem to be satisfied..
task [SEGMENT] is initialized in [34mSTANDALONE [37mmode...
Completed segmentation......


In [83]:
# remove diacritics
def remove_diacritics(train, test):
    diacritizer = FarasaDiacritizer()
    train_output = train.apply(lambda x: diacritizer.diacritize(x))
    test_output = test.apply(lambda x: diacritizer.diacritize(x))
    print()
    print('Completed diacritization......')

    return train_output, test_output

trainX, testX = remove_diacritics(trainX, testX)

perform system check...
check java version...
Your java version is 11.0 which is compatiple with Farasa 
check toolkit binaries...
Dependencies seem to be satisfied..
task [DIACRITIZE] is initialized in [34mSTANDALONE [37mmode...
Completed diacritization......


In [86]:
# remove punctuation
def remove_punctuation(text):
    output = re.sub(r'[^\w\s]','', text) # common punctuations
    output = re.sub('[،؛؟.]','', text)  # arabic punctuations  
    return output

for i in range(len(trainX)):
    trainX[i] = remove_punctuation(trainX[i])

for j in range(len(testX)):
    testX[j] = remove_punctuation(testX[j])

print('Completed punctuation removal......')

Completed punctuation removal......


In [87]:
# remove stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = stopwords.words('arabic')

def remove_stopwords(text):
    words = [w for w in text if w not in stopwords]
    return "".join(words)

for i in range(len(trainX)):
    trainX[i] = remove_stopwords(trainX[i])

for j in range(len(testX)):
    testX[j] = remove_stopwords(testX[j])

print('Completed stopwords removal......')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Completed stopwords removal......


In [88]:
# stemming
def stemming(train, test):
    stemmer = FarasaStemmer()
    train_output = train.apply(lambda x: stemmer.stem(x))
    test_output = test.apply(lambda x: stemmer.stem(x))
    print()
    print('Completed stemming...')

    return train_output, test_output

trainX, testX = stemming(trainX, testX)

perform system check...
check java version...
Your java version is 11.0 which is compatiple with Farasa 
check toolkit binaries...
Dependencies seem to be satisfied..
task [STEM] is initialized in [34mSTANDALONE [37mmode...
Completed stemming...


## **Create tf-idf vector**

In [5]:
# create tf-idf vectors
vectorizer = TfidfVectorizer(max_features=1500, analyzer='word', ngram_range=(1, 2))
trainX_tfidf = vectorizer.fit_transform(trainX)
testX_tfidf = vectorizer.fit_transform(testX)

print('Completed vectorization......')

Completed vectorization......


## **System implementation**
Four different machine learning classification will be used to classify the data. The models are as below.
- Naive Bayes Classifier
- Support Vector Machine Classifier
- k-Nearest Neighbour Classifier
- Random Forest Classifier

In [7]:
# naive bayes classifier
nb = MultinomialNB()
nb.fit(trainX_tfidf, trainY)
print('Accuracy: ', nb.score(testX_tfidf, testY)*100)

y_pred = nb.predict(testX_tfidf)

Accuracy: 71.75


In [125]:
# svm classifier
svm = SVC(kernel='linear')
svm.fit(trainX_tfidf, trainY)
print('Accuracy: ', svm.score(testX_tfidf, testY)*100)

Accuracy: 68.03


In [126]:
# k-nn classifier
knn = KNeighborsClassifier()
knn.fit(trainX_tfidf, trainY)
print('Accuracy: ', knn.score(testX_tfidf, testY)*100)

Accuracy: 61.52


In [127]:
# random forest classifier
rf = RandomForestClassifier()
rf.fit(trainX_tfidf, trainY)
print('Accuracy: ', rf.score(testX_tfidf, testY)*100)

Accuracy: 68.75


In [131]:
# Write the test label and predicted output into seperate files
pd.DataFrame(testY).to_csv('/content/drive/My Drive/NLP/dataset/Ass1.GOLD')
pd.DataFrame(y_pred).to_csv('/content/drive/My Drive/NLP/dataset/Ass1.PRED')

print('Completed writing into file......')

Completed writing into file......
