In [1]:
import pandas as pd
import numpy as np

In [2]:
from matplotlib import pyplot as plt
%matplotlib inline

# set the font size of plots
plt.rcParams['font.size'] = 14

In [3]:
np.set_printoptions(precision=3)

# Read raw data

In [4]:
corpus_files = ['./langid_data_TUN-AR.txt', './langid_data_ARA.txt']

In [5]:
def read_text_file(filename):
    print('Reading file ' + filename + "...")
    with open(filename, "r", encoding='utf8') as textfile:
        L = []
        for line in textfile:
            L.append(line.strip())
        print('File contains ', len(L), "lines.\n")
        return L

In [6]:
tun_corpus = read_text_file(corpus_files[0])
ara_corpus = read_text_file(corpus_files[1])

Reading file ./langid_data_TUN-AR.txt...
File contains  13932 lines.

Reading file ./langid_data_ARA.txt...
File contains  21787 lines.



## Text Cleaning
To clean our corpus we opted to:

- Remove non-word symbols (punctuation, math symbols, emoticons, URLs, hashtags, etc.).
- Remove diactritic
- Remove documents that contain a large fraction of latin characters (some documents contain english or french words).
- Remove very short documents.

In [7]:
import re
import string
import unicodedata as ud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

stop_words_ar = stopwords.words('arabic')

# regexp for word elongation: matches 3 or more repetitions of a word character.
two_plus_letters_RE = re.compile(r"(\w)\1{1,}", re.DOTALL)
three_plus_letters_RE = re.compile(r"(\w)\1{2,}", re.DOTALL)

# regexp for repeated words
two_plus_words_RE = re.compile(r"(\w+\s+)\1{1,}", re.DOTALL)
#arabic ponctuation
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

def cleanup_text(text):
   
    # Remove URLs
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', text)

    # Remove user mentions of the form @username
    text = re.sub('@[^\s]+', '', text)
    
    # Remove special useless characters such as _x000D_
    text = re.sub(r'_[xX]000[dD]_', '', text)

    # Remove redundant white spaces
    text = text.strip()
    text = re.sub('[\s]+', ' ', text)

    # normalize word elongations (characters repeated more than twice)
    text = two_plus_letters_RE.sub(r"\1\1", text)

    # remove repeated words
    text = two_plus_words_RE.sub(r"\1", text)

    #remove repeating char
    re.sub(r'(.)\1+', r'\1', text)
    
    #remove diactritic
    arabic_diacritics = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                             """, re.VERBOSE)
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    text = re.sub(arabic_diacritics, '', text)
    
    #remove ponctuation
    translator = str.maketrans('', '', punctuations_list)
    text=text.translate(translator)
    
    return text

# unit test of this function
cleanup_text("أهلاً بالعالم في هذه التجربة ! علامات ،الترقيم ؟ ,? لا .اتذكرها  أكثر")

'اهلا بالعالم في هذه التجربه  علامات الترقيم   لا اتذكرها اكثر'

In [8]:
#clean the corpus using the cleanup_text function 

tun_corpus_clean = [cleanup_text(document) for document in tun_corpus]
ara_corpus_clean = [cleanup_text(document) for document in ara_corpus]
len(tun_corpus),len(ara_corpus),len(tun_corpus_clean),len(ara_corpus_clean)


(13932, 21787, 13932, 21787)

In [9]:
# Remove documents that contain a large fraction of latin characters
tun_corpus_clean_2 = []
for document in tun_corpus_clean:
    if(len(document) > 10 ):
        if(len(re.findall('[a-zA-Z]',document))/len(document) < 0.3 ):
            tun_corpus_clean_2.append(document)
            
ara_corpus_clean_2 = []
for document in ara_corpus_clean:
    if(len(document) > 10 ):
        if(len(re.findall('[a-zA-Z]',document))/len(document) < 0.3 ):
            ara_corpus_clean_2.append(document) 

In [10]:
#Remove very short documents.
tun_corpus_clean_3 = []
for document in tun_corpus_clean_2:
    if(len(document) > 10 ):
        tun_corpus_clean_3.append(document)
        
ara_corpus_clean_3 = []
for document in ara_corpus_clean_2:
    if(len(document) > 10):
        ara_corpus_clean_3.append(document) 

## Construct a dictionary-based language classifier
- **Step 1**: Divide each corpus into a training corpus (70%) and test corpus (30%).
- **Step 2**: learn a set of typical words (also called stop words) of **every language** (TUN and ARA) based on its training corpus.
- **Step 3**: create a language identification algorithm that takes the list of typical words of each language and a new document as input; and returns the language of this document as output.
- **Step 4**: Evaluate the performance of this algorithm based on the test corpus -- calculate classification accuracy, precision, recall, F1, and confusion matrix.

In [11]:
# Divide each corpus into a training corpus and test corpus. 

from sklearn.model_selection import train_test_split
#?train_test_split

tun_corpus_clean_train, tun_corpus_clean_test = train_test_split(tun_corpus_clean_3, test_size=0.20, random_state=42 )

ara_corpus_clean_train, ara_corpus_clean_test = train_test_split( ara_corpus_clean_3, test_size=0.20, random_state=42 )


In [12]:
# learn a set of typical words (also called stop words) of every language.

from sklearn.feature_extraction.text import TfidfVectorizer
P1 = 100  ## configuration hyperparameter; you can modify it if you want; see instructions below.
P2 = 500

## Find typical words of the TUN language

# create TfidfVectorizer instance with maxdf = 1.0 so that the most frequent words of the corpus are NOT thrown away

bow_model_tun = TfidfVectorizer (max_df = 1.0, min_df = 0.01)
bow_model_ara = TfidfVectorizer (max_df = 1.0, min_df = 0.01)
# call fit() method with our TUN corpus; this will create the vocabulary of the corpus ...

bow_model_tun.fit( tun_corpus_clean_train )
bow_model_ara.fit( ara_corpus_clean_train )
# select P words from this vocabulary that have the SMALLEST IDF values -- See the source code in TD1 for help ...

print(len(bow_model_tun.get_feature_names()),len(bow_model_ara.get_feature_names()))

typical_words_tun=pd.DataFrame(dict(Word=bow_model_tun.get_feature_names(),IDF=bow_model_tun.idf_)).sort_values("IDF", inplace=False, ascending = True)['Word'].head(P1)

typical_words_ara=pd.DataFrame(dict(Word=bow_model_ara.get_feature_names(),IDF=bow_model_ara.idf_)).sort_values("IDF", inplace=False, ascending = True)['Word'].head(P1)

95 775


In [13]:
# Step 3 -- write the algorithm for dictionary-based language identification. 
#    This algorithm selects the language that has the highest number of typical words in the input document.

## COMPLETE THE CODE BELOW
n = 5000
random_indices = np.random.choice(np.arange(len(tun_corpus_clean_3)), n, replace=False)
small_corpus = [tun_corpus_clean[i] for i in random_indices]
def dict_langid(typical_words,document):
    words=[]
    for word in typical_words:
        if word in document and not word in words:
            words.append(word)
    return len(words)/len(typical_words)


fraction_tun=0
fraction_ara=0

# for each document in the test combined test corpus, call dict_langid with typical_words_tun and then with typical_words_tun
for document in small_corpus:
    fraction_tun+=dict_langid(typical_words_tun,document)
    fraction_tun=fraction_tun/2
    fraction_ara+=dict_langid(typical_words_ara,document)
    fraction_ara=fraction_ara/2
    
print(fraction_tun)
print(fraction_ara)


0.06387400976423384
0.02760641439282796


In [14]:
# Evaluate the performance of this algorithm based on the test corpus
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

def predict_language(document):
    fraction_tun=dict_langid(typical_words_tun,document)
    fraction_ara=dict_langid(typical_words_ara,document)
    if fraction_tun > fraction_ara:
        return 'TUN'
    else:
        return 'ARA'

predictions_tun=[]
y_true1=[]
for document in tun_corpus_clean_test:
    predictions_tun.append(predict_language(document))
for doc in tun_corpus_clean_test:
    y_true1.append("TUN")
    

predictions_ara=[]
y_true2=[]
for document in ara_corpus_clean_test:
    predictions_ara.append(predict_language(document))
for doc in ara_corpus_clean_test:
    y_true2.append("ARA")
print(accuracy_score(y_true1,predictions_tun),accuracy_score(y_true2,predictions_ara))


0.8706860706860707 0.8911100658513641


## Construct a language classifier using supervised learning
- **Step 0**: Divide each corpus into a training corpus (70%) and test corpus (30%).
- **Step 1**: Create a data frame called ``train_df`` that has two columns: 'document' and 'language'. The 'document' column contains the two corpora concatenated together. The values in the '' column should be 'TUN' and 'ARA'.  Repeat the same thing for the ``test_df``.
- **Step 2**: Convert the training documents into numeric feature vectors using the BOW-tfidf method with **character ngrams**.
- **Step 3**: Create a language classifier using Naive Bayes method (tfidf version).
- **Step 4**: Evaluate performance of this classifier based on the test corpus -- calculate classification accuracy, precision, recall, F1, and confusion matrix.

In [15]:
from sklearn.model_selection import train_test_split
#?train_test_split

tun_corpus_clean_train, tun_corpus_clean_test = train_test_split(tun_corpus_clean_3, test_size=0.30, random_state=42 )

ara_corpus_clean_train, ara_corpus_clean_test = train_test_split(ara_corpus_clean_3, test_size=0.30, random_state=42 )

In [16]:
#create 2 data frames called train_df and test_df

# create data frame
train_df = pd.DataFrame({'document':[], 'language':[]})
test_df = pd.DataFrame({'document':[], 'language':[]})

# fill the language column
train_df.language = pd.Series(['TUN']*len(tun_corpus_clean_train) + ['ARA']*len(ara_corpus_clean_train))
test_df.language = pd.Series(['TUN']*len(tun_corpus_clean_test) + ['ARA']*len(ara_corpus_clean_test))

# fill the document column -- CONCATENATE the TUN CORPUS and ARA CORPUS
train_df.document = pd.Series(tun_corpus_clean_train + ara_corpus_clean_train)
test_df.document = pd.Series(tun_corpus_clean_test + ara_corpus_clean_test)

train_df.head()

Unnamed: 0,document,language
0,عبد الحميد الشيب و العيب كرمك الله اه ها الشيخ...,TUN
1,والله عندك حق,TUN
2,هزابي بدلو شواي﻿,TUN
3,برشه مساطه واسقوطيه وحط في بالك رانا نتفرجو عا...,TUN
4,حجله واش دورها في هاد المسلسل نشوفها نتعقد رهج...,TUN


In [17]:
#Convert the training documents into numeric feature vectors using the BOW-tfidf method with character ngrams.
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import pickle

n = 4   # hyperparameter for of character ngrams

#Term Frequency-Inverse Document Frequency (TF-IDF)

#the vector
# Create an instance of TfidfVectorizer class with analyzer = 'char' so that it generates bag of characters and not bag of words
bow_model_char = TfidfVectorizer(analyzer='char', ngram_range=(1,n), max_df = 0.9, min_df = 0.01)

# Call fit method with the combined training corpus
bow_model_char.fit(train_df.document)

#save the bow model
pickle.dump(bow_model_char, open('bow_model.sav', 'wb'))

# Create DTM(document term matrix) matrix of the combined training corpus and test corpus
train_dtm = bow_model_char.transform(train_df.document)
test_dtm = bow_model_char.transform(test_df.document)


In [18]:
from sklearn.naive_bayes import MultinomialNB

#instance of multinomial nb model
nb_model = MultinomialNB()

nb_model.fit(train_dtm,train_df.language)

#unit test of the model
test=[]
test.append("يخي شبيك تحكي هكا")
test1=bow_model_char.transform(test)
print(nb_model.predict(test1)[0])
y_pred = nb_model.predict(test_dtm)


TUN


In [19]:
# Step 4   Use the same source code as in Step 4 of the previous part.
print("Accuracy score : ",accuracy_score(test_df.language, y_pred))

Accuracy score :  0.9485227841762643


In [20]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import pickle
filename = 'lang_model.sav'
pickle.dump(nb_model, open(filename, 'wb'))

In [21]:
loaded_model = pickle.load(open('lang_model.sav', 'rb'))

In [22]:
#test new instance
def predict_lang(text):
    inst=[]
    inst.append(text)
    text_vect=bow_model_char.transform(inst)
    prob=loaded_model.predict_proba(text_vect)[0]
    x=prob[0]
    print(x)
    if x <0.8 and x>0.2:
        return("OTHER")
    else:
        return nb_model.predict(text_vect)[0]
    
predict_lang("oaded_model hqjdhkqdjh zajdhazdlkamdk jqdqjdmaze لحميد العيب ")


0.3351527058028016


'OTHER'