## Project 

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# set the font size of plots
plt.rcParams['font.size'] = 14

np.set_printoptions(precision=3)

## Read corpus


In [2]:
corpus_files = [ './sentiment_data_TUN_pos.txt', './sentiment_data_TUN_neg.txt','./langid_data_ARA.txt' ]
df = pd.read_csv("QuestionnaireData_28Nov2018.csv", encoding='ansi')

In [3]:
def read_text_file(filename):
    print('Reading file ' + filename + "...")
    with open(filename, "r", encoding='utf8') as textfile:
        L = []
        for line in textfile:
            L.append(line.strip())
        print('File contains ', len(L), "lines.\n")
        return L

In [4]:
ara_corpus_pos = read_text_file(corpus_files[0])
ara_corpus_neg = read_text_file(corpus_files[1])
ara_ara_corpus = read_text_file(corpus_files[2])
other_corpus = df.iloc[:,8].tolist()

Reading file ./sentiment_data_TUN_pos.txt...
File contains  3468 lines.

Reading file ./sentiment_data_TUN_neg.txt...
File contains  4345 lines.

Reading file ./langid_data_ARA.txt...
File contains  21787 lines.



In [5]:
## Combine pos and neg corpus into a single corpus for easy manipulation

ara_corpus = ara_corpus_pos + ara_corpus_neg
ara_corpus_sentiment = len(ara_corpus_pos)*[1] + len(ara_corpus_neg)*[-1]

In [6]:
len(ara_corpus),len(ara_corpus_sentiment)

(7813, 7813)

## Text Preprocessing & Cleaning


In [7]:
##1. Convert string to emoji
dict_emot= { ':-)'  : b'\xf0\x9f\x98\x8a'.decode('utf-8'),
              ':)'   : b'\xf0\x9f\x98\x8a'.decode('utf-8'),
             '=)'   : b'\xf0\x9f\x98\x8a'.decode('utf-8'),  # Smile or happy
              ':-D'  : b'\xf0\x9f\x98\x83'.decode('utf-8'),
              ':D'   : b'\xf0\x9f\x98\x83'.decode('utf-8'),
             '=D'   : b'\xf0\x9f\x98\x83'.decode('utf-8'),  # Big smile
             '>:-(' : b'\xF0\x9F\x98\xA0'.decode('utf-8'),
              '>:-o' : b'\xF0\x9F\x98\xA0'.decode('utf-8')   # Angry face
             }
def string_to_emoji(string):
    emoji_pattern = re.compile(r"(:\-?\))|(:\-?D)|=D|=\)|(>:\-[o\(])")  
    return emoji_pattern.sub( lambda x: dict_emot[x.group()] , string)


In [8]:
import emoji

def char_is_emoji(character):
    return character in emoji.UNICODE_EMOJI

In [9]:
##1. Remove useless characters using cleanup_text function from TD2

# YOU CAN MODIFY THIS FUNCTION AS NEEDED.
# FOR EXAMPLE, REMOVE NUMBERS ...

import re
import html
import string
try:
    maketrans = ''.maketrans
except AttributeError:
    # fallback for Python 2
    from string import maketrans

# regexp for word elongation: matches 3 or more repetitions of a word character.
two_plus_letters_RE = re.compile(r"(\w)\1{1,}", re.DOTALL)
three_plus_letters_RE = re.compile(r"(\w)\1{2,}", re.DOTALL)
# regexp for repeated words
two_plus_words_RE = re.compile(r"(\w+\s+)\1{1,}", re.DOTALL)


def cleanup_text(text):
    
    text =string_to_emoji(text)
    
    # Remove URLs
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', text)

    # Remove user mentions of the form @username
    text = re.sub('@[^\s]+', '', text)
    
    # Replace special html-encoded characters with their ASCII equivalent, for example: &#39 ==> '
    if re.search("&#",text):
        text = html.unescape(text)

    # Remove special useless characters such as _x000D_
    text = re.sub(r'_[xX]000[dD]_', '', text)

    # Replace all non-word characters (such as  punctuation, end of line characters, etc.) with a space
    text = text.translate(maketrans("","", string.punctuation))

    # Remove redundant white spaces
    text = text.strip()
    text = re.sub('[\s]+', ' ', text)

    # normalize word elongations (characters repeated more than twice)
    text = two_plus_letters_RE.sub(r"\1\1", text)

    # remove repeated words
    text = two_plus_words_RE.sub(r"\1", text)

    return text

# unit test of this function
print(ara_corpus[0])
print(cleanup_text(ara_corpus[0]))
print(ara_corpus[6])
print(cleanup_text(ara_corpus[6]))
print(ara_corpus[19])
print(cleanup_text(ara_corpus[19]))

تبارك الله عليه
تبارك الله عليه
نور شيبة فنان يعجبني صوتو حلوا و مزيان :)
نور شيبة فنان يعجبني صوتو حلوا و مزيان 😊
♥♥♥
♥♥♥


In [10]:
# Apply this function to each document in the corpus
ara_corpus_clean = []

for doc in ara_corpus:
    ara_corpus_clean.append(cleanup_text(doc))
    


In [11]:
assert(len(ara_corpus_clean)==len(ara_corpus))
print(ara_corpus[:10])
print(ara_corpus_clean[:10])

['تبارك الله عليه', 'امرأة أنيقة و سلسة في خطابها و ذكية و متواضعة', 'جعفر يحب بلادو ويعمل في الخير يعاون في الناس والحكومة لا علاقة تسرق في الشعب وتشري في الكوش للباجي', 'والله بكتني برافووووو واصل', 'محلاها', 'والله احسن ما صار في قناة الحوار التونسي المنشط هاذا', 'نور شيبة فنان يعجبني صوتو حلوا و مزيان :)', 'احسن ممثل في ادوار الكول', 'الرجل المناسب في المكان المناسب', 'عسل']
['تبارك الله عليه', 'امرأة أنيقة و سلسة في خطابها و ذكية و متواضعة', 'جعفر يحب بلادو ويعمل في الخير يعاون في الناس والحكومة لا علاقة تسرق في الشعب وتشري في الكوش للباجي', 'والله بكتني برافوو واصل', 'محلاها', 'والله احسن ما صار في قناة الحوار التونسي المنشط هاذا', 'نور شيبة فنان يعجبني صوتو حلوا و مزيان 😊', 'احسن ممثل في ادوار الكول', 'الرجل المناسب في المكان المناسب', 'عسل']


In [12]:

MAX_LAT_FRAC = 0.3
ara_corpus_clean = [doc for doc in ara_corpus_clean if(len(doc) >0) if (len(re.findall('[a-zA-Z]',doc)) / len(doc)) < MAX_LAT_FRAC]

  

In [13]:
assert(len(ara_corpus_clean)<=len(ara_corpus))
print(len(ara_corpus),len(ara_corpus_clean))


7813 7786


In [14]:
def normalizeArabic(corpus):
    corpus = re.sub("ة", "ت", corpus)
    corpus = re.sub("ض", "ظ", corpus) 
    corpus = re.sub("ى", "ي", corpus)
    corpus = re.sub("ؤ", "ء", corpus)
    corpus = re.sub("ئ", "ء", corpus)
    corpus = re.sub("[إأٱآا]", "ا", corpus)
    return(corpus)

ara_corpus_clean =  [normalizeArabic(doc) for doc in ara_corpus_clean]

In [15]:

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('[^_\s]+')
ara_corpus_tokenized = [tokenizer.tokenize(doc) for doc in ara_corpus_clean]


In [16]:

assert(len(ara_corpus_clean) == len(ara_corpus_tokenized))
assert(type(ara_corpus_tokenized[0]) == list and type(ara_corpus_tokenized[0][0]) ==str)

In [17]:
# verify first document in corpus
ara_corpus_tokenized[:5]

[['تبارك', 'الله', 'عليه'],
 ['امرات', 'انيقت', 'و', 'سلست', 'في', 'خطابها', 'و', 'ذكيت', 'و', 'متواظعت'],
 ['جعفر',
  'يحب',
  'بلادو',
  'ويعمل',
  'في',
  'الخير',
  'يعاون',
  'في',
  'الناس',
  'والحكومت',
  'لا',
  'علاقت',
  'تسرق',
  'في',
  'الشعب',
  'وتشري',
  'في',
  'الكوش',
  'للباجي'],
 ['والله', 'بكتني', 'برافوو', 'واصل'],
 ['محلاها']]

In [18]:
##5. Remove stop words -- based on a 'standard' list of stopwords for the Arabic language.
import nltk
nltk.download('stopwords')
# Load stop words from NLTK library
from nltk.corpus import stopwords
stop_words_ar = stopwords.words('arabic')
type(stop_words_ar),len(stop_words_ar)
set(stop_words_ar) & {'من','إلى','عن','على','في','ب','ل','ك','و'}
stop_words_ar = stop_words_ar + ['هدا','ها','بش','من','إلى','عن','على','في','ب','ل','ك','و']
type(stop_words_ar)
# For each document, remove stop words
ara_corpus_tokenized = [[word for word in doc if word not in stop_words_ar] for doc in ara_corpus_tokenized]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nasri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
##6. Stemming
import argparse
from nltk.stem.isri import ISRIStemmer

def light_stem(words):
    
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics which representing Arabic short vowels
        if not word in stemmer.stop_words:    # exclude stop words from being processed
            word = stemmer.pre32(word)        # remove length three and length two prefixes in this order
            word = stemmer.suf32(word)        # remove length three and length two suffixes in this order
            word = stemmer.waw(word)          # remove connective ‘و’ if it precedes a word beginning with ‘و’
            word = stemmer.norm(word, num=2)  # normalize initial hamza to bare alif
        result.append(word)
    return ' '.join(result)

act=ara_corpus_tokenized
ara_corpus_clean =  [light_stem(words) for words in ara_corpus_tokenized ]
ara_corpus_tokenized =  [tokenizer.tokenize(doc) for doc in ara_corpus_clean]

In [20]:
print(act[:6])
print(ara_corpus_tokenized[:6])

[['تبارك', 'الله'], ['امرات', 'انيقت', 'سلست', 'خطابها', 'ذكيت', 'متواظعت'], ['جعفر', 'يحب', 'بلادو', 'ويعمل', 'الخير', 'يعاون', 'الناس', 'والحكومت', 'علاقت', 'تسرق', 'الشعب', 'وتشري', 'الكوش', 'للباجي'], ['والله', 'بكتني', 'برافوو', 'واصل'], ['محلاها'], ['والله', 'احسن', 'صار', 'قنات', 'الحوار', 'التونسي', 'المنشط', 'هاذا']]
[['تبارك', 'الله'], ['امر', 'انيقت', 'سلست', 'خطاب', 'ذكيت', 'متواظعت'], ['جعفر', 'يحب', 'بلادو', 'ويعمل', 'خير', 'يعا', 'ناس', 'حكومت', 'علاقت', 'تسرق', 'شعب', 'وتشري', 'كوش', 'باجي'], ['والله', 'بكت', 'برافوو', 'واصل'], ['محلا'], ['والله', 'احسن', 'صار', 'قنات', 'حوار', 'تونسي', 'منشط', 'هاذا']]


In [21]:
##7. Remove words that are too short or too long.

distinct_words = {word for doc in ara_corpus_tokenized for word in doc}
type(distinct_words),len(distinct_words)

words_len = pd.Series([len(word) for word in distinct_words])
words_len.describe()


count    14243.000000
mean         4.882398
std          1.852973
min          1.000000
25%          4.000000
50%          5.000000
75%          6.000000
max         58.000000
dtype: float64

In [22]:
ara_corpus_tokenized = [[word for word in doc if len(word)>=4 or char_is_emoji(word)] for doc in ara_corpus_tokenized]
ara_corpus_tokenized = [[word for word in doc if len(word)<=12] for doc in ara_corpus_tokenized]
type(ara_corpus_tokenized),len(ara_corpus_tokenized)

(list, 7786)

In [23]:
print(ara_corpus_tokenized[:10])

[['تبارك', 'الله'], ['انيقت', 'سلست', 'خطاب', 'ذكيت', 'متواظعت'], ['جعفر', 'بلادو', 'ويعمل', 'حكومت', 'علاقت', 'تسرق', 'وتشري', 'باجي'], ['والله', 'برافوو', 'واصل'], ['محلا'], ['والله', 'احسن', 'قنات', 'حوار', 'تونسي', 'منشط', 'هاذا'], ['شيبت', 'فنان', 'يعجب', 'صوتو', 'حلوا', '😊'], ['احسن', 'ممثل', 'ادوار'], ['مناسب', 'مكان', 'مناسب'], []]


## Construct a  language classifier


### Construct a language classifier using supervised learning

In [24]:
tun_corpus_clean = [cleanup_text(doc)   for doc in ara_corpus]
ara_corpus_clean = [cleanup_text(doc)   for doc in ara_ara_corpus]

# Step 1   COMPLETE THE CODE BELOW

from sklearn.model_selection import train_test_split
#?train_test_split

tun_corpus_clean_train, tun_corpus_clean_test = train_test_split(tun_corpus_clean,test_size=0.3 )

ara_corpus_clean_train, ara_corpus_clean_test = train_test_split(ara_corpus_clean,test_size=0.3 )



In [25]:
# Step 1   create 2 data frames called train_df and test_df (as explained above)

## COMPLETE THE CODE BELOW

# create data frame
train_df = pd.DataFrame({'document':[], 'language':[]})

# fill the language column
train_df.language = pd.Series(['TUN']*len(tun_corpus_clean_train) + ['ARA']*len(tun_corpus_clean_train) )


# fill the document column -- CONCATENATE the TUN CORPUS and ARA CORPUS
train_df.document = pd.Series(tun_corpus_clean_train + ara_corpus_clean_train )

test_df = pd.DataFrame({'document':[], 'language':[]})
test_df.language = pd.Series(['TUN']*len(tun_corpus_clean_test) + ['ARA']*len(ara_corpus_clean_test) )
test_df.document = pd.Series(tun_corpus_clean_test + ara_corpus_clean_test   )


In [27]:
# Step 2  Convert the training documents into numeric feature vectors using the BOW-tfidf method with character ngrams.
from sklearn.feature_extraction.text import TfidfVectorizer
## COMPLETE THE CODE BELOW

n = 3   # hyperparameter for of character ngrams ; you can change it if you want but n=3 is a reaonable value ...

# Create an instance of TfidfVectorizer class with analyzer = 'char' so that it generates bag of characters and not bag of words
bow_model_char = TfidfVectorizer(analyzer='char', ngram_range=(1,n), max_df =0.9, min_df=0.1)

# Call fit method with the combined training corpus
bow_model_char.fit(train_df.document)

# Create DTM matrix of the combined training corpus and test corpus
dtm_Train=bow_model_char.transform(train_df.document)
dtm_Test=bow_model_char.transform(test_df.document)


In [28]:
# Step 3   -- see official documentation of MultinomialNB in scikit-learn

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

nb_model = MultinomialNB()

nb_model.fit(dtm_Train,train_df.language)

print(nb_model.predict(dtm_Test[0]),test_df.language[0])

print("Confusion matrix : ",confusion_matrix(test_df.language,nb_model.predict(dtm_Test)))

['TUN'] TUN
Confusion matrix :  [[5997  540]
 [ 269 2075]]


In [29]:
def predict(doc):
    dtm=bow_model_char.transform(pd.Series(doc))
    return nb_model.predict(dtm)

In [30]:
def splt(x):
    u = str(x).split(':')
    return [u[0][1:],float(u[1][0:-2])]

In [35]:
import langdetect
from langdetect.lang_detect_exception import LangDetectException



In [36]:
def lang_detect(doc):
    lg=splt(langdetect.detect_langs(doc))
    if lg[0]=='ar' :
        Lang=predict(doc)
    else:
        Lang='[\'OTHER\']'
    print(Lang)   

In [37]:
lang_detect(other_corpus[0])
lang_detect(ara_corpus_clean_test[0])
lang_detect(tun_corpus_clean_test[0])

['OTHER']
['ARA']
['TUN']


## Document Representation


#### Prepare the corpus for BOW

In [38]:
# First, concatenate the words in the cleaned corpus (because BOW method in scikit-learn requires this format)
ara_corpus_bow = [' '.join(doc) for doc in ara_corpus_tokenized]

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Configuration parameters of the BOW model
# FEEL FREE TO MODIFY THESE PARAMETERS AS NEEDED ...
max_words = 10000
maxdf = 0.7
mindf = 5
# create an instance of this class
bow_model = TfidfVectorizer(max_df=maxdf, min_df=mindf, max_features=max_words, stop_words=[], use_idf = True)
# call fit() method in order to prepare BOW method (determine vocabulary and IDF values)
bow_model.fit( ara_corpus_bow )
# Call the transform method in order to calculate DTM matrix of our corpus
ara_bow_dtm = bow_model.transform(ara_corpus_bow)

#### Remove documents that do not contain any vocabulary terms


In [40]:
nb_terms_per_doc = np.array((ara_bow_dtm>0).sum(axis=1))  # calculate sum of rows of DTM matrix
nb_terms_per_doc = nb_terms_per_doc.ravel()  # convert result to a 1D array (instead of 2D array)
idx = nb_terms_per_doc>0
ara_bow_dtm_filt = ara_bow_dtm[nb_terms_per_doc>0,:]
ara_corpus_bow_filt = [ara_corpus_bow[i] for i,x in enumerate(idx) if x]
ara_corpus_sentiment_filt = [ara_corpus_sentiment[i] for i,x in enumerate(idx) if x]

## Build Sentiment Classifier


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [42]:
X = ara_bow_dtm_filt
y = ara_corpus_sentiment_filt
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1996)

### Train classifier using Naive Bayes

In [43]:
## Build Naive Bayes classification model

NB_model = MultinomialNB(alpha = 1.0)
NB_model.fit(X_train, y_train)
# Use this model to predict the sentiment category of test documents
y_pred_NB = NB_model.predict(X_test)
# Classification rate
metrics.accuracy_score(y_test, y_pred_NB)
# Confusion matrix
metrics.confusion_matrix(y_test, y_pred_NB)

array([[1076,   83],
       [  84,  870]], dtype=int64)

In [44]:
def classefy(doc):
    ara_corpus_bow=[]
    ara_corpus_bow.append(doc)
    ara_bow_dtm = bow_model.transform(ara_corpus_bow)
    nb_terms_per_doc = np.array((ara_bow_dtm>0).sum(axis=1))  # calculate sum of rows of DTM matrix
    nb_terms_per_doc = nb_terms_per_doc.ravel()  # convert result to a 1D array (instead of 2D array)
    idx = nb_terms_per_doc>0
    ara_bow_dtm_filt = ara_bow_dtm[nb_terms_per_doc>0,:]
    ara_corpus_bow_filt = [ara_corpus_bow[i] for i,x in enumerate(idx) if x]
    if len(ara_corpus_bow_filt)==0:
        print('0')
    else:
        y=(NB_model.predict(ara_bow_dtm_filt))
        print(y)
    

In [46]:
classefy(other_corpus[0])

0


In [47]:
classefy(ara_corpus_neg [0])

[-1]


In [49]:
classefy(ara_corpus_pos [0])

[1]


### Train classifier using logistic regression

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [51]:
# Train the model using Logistic Regression method
LR_model = LogisticRegression(penalty='l2')
LR_model.fit(X_train, y_train)
# Use this model to predict the sentiment category of test documents
y_pred_LR = LR_model.predict(X_test)
# Calculate the classification rate of this classifier
metrics.accuracy_score(y_test, y_pred_LR)
# Display the confusion matrix
print(metrics.confusion_matrix(y_test, y_pred_LR))

[[1097   62]
 [ 117  837]]


NB method seems better