# Content
* Importing Libraries
* Constants
* Preprocessing
    * Normalizing
    * Tokenizing
    * Stemming
    * Lemmatizing
* Feature Engineering
    * Bag of Words
    * FastText Word2Vec
* Model Selection

# Importing Libraries

In [1]:
from __future__ import unicode_literals

import json
import os
import numpy as np
import re
import pandas as pd
from functools import reduce
from hazm import *
from pprint import pprint

# Feature Engineering
from sklearn import feature_extraction

# Constants

In [2]:
# Data root path
data_root = 'data'

# Dataset dataframe column names
keys = None

# News headline tags
valid_tags = None

# News agencies
news_agencies = None

# Preprocessing

### Import Dataset

In [3]:
with open(os.path.join(data_root, 'out.jsonl'), encoding='utf-8') as json_data:
    news = [json.loads(line) for line in json_data]
    news = pd.DataFrame(news)
print('Number of Datapoints: {}'.format(len(news)))

Number of Datapoints: 1000


In [4]:
keys = list(news.columns)
pd.DataFrame([keys])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,NewsAgency,_id,body,bodyHtml,date,newsCode,newsLink,newsPath,newsPathLinks,rutitr,subtitle,tags,title


Lets look at our data

In [5]:
news.head(2)

Unnamed: 0,NewsAgency,_id,body,bodyHtml,date,newsCode,newsLink,newsPath,newsPathLinks,rutitr,subtitle,tags,title
0,AsrIran,5b4f7279020eb20597f401b4,مدیرعامل سابق استقلال از عضویت در هیات مدیره ا...,"<img align=""left"" class=""news_corner_image"" s...",تاریخ انتشار: ۲۱:۲۲ - ۲۷ تير ۱۳۹۷ - 18 July 2018,621662,http://www.asriran.com/fa/news/621662,صفحه نخست » ورزشی,"{'صفحه نخست': '/fa/archive?service_id=1', 'ورز...",,,"{'استقلال': '/fa/tag/1/استقلال', 'افتخاری': '/...",افتخاری قید ماندن در هیات مدیره استقلال را هم زد
1,AsrIran,5b4f7279020eb20597f401b5,دادستان انتظامی مالیاتی سازمان امور مالیاتی گف...,<p><br/>دادستان انتظامی مالیاتی سازمان امور م...,تاریخ انتشار: ۲۱:۱۱ - ۲۷ تير ۱۳۹۷ - 18 July 2018,621659,http://www.asriran.com/fa/news/621659,صفحه نخست » اجتماعی,"{'صفحه نخست': '/fa/archive?service_id=1', 'اجت...",,,"{'مالیات': '/fa/tag/1/مالیات', 'دادستان': '/fa...",دادستان انتظامی مالیاتی سازمان مالیات: آخرین ا...


### subtitle & rutitr

In [6]:
print(news.subtitle.sample(5))
print(news.rutitr.sample())

128    تالاب بین المللی انزلی که سالهاست از آلودگی ها...
509                                                     
747                                                     
984                                                     
952                                                     
Name: subtitle, dtype: object
610    
Name: rutitr, dtype: object


As we can see, there might not exist any 'subtitle' or 'rutitr', so we drop them if they do not have valuable features.

In [7]:
print("Not null 'subtitle' ",len([i for i in news.subtitle if len(i) != 0]))
print("Not null 'rutitr' ",len([i for i in news.rutitr if len(i) != 0]))

Not null 'subtitle'  569
Not null 'rutitr'  50


So based on information we got here, we know that these columns can help us, so we consider them.

### Drop Useless Columns

But it is clear for us, `date`,`newsCode`, `newsLink`,`bodyHtml` and `_id` are useless features. So we remove them from our dataset.


In [8]:
news = news.drop(['_id','date','newsCode','newsLink','bodyHtml'], axis=1)

In [9]:
news.head(5)

Unnamed: 0,NewsAgency,body,newsPath,newsPathLinks,rutitr,subtitle,tags,title
0,AsrIran,مدیرعامل سابق استقلال از عضویت در هیات مدیره ا...,صفحه نخست » ورزشی,"{'صفحه نخست': '/fa/archive?service_id=1', 'ورز...",,,"{'استقلال': '/fa/tag/1/استقلال', 'افتخاری': '/...",افتخاری قید ماندن در هیات مدیره استقلال را هم زد
1,AsrIran,دادستان انتظامی مالیاتی سازمان امور مالیاتی گف...,صفحه نخست » اجتماعی,"{'صفحه نخست': '/fa/archive?service_id=1', 'اجت...",,,"{'مالیات': '/fa/tag/1/مالیات', 'دادستان': '/fa...",دادستان انتظامی مالیاتی سازمان مالیات: آخرین ا...
2,AsrIran,قیمت سبد نفتی اوپک دیروز به روند کاهشی خود ادا...,صفحه نخست » اقتصادی,"{'صفحه نخست': '/fa/archive?service_id=1', 'اقت...",,,"{'اوپک': '/fa/tag/1/اوپک', 'نفت': '/fa/tag/1/ن...",قیمت سبد نفتی اوپک یک گام دیگر عقب نشست/ 70 دل...
3,AsrIran,رئیس فراکسیون فرهنگیان مجلس خطاب به وزیر آموزش...,صفحه نخست » اجتماعی,"{'صفحه نخست': '/fa/archive?service_id=1', 'اجت...",,,"{'آموزش و پرورش': '/fa/tag/1/آموزش و پرورش', '...",حاجی‌بابایی به بطحایی:آقای وزیر در اطلاع‌رسانی...
4,AsrIran,رئیس صندوق بین&zwnj;المللی پول در آستانه نشست ...,صفحه نخست » بین الملل,"{'صفحه نخست': '/fa/archive?service_id=1', 'بین...",,,{'صندوق بین المللی پول': '/fa/tag/1/صندوق بین ...,رئیس صندوق بین‌المللی پول: آمریکا از جنگ تعرفه...


### newsPath & newsPathLinks

In [10]:
newspathlinks_tags = list(set([list(x.keys())[1] for x in news.newsPathLinks]))
newspath_tags = list(set(x.split(' » ')[1] for x in news.newsPath))
print("news path links and news path show the same thing? => ", newspath_tags == newspathlinks_tags)
valid_tags = newspath_tags
pd.DataFrame([valid_tags])

news path links and news path show the same thing? =>  True


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,ورزشی,خواندنی ها و دیدنی ها,عمومی,روانشناسی,سلامت,دانلود,اقتصادی,علمی,بین الملل,سرگرمی,اجتماعی,حوادث,فناوری و IT,داستان کوتاه,سیاسی,سیاست خارجی,فرهنگی/هنری


Note: we can remove `newsPathLinks` because it is exactly duplicate of `newsPath` and update `newsPath` column in dataframe with just one keyword

In [11]:
news = news.drop(['newsPathLinks'], axis=1)
news.head(2)

Unnamed: 0,NewsAgency,body,newsPath,rutitr,subtitle,tags,title
0,AsrIran,مدیرعامل سابق استقلال از عضویت در هیات مدیره ا...,صفحه نخست » ورزشی,,,"{'استقلال': '/fa/tag/1/استقلال', 'افتخاری': '/...",افتخاری قید ماندن در هیات مدیره استقلال را هم زد
1,AsrIran,دادستان انتظامی مالیاتی سازمان امور مالیاتی گف...,صفحه نخست » اجتماعی,,,"{'مالیات': '/fa/tag/1/مالیات', 'دادستان': '/fa...",دادستان انتظامی مالیاتی سازمان مالیات: آخرین ا...


In [12]:
news.loc[:,'newsPath'] = list(x.split(' » ')[1] for x in news.newsPath)
news.head(3)

Unnamed: 0,NewsAgency,body,newsPath,rutitr,subtitle,tags,title
0,AsrIran,مدیرعامل سابق استقلال از عضویت در هیات مدیره ا...,ورزشی,,,"{'استقلال': '/fa/tag/1/استقلال', 'افتخاری': '/...",افتخاری قید ماندن در هیات مدیره استقلال را هم زد
1,AsrIran,دادستان انتظامی مالیاتی سازمان امور مالیاتی گف...,اجتماعی,,,"{'مالیات': '/fa/tag/1/مالیات', 'دادستان': '/fa...",دادستان انتظامی مالیاتی سازمان مالیات: آخرین ا...
2,AsrIran,قیمت سبد نفتی اوپک دیروز به روند کاهشی خود ادا...,اقتصادی,,,"{'اوپک': '/fa/tag/1/اوپک', 'نفت': '/fa/tag/1/ن...",قیمت سبد نفتی اوپک یک گام دیگر عقب نشست/ 70 دل...


### NewsAgency

In [13]:
news_agencies = list(news.NewsAgency.unique())
pd.DataFrame([news_agencies])

Unnamed: 0,0
0,AsrIran


### tags

In [14]:
def tag_extractor(tags_dict):
    """
    gets a tags dictionary and finds unique tags in collection of values and keys
    
    ::params tags_dict : 
    """
    keys = list(set(tags_dict.keys()))
    values = { v.split('/')[-1] for v in set(tags_dict.values())}
    [values.add(i) for i in keys]
    return list(values)

print('not processed tags_dict',news.tags[0])
print('processesd tags_dict',tag_extractor(news.tags[0]))

not processed tags_dict {'استقلال': '/fa/tag/1/استقلال', 'افتخاری': '/fa/tag/1/افتخاری'}
processesd tags_dict ['استقلال', 'افتخاری']


Now we replace `tags` column with extract values from `tag_extractor` function.

In [15]:
news.loc[:, 'tags'] = [tag_extractor(tag) for tag in news.tags]

In [16]:
news.head(5)

Unnamed: 0,NewsAgency,body,newsPath,rutitr,subtitle,tags,title
0,AsrIran,مدیرعامل سابق استقلال از عضویت در هیات مدیره ا...,ورزشی,,,"[استقلال, افتخاری]",افتخاری قید ماندن در هیات مدیره استقلال را هم زد
1,AsrIran,دادستان انتظامی مالیاتی سازمان امور مالیاتی گف...,اجتماعی,,,"[دادستان, مالیات]",دادستان انتظامی مالیاتی سازمان مالیات: آخرین ا...
2,AsrIran,قیمت سبد نفتی اوپک دیروز به روند کاهشی خود ادا...,اقتصادی,,,"[نفت, اوپک]",قیمت سبد نفتی اوپک یک گام دیگر عقب نشست/ 70 دل...
3,AsrIran,رئیس فراکسیون فرهنگیان مجلس خطاب به وزیر آموزش...,اجتماعی,,,"[آموزش و پرورش, فرهنگیان]",حاجی‌بابایی به بطحایی:آقای وزیر در اطلاع‌رسانی...
4,AsrIran,رئیس صندوق بین&zwnj;المللی پول در آستانه نشست ...,بین الملل,,,"[صندوق بین المللی پول, امریکا]",رئیس صندوق بین‌المللی پول: آمریکا از جنگ تعرفه...


<font color='yellow'>Note:</font> If you look at row 0 and row 10, you can see there is many noise in this dataset. We have two different tags for same news.

## Normalizing

In [17]:
normalizer = Normalizer()
news['body'] = news['body'].apply(normalizer.normalize)
news['rutitr'] = news['rutitr'].apply(normalizer.normalize)
news['subtitle'] = news['subtitle'].apply(normalizer.normalize)
news['title'] = news['title'].apply(normalizer.normalize)

## Tokenizing

In [18]:
def tokenize(phrase):
    sentences = sent_tokenize(phrase)
    if len(sentences) > 1:
        words = reduce(np.append, [word_tokenize(sentence) for sentence in sentences])
    elif len(sentences) == 1:
        words = word_tokenize(sentences[0])
    else:
        words = None
    return words

In [19]:
news['body'] = news['body'].apply(tokenize)
news['rutitr'] = news['rutitr'].apply(tokenize)
news['subtitle'] = news['subtitle'].apply(tokenize)
news['title'] = news['title'].apply(tokenize)

## Stemming

In [20]:
stemmer = Stemmer()
stem = lambda s: [stemmer.stem(w) for w in s] if s is not None else None
news['body'] = news['body'].apply(stem)
news['rutitr'] = news['rutitr'].apply(stem)
news['subtitle'] = news['subtitle'].apply(stem)
news['title'] = news['title'].apply(stem)

## Lemmatizing

In [21]:
lemmatizer = Lemmatizer()
lemmatize = lambda s: [lemmatizer.lemmatize(w) for w in s] if s is not None else None
news['body'] = news['body'].apply(lemmatize)
news['rutitr'] = news['rutitr'].apply(lemmatize)
news['subtitle'] = news['subtitle'].apply(lemmatize)
news['title'] = news['title'].apply(lemmatize)

In [22]:
news.head(5)

Unnamed: 0,NewsAgency,body,newsPath,rutitr,subtitle,tags,title
0,AsrIran,"[مدیرعامل, سابق, استقلال, از, عضو, در, ه, مدیر...",ورزشی,,,"[استقلال, افتخاری]","[افتخار, قید, ماندن, در, ه, مدیره, استقلال, را..."
1,AsrIran,"[دادس, انتظام, مالیات, ساز, امور, مالیات, گف, ...",اجتماعی,,,"[دادستان, مالیات]","[دادس, انتظام, مالیات, ساز, مال, :, آخرین, اخط..."
2,AsrIran,"[قیم, سبد, نفت, اوپک, دیروز, به, روند, کاهش, خ...",اقتصادی,,,"[نفت, اوپک]","[قیم, سبد, نفت, اوپک, یک, گا, دیگر, عقب, نشست/..."
3,AsrIran,"[رئیس, فراکسیون, فرهنگ, مجلس, خطاب, به, وزیر, ...",اجتماعی,,,"[آموزش و پرورش, فرهنگیان]","[حاجی‌بابا, به, بطحا, :, آقا, وزیر, در, اطلاع‌..."
4,AsrIran,"[رئیس, صندوق, بین&zwnj;الملل, پول, در, آستانه,...",بین الملل,,,"[صندوق بین المللی پول, امریکا]","[رئیس, صندوق, بین‌الملل, پول, :, آمریکا, از, ج..."


## Filter Words

### Remove Stopwords
For this step, we use stopwords from <a href='https://github.com/kharazi/persian-stopwords'>this repository</a>.

There are some files of stopwords and we are using <a href='https://github.com/kharazi/persian-stopwords/blob/master/persian'>this</a>.

```bash
git clone https://github.com/kharazi/persian-stopwords.git
```

In [23]:
stopwords_root = 'persian-stopwords'

with open(os.path.join(stopwords_root, 'persian'), encoding='utf-8') as stopwords_file:
    stopwords = [re.sub(r'\n','',word) for word in stopwords_file]
pd.DataFrame([stopwords[150:170]])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,اید,ایشان,ایم,این,این جوری,این قدر,این گونه,اینان,اینجا,اینجاست,ایند,اینطور,اینقدر,اینها,اینهاست,اینو,اینچنین,اینک,اینکه,اینگونه


Now we remove all stopwords from our dataset.

In [24]:
def filter_words(words_list, stopwords=stopwords):
    """
    Gets a list of words and remove stopwords from that list.
    
    :param words_list: a list of words to apply stopwords
    :param stopwords: a list of stopwords to remove from words_list
    """
    if words_list is None:
        return None
    filtered_words = [word for word in words_list if word not in stopwords]
    return filtered_words

s = news.title[0]
print(s)
s_filtered = filter_words(s, stopwords)
print(s_filtered)

['افتخار', 'قید', 'ماندن', 'در', 'ه', 'مدیره', 'استقلال', 'را', 'ه', 'زد#زن']
['افتخار', 'قید', 'ماندن', 'مدیره', 'استقلال', 'زد#زن']


In [25]:
news['body'] = news['body'].apply(filter_words)
news['rutitr'] = news['rutitr'].apply(filter_words)
news['subtitle'] = news['subtitle'].apply(filter_words)
news['title'] = news['title'].apply(filter_words)

In [26]:
print(news.title[0])
news.head(3)

['افتخار', 'قید', 'ماندن', 'مدیره', 'استقلال', 'زد#زن']


Unnamed: 0,NewsAgency,body,newsPath,rutitr,subtitle,tags,title
0,AsrIran,"[مدیرعامل, استقلال, عضو, مدیره, باشگاه, کناره,...",ورزشی,,,"[استقلال, افتخاری]","[افتخار, قید, ماندن, مدیره, استقلال, زد#زن]"
1,AsrIran,"[دادس, انتظام, مالیات, ساز, مالیات, گف, صور, ع...",اجتماعی,,,"[دادستان, مالیات]","[دادس, انتظام, مالیات, ساز, مال, آخرین, اخطار,..."
2,AsrIran,"[قیم, سبد, نفت, اوپک, روند, کاهش, ادامه, سه, ش...",اقتصادی,,,"[نفت, اوپک]","[قیم, سبد, نفت, اوپک, گا, نشست/, ۷۰, دلار, بشکه]"


In [27]:
for idx,n in enumerate(news.newsPath):
    if n=='دانلود':
        print(news.body[idx])

['IObit', 'Malware', 'Fighter', 'برنامه', 'عال', 'بردن', 'افزار', 'نر', 'افزار', 'جاسوس', 'ابزار', 'تبلیغات', 'مزاح', 'تروجان', 'لاگر', 'ربات', 'کرم', 'بود#باش', 'موتور', 'هسته', 'تعبیه', 'نر', 'افزار', 'IObit', 'Malware', 'Fighter', 'توانست#توان', 'برنامه', 'مخرب', 'اس', 'سیس', 'آسیب', 'رساند#رسان', 'ببرید', 'کوچک', 'مشکل', 'سیس', 'ایجاد', 'سیس', 'شد#شو', 'نر', 'افزار', 'IObit', 'Malware', 'Fighter', 'بردن', 'افزار', 'داشت#دار', 'اسکن', 'هوشمند', 'اسکن', 'اسکن', 'سفارش', '&nbsp;و', 'مطمئن', 'بود#باش', 'سه', 'حرفه', 'عمق', 'بدافزار', 'سیس', 'نابود', 'شد#شو', 'قابلیت', 'نر', 'افزار', 'IObit', 'Malware', 'Fighter', 'امک', 'رس', 'خودکار', 'نر', 'افزار', 'اینترن', 'امک', 'اسکن', 'سیس', 'سه', 'متد', 'اسکن', 'سیس', 'صور', 'مداو', 'اتوماتیک', 'جلوگیر', 'نفوذ', 'برنامه', 'مخرب', 'سیس', 'ایجاد', 'Real-time', 'Protection', 'قو', 'قدرتمند', 'ابزار', 'Startup', 'Guard', 'Browser', 'Guard', 'Network', 'Guard', 'File', 'Guard', 'Cookie', 'Guard', 'Process', 'Guard', 'USD', 'Disk', 'Guard', 'Maliciou

this shows that our 'دانلود' newsPath tag is valid but there is only a few samples for this tag.

### Remove ASCII
There are a lot of ascii characters which we do not need them such as html tags.

In [28]:
def remove_ascii(word_list):
    """
    Remove ascii characters from a list of words or a string
    :params word_list: a list of string or a string
    """
    
    if type(word_list) is list:
        return [re.sub(r'[\x00-\x7F]+',' ',w) for w in word_list]
    if type(word_list) is str:
        words = word_list.split()
        return [re.sub(r'[\x00-\x7F]+',' ',w) for w in words]

# Feature Engineering
The inherent unstructured (no neatly formatted data columns!) and noisy nature of textual data makes it harder for machine learning methods to directly work on raw text data.

##### Motivation
The importance of feature engineering is even more important for unstructured, textual data because we need to convert free flowing text into some numeric representations which can then be understood by machine learning algorithms.

### Feature Engineering Strategies
We try different methods and compare result using F1, precision and recall score gathered by result of different machine learning methods.

#### Import Libraries

In [29]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import feature_extraction as fe

%matplotlib inline

#### Bag of Words Model
First of all we should define **vector space** term. When we convert unstructured data to number such each dimension of a vector is one feature of space, we have a vector space.

**Bag of words** model is one of simpleset vector space methods.<br>
In bag of words model, each vector represent a document in corpus and each dimension is word in document and the value of corresponding dimension is the frequency of given word.

In [30]:
print(len([w for w in news.body.values if w is not None]))

969


In [31]:
# convert a list of string items to a string item (detokenization)
body_train = news.body.values
# just check it out! I got a weird result!
# body_train = [' '.join([w for w in s]) for s in body_train if s is not None]
body_train_ = []
for idx,s in enumerate(body_train):
    string=''
    if s is not None:
        for w in s:
            string+=' '+w
        body_train_.append(string)
    else:
        body_train_.append(string)

In [32]:
# Note: We just apply this method to "body", then after builing some models, we try other ones.
def bag_of_words(train):
    """
    calculate bag of word vector space of a list of strings
    
    :params train: train data as a list of strings
    """
    cv = fe.text.CountVectorizer(ngram_range=(1, 4), min_df=0.005, vocabulary=None,
                                lowercase=False, analyzer='word') # 4-gram model
    bow_train = cv.fit_transform(train)
    return bow_train, cv.get_feature_names()
bowt,feature_names = bag_of_words(body_train_)

In [33]:
pd.DataFrame(bowt.todense(),columns=feature_names).head(5)

Unnamed: 0,FATF,ad_type,ad_type standardvideo,ad_type standardvideo max_ad_duration,ad_type standardvideo max_ad_duration ۶۰۰۰,ads,ads pre,ads pre roll,ads pre roll advert,advert,...,۹۲,۹۳,۹۴,۹۵,۹۵ درصد,۹۶,۹۷,۹۸,۹۹,۹۹ درصد
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Bag of Words Vector Space Dimensions


In [34]:
print('number of examples: {}'.format(bowt.shape[0]))
print('number of features: {}'.format(bowt.shape[1]))

number of examples: 1000
number of features: 10865


OK let it go!

#### Labels
Now for training purpose, we should train our model using train data and corresponding labels.

In this dataset, labels are `newsPath`.

To deal with multiclass labels, we use `onehotencoders` to encode classes into categorical values represted by numbers.
Here we go...

##### Import libraries

In [35]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [36]:
y = news['newsPath'].values

def label_encoder(array):
    """
    Return corresponding label encoded array
    """
    array_label_encoder = LabelEncoder()
    encoded = array_label_encoder.fit_transform(array)
    return array_label_encoder,encoded
    
def onehot_encoder(label_encoded_array):
    """
    Return onehot encoded version of a label encoded input array
    """
    array_onehot_encoder = OneHotEncoder()
    onehot_encoded_array = array_onehot_encoder.fit_transform(label_encoded_array)
    return array_onehot_encoder, onehot_encoded_array


pd.DataFrame(label_encoder(y)[1], columns=['newsPath']).head()

Unnamed: 0,newsPath
0,16
1,0
2,1
3,0
4,2


#### Split Dataset Into Trainset and Testset

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
# for Naive Bayes we just need to label encode target labels.
_, y_label_encoded = label_encoder(y)

In [39]:
x_train,x_test,y_train,y_test = train_test_split(bowt , y_label_encoded.reshape(-1,1), train_size = 0.90, random_state=85)
print('x #examples: {}, x #features:{}'.format(bowt.shape[0], bowt.shape[1]))
print('x_train #examples: {}, x_train #features: {}'.format(x_train.shape[0], x_train.shape[1]))
print('x_test #examples: {}, x_test #features: {}'.format(x_test.shape[0], x_test.shape[1]))
print('y_train #examples: {}, y_train #features: {}'.format(y_train.shape[0], y_train.shape[1]))
print('y_test #examples: {}, y_test #features: {}'.format(y_test.shape[0], y_test.shape[1]))

x_train = x_train.todense()
x_test = x_test.todense()



x #examples: 1000, x #features:10865
x_train #examples: 900, x_train #features: 10865
x_test #examples: 100, x_test #features: 10865
y_train #examples: 900, y_train #features: 1
y_test #examples: 100, y_test #features: 1


# Model Selection

In [40]:
from sklearn.metrics import confusion_matrix

In [41]:
def accuracy_on_cm(confusion_matrix):
    """
    Calculate accuracy on given confusion matrix
    """
    t = np.trace(confusion_matrix)
    f = np.sum(confusion_matrix) - t
    ac = t/(t+f)
    return (t,f,ac)

## Naive Bayes

### Gaussian Naive Bayes

In [42]:
from sklearn import naive_bayes as nb
naive_bayes = nb.GaussianNB()
naive_bayes = naive_bayes.fit(x_train, y_train.ravel())

In [43]:
y_train_predict = naive_bayes.predict(x_train)

In [44]:
cm_train = confusion_matrix(y_train,y_train_predict)
t_train,f_train,acc_train = accuracy_on_cm(cm_train)
print('Train status = #{} True, #{} False, %{} Accuracy'.format(t_train,f_train,acc_train*100))

Train status = #838 True, #62 False, %93.11111111111111 Accuracy


In [45]:
y_test_predict = naive_bayes.predict(x_test)
cm_test = confusion_matrix(y_test,y_test_predict)
t_test,f_test,acc_test = accuracy_on_cm(cm_test)
print('Guassian Test status = #{} True, #{} False, %{} Accuracy'.format(t_test,f_test,acc_test*100))

Guassian Test status = #51 True, #49 False, %51.0 Accuracy


### Multinomial Naive Bayes

In [46]:
naive_bayes = nb.MultinomialNB()
naive_bayes = naive_bayes.fit(x_train, y_train.ravel())

In [47]:
y_train_predict = naive_bayes.predict(x_train)

In [48]:
cm_train = confusion_matrix(y_train,y_train_predict)
t_train,f_train,acc_train = accuracy_on_cm(cm_train)
print('Multinomial Train status = #{} True, #{} False, %{} Accuracy'.format(t_train,f_train,acc_train*100))

Multinomial Train status = #772 True, #128 False, %85.77777777777777 Accuracy


In [49]:
y_test_predict = naive_bayes.predict(x_test)
cm_test = confusion_matrix(y_test,y_test_predict)
t_test,f_test,acc_test = accuracy_on_cm(cm_test)
print('Multinomial Test status = #{} True, #{} False, %{} Accuracy'.format(t_test,f_test,acc_test*100))

Multinomial Test status = #70 True, #30 False, %70.0 Accuracy


### Bernoulli Naive Bayes

In [50]:
naive_bayes = nb.BernoulliNB()
naive_bayes = naive_bayes.fit(x_train, y_train.ravel())

In [51]:
y_train_predict = naive_bayes.predict(x_train)
cm_train = confusion_matrix(y_train,y_train_predict)
t_train,f_train,acc_train = accuracy_on_cm(cm_train)
print('Bernoulli Train status = #{} True, #{} False, %{} Accuracy'.format(t_train,f_train,acc_train*100))

y_test_predict = naive_bayes.predict(x_test)
cm_test = confusion_matrix(y_test,y_test_predict)
t_test,f_test,acc_test = accuracy_on_cm(cm_test)
print('Bernoulli Test status = #{} True, #{} False, %{} Accuracy'.format(t_test,f_test,acc_test*100))

Bernoulli Train status = #635 True, #265 False, %70.55555555555556 Accuracy
Bernoulli Test status = #47 True, #53 False, %47.0 Accuracy


### Complement Naive Bayes

In [52]:
naive_bayes = nb.ComplementNB()
naive_bayes = naive_bayes.fit(x_train, y_train.ravel())

In [70]:
y_train_predict = naive_bayes.predict(x_train)
cm_train = confusion_matrix(y_train,y_train_predict)
t_train,f_train,acc_train = accuracy_on_cm(cm_train)
print('Complement Train status = #{} True, #{} False, %{} Accuracy'.format(t_train,f_train,acc_train*100))

y_test_predict = naive_bayes.predict(x_test)
cm_test = confusion_matrix(y_test,y_test_predict)
t_test,f_test,acc_test = accuracy_on_cm(cm_test)
print('Complement Test status = #{} True, #{} False, %{} Accuracy'.format(t_test,f_test,acc_test*100))

Bernoulli Train status = #745 True, #155 False, %82.77777777777777 Accuracy
Bernoulli Test status = #73 True, #27 False, %73.0 Accuracy


## Logistic Regression

### Kernel Newton-CG

In [53]:
from sklearn import linear_model as l
logistic_regression = l.LogisticRegression(random_state=85, solver='newton-cg', multi_class='auto')
logistic_regression = logistic_regression.fit(x_train, y_train.ravel())

In [54]:
y_train_predict = logistic_regression.predict(x_train)
cm_train = confusion_matrix(y_train,y_train_predict)
t_train,f_train,acc_train = accuracy_on_cm(cm_train)
print('Logistic Regression Train status = #{} True, #{} False, %{} Accuracy'.format(t_train,f_train,acc_train*100))

y_test_predict = logistic_regression.predict(x_test)
cm_test = confusion_matrix(y_test,y_test_predict)
t_test,f_test,acc_test = accuracy_on_cm(cm_test)
print('Logistic Regression Test status = #{} True, #{} False, %{} Accuracy'.format(t_test,f_test,acc_test*100))

Bernoulli Train status = #865 True, #35 False, %96.11111111111111 Accuracy
Bernoulli Test status = #62 True, #38 False, %62.0 Accuracy


### Scratch Model
In this section we implement Logistic Regression from scratch using numpy library.

We will explain our structure in each section provided below:
1. Constants
1. Cost Function
2. Gradient Function
3. Learning Parameters using `minimize` optimizer
4. Prediction
5. Evaluating Model
6. Hyperparameter Tuning

#### 1. Constants

In [43]:
m,n = x_train.shape

# define theta as zero
theta = np.zeros(n)

# define hyperparameter λ
lambda_ = 1

#### 2. Cost Function


In [223]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

def lr_hypothesis(x,theta):
    return np.dot(x,theta)

def compute_cost(theta, x, y, lambda_, m, n):
    theta = theta.reshape(n,1)
    x = x.reshape(m,n)
    y = y.reshape(m,1)
    infunc1 = -y*(np.log(sigmoid(lr_hypothesis(x,theta)))) - (1-y)*(np.log(1 - sigmoid(lr_hypothesis(x,theta))))
    infunc2 = lambda_*np.sum(theta[1:]**2)/(2*m)
    j = np.sum(infunc1)/m+infunc2
    return j

#### 3. Gradient Function

In [234]:
zzz = []

In [272]:
# gradient[0] correspond to gradient for theta(0)
# gradient[1:] correspond to gradient for theta(j) j>0
def compute_gradient(theta, x, y, lambda_, m, n):
    gradient = np.zeros(n).reshape(n,) # 5,
    theta = theta.reshape(n,1) # 5,1
    x = x.reshape(m,n)
    y = y.reshape(m,1)
    infunc1 = sigmoid(lr_hypothesis(x,theta))-y # 10,1
    gradient_ = np.dot(x.T,infunc1)/m # shape=(n,1)
    gradient_ = gradient_.reshape(n,) # this line not working at all. shape=(1,n) !!!!!!!
    #gradient[0] = gradient_[0]    
    gradient[1:] = gradient_[1:]+(lambda_*theta[1:,]/m).reshape(n-1,) # theta(j) ; j>0
    return gradient

In [273]:
theta_test = np.array([-2,-1,1,2])

x_test = np.append(np.ones(5),np.arange(0.1,1.6,0.1)).reshape(5,4, order='F')

y_test = np.array([1,0,1,0,1]).reshape(-1,1)

m,n= x_test.shape

cost_temp = compute_cost(theta=theta_test,x=x_test,y=y_test,lambda_=3,m=5, n=4)
gradient_temp = compute_gradient(theta=theta_test,x=x_test,y=y_test,lambda_=3, m=5, n=4)

print('if lambda = 3 =======>\n cost = {}\n ,\n gradients = \n{}'
      .format(cost_temp,gradient_temp))

 cost = 2.5348193961097443
 ,
 gradients = 
[ 0.         -0.54855841  0.72472227  1.39800296]


#### 4. Learning Parameters Using `fmin_cg` Optimizer
**Scipy's fmin_cg** is an optimization solver that finds the **minimum of an unconstrained** function. For regularized logistic regression, you want to **optimize the cost function J(θ) with parameters θ**. Concretely, you are going to use minimize to find the best parameters θ for the regularized logistic regression cost function, given a fixed dataset (of x and y values). You will pass to minimize the following inputs:
1. The initial values of the parameters we are trying to optimize.
2. A function that, when given the training set and a particular θ, computes the regularized logistic regression cost with respect to θ for the dataset (x, y) ======> compute_cost
3. A function that, when given the training set and a particular θ, computes the regularized logistic regression gradient with respect to θ for the dataset (x, y) ======> compute_gradient

#### 5. Evaluting Model
We need to calculate **probabilities and related predictions** and then compare predicted value to real one to get accuracy.

In [277]:
m,n = x_train.shape
lambda_=0.1
theta = np.zeros(shape=(n,))

In [278]:
# import library
import scipy.optimize as opt

def one_vs_all(theta,x,y,num_labels,lambda_, m, n):
    all_theta = np.zeros(shape=(num_labels,n))
    
    
    for i in range(0,num_labels):
        all_theta[i] = opt.fmin_cg(f=compute_cost, fprime=compute_gradient, 
                                   x0 = theta, args=(x,(y==i)*1, lambda_, m, n), full_output=True)
        #optimized = opt.minimize(compute_cost, theta, args=(x,(y==i)*1,lambda_),
        #                     method=None, jac= True)
        #all_theta[c] = optimized.X
    return all_theta

In [279]:
all_theta = one_vs_all(theta.flatten() ,x_train.flatten() , y_train, len(valid_tags), lambda_,
                       m=x_train.shape[0], n=x_train.shape[1])

ValueError: operands could not be broadcast together with shapes (0,10865) (10864,) 

### Kernel SAGA

In [55]:
from sklearn import linear_model as l
logistic_regression = l.LogisticRegression(random_state=85, solver='saga', multi_class='auto')
logistic_regression = logistic_regression.fit(x_train, y_train.ravel())



In [56]:
y_train_predict = logistic_regression.predict(x_train)
cm_train = confusion_matrix(y_train,y_train_predict)
t_train,f_train,acc_train = accuracy_on_cm(cm_train)
print('Saga Train status = #{} True, #{} False, %{} Accuracy'.format(t_train,f_train,acc_train*100))

y_test_predict = logistic_regression.predict(x_test)
cm_test = confusion_matrix(y_test,y_test_predict)
t_test,f_test,acc_test = accuracy_on_cm(cm_test)
print('Saga Test status = #{} True, #{} False, %{} Accuracy'.format(t_test,f_test,acc_test*100))

Bernoulli Train status = #649 True, #251 False, %72.11111111111111 Accuracy
Bernoulli Test status = #63 True, #37 False, %63.0 Accuracy


## SVM

In [61]:
from sklearn import svm
svc = svm.SVC(kernel='linear', random_state=85, gamma='auto')
svc = svc.fit(x_train, y_train.ravel())

In [62]:
y_train_predict = svc.predict(x_train)
cm_train = confusion_matrix(y_train,y_train_predict)
t_train,f_train,acc_train = accuracy_on_cm(cm_train)
print('SVM Train status = #{} True, #{} False, %{} Accuracy'.format(t_train,f_train,acc_train*100))

y_test_predict = svc.predict(x_test)
cm_test = confusion_matrix(y_test,y_test_predict)
t_test,f_test,acc_test = accuracy_on_cm(cm_test)
print('SVM Test status = #{} True, #{} False, %{} Accuracy'.format(t_test,f_test,acc_test*100))

Bernoulli Train status = #866 True, #34 False, %96.22222222222221 Accuracy
Bernoulli Test status = #53 True, #47 False, %53.0 Accuracy


## Simple ANN

## CNN

## RNN

## CNN + RNN