# Support Vector Regression/Machine for Humorous Level Prediction

This program is modified from https://github.com/nkartik94/Multi-Label-Text-Classification
on 2019/11/18 by Yuen-Hsien Tseng

## 1. EDA: Exploratory Data Analysis

In [1]:
import os, sys, time
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))
#printmd('**bold**')

In [3]:
!cd /Users/sam/GoogleDrive/指導的學生/2019_吳玟萱/2019_1104_Joke_Datasets
!pwd

/Users/sam/GoogleDrive/指導的學生/2019_吳玟萱/2019_1104_Joke_Datasets


In [4]:
data_path = "mlabel_corpora/JokeHumorLevel.txt"

In [5]:
# set global variables: df
df = pd.read_csv(data_path, delimiter="\t")
#data_raw = df.loc[np.random.choice(data_raw.index, size=2000)]
print(df.shape) # same as data_raw.shape in Jupyter

(3365, 4)


In [6]:
from sklearn.model_selection import train_test_split

# ID=L1850 為分界，之前：吳玟萱，之後：黃亭筠，均為中文系同一屆
train, test = train_test_split(df, train_size=1691, shuffle=False) 
# (tempararily) set global variables: train, test 

with open('mlabel_corpora/JokeHumorLevel_train.txt', 'w') as outF:
    outF.write(train.to_csv(sep='\t', index=False))

with open('mlabel_corpora/JokeHumorLevel_test.txt', 'w') as outF:
    outF.write(test.to_csv(sep='\t', index=False))

print(train.shape)
print(test.shape)

(1691, 4)
(1674, 4)




In [7]:
# Do not do this, because there are many duplicate titles
# Merge Title into Content
'''
df['Content'] = df[df.columns[1:3]].apply(
    lambda x: ' 。 '.join(x.dropna().astype(str)),
    axis=1
)
print("Number of rows in data =",df.shape[0])
print("Number of columns in data =",df.shape[1])
print("\n")
printmd("**Sample data:**")
df.head()
'''

'\ndf[\'Content\'] = df[df.columns[1:3]].apply(\n    lambda x: \' 。 \'.join(x.dropna().astype(str)),\n    axis=1\n)\nprint("Number of rows in data =",df.shape[0])\nprint("Number of columns in data =",df.shape[1])\nprint("\n")\nprintmd("**Sample data:**")\ndf.head()\n'

### 1.1. Checking for missing values

In [8]:
missing_values_check = df.isnull().sum()
print(missing_values_check)

ID            0
Title         0
Content       0
HumorLevel    0
dtype: int64


### 1.2. Calculating number of jokes under each label

In [9]:
# Jokes with no label are considered to be clean jokes.
# Creating seperate column in dataframe to identify clean jokes.
# We use axis=1 to count row-wise and axis=0 to count column wise
def print_empty_label(df, s):
    rowSums = df.iloc[:,3:].sum(axis=1)
    #print(rowSums.shape)
    #print(rowSums.head())
    clean_comments_count = (rowSums==0).sum(axis=0)

    print(f"Total number of {s} jokes = ",len(df))
    print(f"Number of clean jokes in {s}= ",clean_comments_count)
    print(f"Number of {s} jokes with labels =",(len(df)-clean_comments_count))
    print()

In [10]:
print_empty_label(df, 'all')
print_empty_label(train, 'train')
print_empty_label(test, 'test')

Total number of all jokes =  3365
Number of clean jokes in all=  0
Number of all jokes with labels = 3365

Total number of all jokes =  1691
Number of clean jokes in all=  0
Number of all jokes with labels = 1691

Total number of all jokes =  1674
Number of clean jokes in all=  0
Number of all jokes with labels = 1674



In [11]:
# set global variables: categories
categories = list(df.columns.values)
print(categories)
categories = categories[3:]
print(categories)

['ID', 'Title', 'Content', 'HumorLevel']
['HumorLevel']


In [12]:
# Calculating number of humor levels 
# https://stackoverflow.com/questions/45759966/counting-unique-values-in-a-column-in-pandas-dataframe-like-in-qlik
def print_HumorLevel_count(df, categories):
    for c in categories:
        print(df[c].value_counts())
        #print(df[c].value_counts(normalize=True))
    print()

In [13]:
print_HumorLevel_count(df, categories)
print_HumorLevel_count(train, categories)
print_HumorLevel_count(test, categories)

3    1313
2     867
4     729
1     363
5      93
Name: HumorLevel, dtype: int64

3    742
4    604
2    251
5     47
1     47
Name: HumorLevel, dtype: int64

2    616
3    571
1    316
4    125
5     46
Name: HumorLevel, dtype: int64



In [14]:
# Not correct yet!!!
def plot_HumorLevel_count(df, categories):
    sns.set(font_scale = 2)
    plt.figure(figsize=(15,8))

    ax= sns.barplot(categories, df[categories[0]].value_counts())

    plt.title("Humorous Level Count", fontsize=24)
    plt.ylabel('Number of jokes', fontsize=18)
    plt.xlabel('Humorous Level', fontsize=18)

    #adding the text labels
    rects = ax.patches
    #print(rects)
    labels = df[categories[0]].value_counts()
    #print(labels)
    for rect, label in zip(rects, labels):
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)

    plt.show()

In [15]:
#plot_HumorLevel_count(df, categories)

## 2. Data Pre-Processing

In [16]:
import jieba
import Stopwords

In [17]:
# Compute statistics of the dataset: MaxLength, MinLength, AvgChars, AvgWords
Len = df.Content.map(len)
print(f'Number of characters in   all jokes: Max={max(Len)}, Min={min(Len)}, Avg={sum(Len)/len(Len)}')
Len = train.Content.map(len)
print(f'Number of characters in train jokes: Max={max(Len)}, Min={min(Len)}, Avg={sum(Len)/len(Len)}')
Len = test.Content.map(len)
print(f'Number of characters in  test jokes: Max={max(Len)}, Min={min(Len)}, Avg={sum(Len)/len(Len)}')

Number of characters in   all jokes: Max=2024, Min=10, Avg=134.07637444279345
Number of characters in train jokes: Max=2024, Min=10, Avg=132.7906564163217
Number of characters in  test jokes: Max=874, Min=12, Avg=135.3751493428913


In [18]:
# set global variables: data
data = df
#data = df.loc[np.random.choice(df.index, size=3365)]
data.shape

(3365, 4)

In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

### 2.1. Cleaning Data

In [20]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

In [21]:
import Stopwords # import my own module with STOP_WORDS
from nltk.stem import PorterStemmer, WordNetLemmatizer
ps = PorterStemmer()
wnl = WordNetLemmatizer()

In [22]:
def clean_text(text): 
    '''
    Given a raw text string, return a clean text string.
    Example: 
        input:  "Years  passed. 多少   年过 去 了 。  "
        output: "years passed.多少年过去了。"
    '''
    text = str(text)
    text = text.lower() # 'years  passed. 多少   年过 去 了 。'
    # Next line will remove redundant white space for jeiba to cut
    text = re.sub(r'\s+([^a-zA-Z0-9.])', r'\1', text) # years passed.多少年过去了。
# see: https://stackoverflow.com/questions/16720541/python-string-replace-regular-expression
    text = text.strip(' ')
    return text

def clean_words(text, RmvStopWord=True, RmvMark=True):
    words = jieba.lcut(text)
#    print("After jieba.lcut():", words)
#    WL = [ w 
    WL = [ ps.stem(w)
#    WL = [ wnl.lemmatize(w)
        for w in words 
          if (not re.match(r'\s', w)) # remove white spaces
            and (RmvMark==False or not re.match(r'\W', w)) # remove punctuations
#            and (RmvMark==False or not re.match('^[a-z_]$', w)) # remove punctuations
#            and (RmvMark==False or w not in PUNCTUATIONS)
            and (RmvStopWord==False or w not in Stopwords.STOP_WORDS)
            and (not re.match(r'^\d+$', w)) # remove digit
         ]
    WL = " ".join(WL)
    return WL

In [23]:
print(data.head())
data['Content'] = data['Content'].str.lower()
#data['Content'] = data['Content'].apply(cleanHtml)
#data['Content'] = data['Content'].apply(cleanPunc)
#data['Content'] = data['Content'].apply(keepAlpha)
data['Content'] = data['Content'].apply(clean_text)
data['Content'] = data['Content'].apply(clean_words)
#data.head()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/kg/jcdj05xn20144cv9kwywp26r0000gn/T/jieba.cache


      ID Title                                            Content  HumorLevel
0  L0001  要求加薪  員工：老闆，您必須幫我加薪，已經有三家公司在找我了！     老闆：哪三家？     員工：...           4
1  L0002  查無此人  某市政府辦公大樓落成，門口缺副對聯。     副市長揮毫     上聯：說實話辦實事一身正氣...           3
2  L0003   遣散費  中午老闆視察自己的建築工地時，發現有個人在角落玩手機。     老闆：你月薪多少？     ...           4
3  L0004  職業習慣  一天，一位法官的女友看見兩個蚊子，便叫法官打死。     只見法官只把那個肚子飽飽的蚊子打死...           2
4  L0005  美女吵架  辦公室中兩位女同事吵起來了。     經理忍無可忍：「太不像話了！現在是什麼情況？你們把原因...           4


Loading model cost 0.630 seconds.
Prefix dict has been built succesfully.


### 2.2. Removing Stop Words

In [24]:
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

data['Content'] = data['Content'].apply(removeStopWords)
#data.head()

### 2.3. Stemming

In [25]:
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

data['Content'] = data['Content'].apply(stemming)
data.head()

Unnamed: 0,ID,Title,Content,HumorLevel
0,L0001,要求加薪,員工 老 闆 必須 幫 加薪 已經 三家 公司 找 老 闆 三家 員工 自來 水 公司 台電...,4
1,L0002,查無此人,某 市政府 辦公大樓 落成 門口 缺 副 對聯 副 市長 揮 毫上 聯 實話 辦實事 一身 ...,3
2,L0003,遣散費,中午 老 闆 視察 自己 建築 工地 時 發現 個 人 角落 玩手 機 老 闆 月薪 多少 ...,4
3,L0004,職業習慣,一天 一位 法官 女友 看見 兩個 蚊子 便 叫 法官 打死 只見 法官 只 那個 肚子 飽...,2
4,L0005,美女吵架,辦 公室 中 兩位 女同事 吵起 經理 忍無可忍 太不像 話 情況 原因 給我 清楚 兩人 ...,4


### 2.4. Train-Test Split

In [26]:
from sklearn.model_selection import train_test_split

# set global variables: train, test
#train, test = train_test_split(data, random_state=42, test_size=0.10, shuffle=True)
train, test = train_test_split(data, random_state=42, train_size=1691, shuffle=False)

print(train.shape)
print(test.shape)

(1691, 4)
(1674, 4)


In [27]:
# set global variables: train_text, test_text
train_text = train['Content']
test_text = test['Content']

In [28]:
# Compute statistics of the dataset: MaxLength, MinLength, AvgChars, AvgWords
Len = data.Content.map(lambda x: len(x.split()))
print(f'Number of words in   all jokes: Max={max(Len)}, Min={min(Len)}, Avg={sum(Len)/len(Len)}')
Len = train.Content.map(lambda x: len(x.split()))
print(f'Number of words in train jokes: Max={max(Len)}, Min={min(Len)}, Avg={sum(Len)/len(Len)}')
Len = test.Content.map(lambda x: len(x.split()))
print(f'Number of words in  test jokes: Max={max(Len)}, Min={min(Len)}, Avg={sum(Len)/len(Len)}')

Number of words in   all jokes: Max=491, Min=3, Avg=41.6222882615156
Number of words in train jokes: Max=491, Min=3, Avg=40.33234772324069
Number of words in  test jokes: Max=290, Min=4, Avg=42.92532855436081


### 2.6 TF-IDF

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

In [86]:
#vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', 
                             ngram_range=(1,2), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)

In [None]:
# Set global variables:
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['ID', 'Title', 'Content'], axis=1)
#print(y_train.head())
train_yL = y_train['HumorLevel']
#print(type(train_yL), "\n", train_yL.head())

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['ID', 'Title', 'Content'], axis=1)
test_yL = y_test['HumorLevel']

# label encode the target variable 
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
LabEncoder = preprocessing.LabelEncoder() # convert label name to label int
train_y = LabEncoder.fit_transform(train_yL)
test_y = LabEncoder.fit_transform(test_yL)
Num_Classes = len(LabEncoder.classes_)

In [87]:
time_TfidfVector = time.time()

def Create_TFxIDF(data_text, train_text, test_text):

# word level tf-idf
    #tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=10000)
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
        stop_words=Stopwords.STOP_WORDS, max_df=0.95, min_df=1, max_features=10000)
    tfidf_vect.fit(data_text)
    xtrain_tfidf = tfidf_vect.transform(train_text)
    xtest_tfidf = tfidf_vect.transform(test_text)
    print(f"xtrain_tfidf.shape:{xtrain_tfidf.shape}, xtest_tfidf.shape: {xtest_tfidf.shape}")

# word level ngram tf-idf 
    tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                    stop_words=Stopwords.STOP_WORDS, max_df=0.95, min_df=1,
                    ngram_range=(1,3), max_features=10000)
    tfidf_vect_ngram.fit(data_text)
    xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_text)
    xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_text)
    print(f"xtrain_tfidf_ngram.shape:{xtrain_tfidf.shape}, xtest_tfidf_ngram.shape: {xtest_tfidf.shape}")

# character level ngram tf-idf
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', 
                    stop_words=Stopwords.STOP_WORDS, max_df=0.95, min_df=1,
                    ngram_range=(1,3), max_features=10000)
    tfidf_vect_ngram_chars.fit(data_text)
    xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_text) 
    xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_text) 
    print(f"xtrain_tfidf_ngram_chars.shape:{xtrain_tfidf.shape}, xtest_tfidf_ngram_chars.shape: {xtest_tfidf.shape}")

    print("It takes %4.2f seconds to convert 3 TFxIDF vectors."%(time.time()-time_TfidfVector))

    return (xtrain_tfidf, xtest_tfidf, 
             xtrain_tfidf_ngram, xtest_tfidf_ngram,
             xtrain_tfidf_ngram_chars, xtest_tfidf_ngram_chars,
            tfidf_vect, tfidf_vect_ngram, tfidf_vect_ngram_chars)

# Set global variables:
(xtrain_tfidf, xtest_tfidf, 
 xtrain_tfidf_ngram, xtest_tfidf_ngram,
 xtrain_tfidf_ngram_chars, xtest_tfidf_ngram_chars,
 tfidf_vect, tfidf_vect_ngram, tfidf_vect_ngram_chars) = Create_TFxIDF(data.Content, train_text, test_text)

xtrain_tfidf.shape:(1691, 10000), xtest_tfidf.shape: (1674, 10000)
xtrain_tfidf_ngram.shape:(1691, 10000), xtest_tfidf_ngram.shape: (1674, 10000)
xtrain_tfidf_ngram_chars.shape:(1691, 10000), xtest_tfidf_ngram_chars.shape: (1674, 10000)
It takes 3.24 seconds to convert 3 TFxIDF vectors.


In [88]:
# re-assign x_train and x_test to what we want
#x_train, x_test, vectorizer = xtrain_tfidf, xtest_tfidf, tfidf_vect
#x_train, x_test, vectorizer = xtrain_tfidf_ngram, xtest_tfidf_ngram, tfidf_vect_ngram
#x_train, x_test, vectorizer = xtrain_tfidf_ngram_chars, xtest_tfidf_ngram_chars, tfidf_vect_ngram_chars
print(x_train.shape, x_test.shape)
#print(x_train)

(1691, 62330) (1674, 62330)


### 2.6 Count Vector

In [89]:
time_CountVector = time.time()

def Create_CountVector(data_text, train_text, test_text):

# Create a count vectorizer object.
# It takes the steps of prepocessing, tokenizer, stopwording, ...
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
        stop_words=Stopwords.STOP_WORDS, max_df=0.95, min_df=1)
    count_vect.fit(data_text)

# transform the training and validation data using count vectorizer object
    xtrain_count = count_vect.transform(train_text)
    xtest_count = count_vect.transform(test_text)

    print("It takes %4.2f seconds to convert count vectors."%(time.time()-time_CountVector))

    return(xtrain_count, xtest_count, count_vect)

# Set global variables:
(xtrain_count, xtest_count, count_vect) = Create_CountVector(data.Content, train_text, test_text)

It takes 0.27 seconds to convert count vectors.


In [90]:
def Print_count_vect(xtrain_count, xtest_count, count_vect):
    print(type(count_vect), count_vect)
    print(type(xtrain_count), type(xtest_count))
    print("xtrain_count.shape:", xtrain_count.shape)
    print("xtest_count.shape :", xtest_count.shape)
# https://stackoverflow.com/questions/36967666/transform-scipy-sparse-csr-to-pandas
# from scipy.sparse.csr import csr_matrix
    # A = csr_matrix([[1, 0, 2], [0, 3, 0]]); print(A)
    # df = pd.DataFrame(A.toarray()); print(df)
    #print(xtrain_count)
    #print(xtest_count[0, 0:10])
    print("\nUsed stop words: ", count_vect.get_stop_words())
    
Print_count_vect(xtrain_count, xtest_count, count_vect)

<class 'sklearn.feature_extraction.text.CountVectorizer'> CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['的', '是', '了', '和', '與', '及', '或', '於', '也', '並', '之', '以', '在', '另', '又', '該', '由', '但', '仍', '就', '都', '讓', '要', '把', '上', '來', '說', '從', '等', '我', '你', '他', '妳', '她', '它', '您', '我們', '你們', '妳們', '他們', '她們', '有', '此', '因', '且', '為', '嗎', '那', '哪', '吧', '很', '這', '並有', '並可', '可以', '可供',...ithin', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', 'said', 'told'],
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)
<class 'scipy.sparse.csr.csr_matrix'> <class 'scipy.sparse.csr.csr_matrix'>
xtrain_count.shape: (1691, 28013)
xtest_count.shape : (1674, 28013)

Used stop words:  frozenset({'eight

## 3. Support Vector Regression

In [91]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error
from sklearn.metrics import explained_variance_score, r2_score

In [92]:
# define the evaluation metrics
def print_svr_report(y_true, prediction):
    #print('max_error: %1.4f'%(max_error(y_true, prediction)))
    print('mean_squared_error: %1.4f'%(mean_squared_error(y_true, prediction)))
    print('mean_absolute_error: %1.4f'%(mean_absolute_error(y_true, prediction)))
    print('r2_score: %1.4f'%(r2_score(y_true, prediction)))
    print('explained_variance_score: %1.4f'%(explained_variance_score(y_true, prediction)))

    print(f'y_true.shape={y_true.shape}, prediction.shape={prediction.shape}')
    #type(y_true<class 'pandas.core.frame.DataFrame'>, type(prediction)=<class 'numpy.ndarray'>
    print(f'type(y_true{type(y_true)}, type(prediction)={type(prediction)}')

    #pred = pd.DataFrame(data=prediction, columns=[y_true.columns]) # cannot have the same column name with y_true
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html
    pred = pd.DataFrame(data=y_true.reset_index(drop=True), columns=['HumorLevel'])
    pred['Funiness'] = prediction
    pred['FuninessLevel'] = pred['Funiness'].map(round)
    #print('y_true.head() :\n', y_true.head())
    print('pred.head() :\n', pred.head())

    print(y_true[y_true.columns[0]].value_counts(normalize=False, sort=False, ascending=False, bins=None, dropna=True))
    print(pred.FuninessLevel.value_counts(normalize=False, sort=False, ascending=False, bins=None, dropna=True))
# https://stackoverflow.com/questions/52777668/python-pandas-compare-two-columns-for-equality-and-result-in-third-dataframe
    pred['result'] = np.where(pred['HumorLevel'] == pred['FuninessLevel'], 1, 0)
    print(pred['result'].value_counts(normalize=False, sort=False, ascending=False, bins=None, dropna=True))
    print(pred['result'].value_counts(normalize=True, sort=False, ascending=False, bins=None, dropna=True))

In [93]:
# Next line refers to: http://scikit.ml/tutorial.html
from sklearn.svm import SVR, LinearSVR
# Next line refers to https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
from sklearn.linear_model import LinearRegression


In [94]:
reg = LinearRegression()
reg.fit(x_train, y_train)
print("reg.score:", reg.score(x_train, y_train))
predictions = reg.predict(x_test)
print_svr_report(y_test, predictions)


reg.score: 0.992050752320993
mean_squared_error: 1.8555
mean_absolute_error: 1.1269
r2_score: -0.9988
explained_variance_score: -0.1625
y_true.shape=(1674, 1), prediction.shape=(1674, 1)
type(y_true<class 'pandas.core.frame.DataFrame'>, type(prediction)=<class 'numpy.ndarray'>
pred.head() :
    HumorLevel  Funiness  FuninessLevel
0           5  2.828638              3
1           4  3.108231              3
2           4  3.270776              3
3           4  3.490234              3
4           4  3.320486              3
1    316
2    616
3    571
4    125
5     46
Name: HumorLevel, dtype: int64
2      41
3    1197
4     433
5       2
7       1
Name: FuninessLevel, dtype: int64
0    1227
1     447
Name: result, dtype: int64
0    0.732975
1    0.267025
Name: result, dtype: float64


In [95]:
%%time

# https://scikit-learn.org/stable/modules/svm.html#regression
classifier = LinearSVR()
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)

print_svr_report(y_test, predictions)

mean_squared_error: 1.6650
mean_absolute_error: 1.0614
r2_score: -0.7936
explained_variance_score: -0.0319
y_true.shape=(1674, 1), prediction.shape=(1674,)
type(y_true<class 'pandas.core.frame.DataFrame'>, type(prediction)=<class 'numpy.ndarray'>
pred.head() :
    HumorLevel  Funiness  FuninessLevel
0           5  2.831474              3
1           4  3.316667              3
2           4  3.105655              3
3           4  3.274082              3
4           4  3.372976              3
1    316
2    616
3    571
4    125
5     46
Name: HumorLevel, dtype: int64
3    1509
4     165
Name: FuninessLevel, dtype: int64
0    1145
1     529
Name: result, dtype: int64
0    0.68399
1    0.31601
Name: result, dtype: float64
CPU times: user 89.3 ms, sys: 2.34 ms, total: 91.7 ms
Wall time: 23.7 ms


In [96]:
%%time

# https://scikit-learn.org/stable/modules/svm.html#regression
classifier = SVR()
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)

print_svr_report(y_test, predictions)

mean_squared_error: 1.4409
mean_absolute_error: 0.9547
r2_score: -0.5521
explained_variance_score: 0.0000
y_true.shape=(1674, 1), prediction.shape=(1674,)
type(y_true<class 'pandas.core.frame.DataFrame'>, type(prediction)=<class 'numpy.ndarray'>
pred.head() :
    HumorLevel  Funiness  FuninessLevel
0           5  3.100021              3
1           4  3.100055              3
2           4  3.100031              3
3           4  3.100032              3
4           4  3.100041              3
1    316
2    616
3    571
4    125
5     46
Name: HumorLevel, dtype: int64
3    1674
Name: FuninessLevel, dtype: int64
0    1103
1     571
Name: result, dtype: int64
0    0.658901
1    0.341099
Name: result, dtype: float64
CPU times: user 1.01 s, sys: 13 ms, total: 1.02 s
Wall time: 669 ms


In [97]:
with open('out/HumorLevel_True.txt', 'w') as outF:
    outF.write(y_test.to_csv(sep='\t', index=False))

# https://stackoverflow.com/questions/36967666/transform-scipy-sparse-csr-to-pandas
with open('out/HumorLevel_Pred.txt', 'w') as outF:
    outF.write(pd.DataFrame(predictions, columns=list(y_test.columns)).to_csv(sep='\t', index=False))

## 4. use classification for prediction Humorous Level

### 4.1 Define metrics

In [98]:
def train_predict(classifier, feature_vector_train, label, feature_vector_test):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    # predict the labels on test dataset
    return classifier.predict(feature_vector_test), classifier

In [99]:
def tcfunc(x, n=4): # trancate a number to have n decimal digits
    d = '0' * n
    d = int('1' + d)
# https://stackoverflow.com/questions/4541155/check-if-a-number-is-int-or-float
    if isinstance(x, (int, float)): return int(x * d) / d
    return x

In [100]:
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
#import itertools # replace this line by next line on 2019/01/03, because cannot find itertools for Python 3.6.7
import more_itertools
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    #print(cm) # print out consufion matrix

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = numpy.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
#    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
# Replace the above line by the next line on 2019/01/03, because cannot find itertools for Python 3.6.7
    for i, j in more_itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [101]:
# use global variables:
#  test_y
#  LabEncoder.classes_
def show_confusion_matrix(predictions):
    # Compute confusion matrix
    cnf_matrix = confusion_matrix(test_y, predictions)
    numpy.set_printoptions(precision=2)

    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=LabEncoder.classes_ ,
                      title='Confusion matrix, without normalization')
    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=LabEncoder.classes_ , normalize=True,
                      title='Normalized confusion matrix')

    plt.show()

In [111]:
# http://scikit-learn.org/stable/modules/model_evaluation.html
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# use a global variable: test_y
def show_Result(predictions):
    print(predictions[:10])

    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
#    print("MicroF1 = %0.4f, MacroF1=%0.4f" %
#       (metrics.f1_score(test_y, predictions, average='micro'),
#        metrics.f1_score(test_y, predictions, average='macro')))
# https://stackoverflow.com/questions/455612/limiting-floats-to-two-decimal-points

# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    print("\tPrecision\tRecall\tF1\tSupport")
    (Precision, Recall, F1, Support) = list(map(tcfunc, 
        precision_recall_fscore_support(test_y, predictions, average='micro')))
    print("Micro\t{}\t{}\t{}\t{}".format(Precision, Recall, F1, Support))
    (Precision, Recall, F1, Support) = list(map(tcfunc, 
        precision_recall_fscore_support(test_y, predictions, average='macro')))
    print("Macro\t{}\t{}\t{}\t{}".format(Precision, Recall, F1, Support))
    
    if True:
    #if False:
        print(confusion_matrix(test_y, predictions))
        try: 
            print(classification_report(test_y, predictions, digits=4))
        except ValueError:
            print('May be some category has no predicted samples')
        show_confusion_matrix(predictions)


# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html
    y_true = pd.DataFrame(test_y, columns=['HumorLevel'])
    pred = pd.DataFrame(data=y_true.reset_index(drop=True), columns=['HumorLevel'])
    pred['FuninessLevel'] = predictions
    
    print(y_true[y_true.columns[0]].value_counts(normalize=False, sort=False, ascending=False, bins=None, dropna=True))
    print(pred.FuninessLevel.value_counts(normalize=False, sort=False, ascending=False, bins=None, dropna=True))
# https://stackoverflow.com/questions/52777668/python-pandas-compare-two-columns-for-equality-and-result-in-third-dataframe
    pred['result'] = np.where(pred['HumorLevel'] == pred['FuninessLevel'], 1, 0)
    print(pred['result'].value_counts(normalize=False, sort=False, ascending=False, bins=None, dropna=True))
    print(pred['result'].value_counts(normalize=True, sort=False, ascending=False, bins=None, dropna=True))

In [112]:
# This function is modified from: https://gist.github.com/bbengfort/044682e76def583a12e6c09209c664a1
# and from: https://stackoverflow.com/questions/26976362/how-to-get-most-informative-features-for-scikit-learn-classifier-for-different-c
# This function only works for binary classes
def most_informative_feature_for_class(vectorizer, classifier, labels, n=10):
    coefs = sorted( # Zip the feature names with the coefs and sort
        zip(classifier.coef_[0], vectorizer.get_feature_names()))
    topn  = zip(coefs[:n], coefs[:-(n+1):-1])
    # Create two columns with most negative and most positive features.
    for (cp, fnp), (cn, fnn) in topn:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (cp, fnp, cn, fnn))

# nltk.classify.NaiveBayesClassifier has a show_most_informative_features()
# You may compare the result here with those at: https://www.twilio.com/blog/2017/09/sentiment-analysis-python-messy-data-nltk.html


### 4.2 Run classifiers

In [113]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble

In [114]:
def Run_NaiveBayes():
    
    time_NaiveBayes = time.time()

# Naive Bayes on Count Vectors   
    predict, clf = train_predict(naive_bayes.MultinomialNB(), xtrain_count, train_y, xtest_count)
    print("\nNB, Count Vectors: ")
    show_Result(predict)
    most_informative_feature_for_class(count_vect, clf, train_yL, n=10)

# Naive Bayes on Word Level TF IDF Vectors
    predict, clf = train_predict(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf)
    print("\nNB, WordLevel TF-IDF: ")
    show_Result(predict)
    #most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10)
    most_informative_feature_for_class(tfidf_vect, clf, train_y, n=10)

# Naive Bayes on Ngram Level TF IDF Vectors
    predict, clf = train_predict(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
    print("\nNB, N-Gram Vectors: ")
    show_Result(predict)
    most_informative_feature_for_class(tfidf_vect_ngram, clf, train_y, n=10)

# Naive Bayes on Character Level TF IDF Vectors
    predict, clf = train_predict(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)
    print("NB, CharLevel Vectors: ")
    show_Result(predict)
    most_informative_feature_for_class(tfidf_vect_ngram_chars, clf, train_y, n=10)

    print("\nIt takes %4.2f seconds for Naive Bayes."%(time.time()-time_NaiveBayes))

Run_NaiveBayes()


NB, Count Vectors: 
[1 3 3 2 3 3 3 3 1 2]
	Precision	Recall	F1	Support
Micro	0.1875	0.1875	0.1875	None
Macro	0.1684	0.2047	0.108	None
0    316
1    616
2    571
3    125
4     46
Name: HumorLevel, dtype: int64
0       2
1      22
2     640
3    1009
4       1
Name: FuninessLevel, dtype: int64
0    1360
1     314
Name: result, dtype: int64
0    0.812425
1    0.187575
Name: result, dtype: float64
	-10.3078	0rz            		-7.2167	去              
	-10.3078	1              		-7.3633	不              
	-10.3078	10             		-7.4174	到              
	-10.3078	139            		-7.5352	老師             
	-10.3078	14             		-7.5997	一個             
	-10.3078	15             		-7.6687	吃              
	-10.3078	1500cc         		-7.6687	人              
	-10.3078	18             		-7.6687	一天             
	-10.3078	1c             		-7.7428	後              
	-10.3078	2              		-7.8229	覺得             

NB, WordLevel TF-IDF: 
[3 3 2 2 3 2 3 3 2 2]
	Precision	Recall	F1	Support
Micro	0.2783	0.2

In [115]:
def Run_LogisticRegret():
    
    time_LogisticRegret = time.time()

# Linear Classifier on Count Vectors
    predict, clf = train_predict(linear_model.LogisticRegression(), xtrain_count, train_y, xtest_count)
    print("\nLR, Count Vectors: ")
    show_Result(predict)
    most_informative_feature_for_class(count_vect, clf, train_y, n=10)

# Linear Classifier on Word Level TF IDF Vectors
    predict, clf = train_predict(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf)
    print("\nLR, WordLevel TF-IDF: ")
    show_Result(predict)
    most_informative_feature_for_class(tfidf_vect, clf, train_y, n=10)

# Linear Classifier on Ngram Level TF IDF Vectors
    predict, clf = train_predict(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
    print("\nLR, N-Gram Vectors: ")
    show_Result(predict)
    most_informative_feature_for_class(tfidf_vect_ngram, clf, train_y, n=10)

# Linear Classifier on Character Level TF IDF Vectors
    predict, clf = train_predict(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)
    print("\nLR, CharLevel Vectors: ")
    show_Result(predict)
    most_informative_feature_for_class(tfidf_vect_ngram_chars, clf, train_y, n=10)

    print("\nIt takes %4.2f seconds for Logistic Regression."%(time.time()-time_LogisticRegret))

Run_LogisticRegret()


LR, Count Vectors: 
[2 2 2 2 3 2 2 2 2 2]
	Precision	Recall	F1	Support
Micro	0.2371	0.2371	0.2371	None
Macro	0.2603	0.1974	0.1269	None
0    316
1    616
2    571
3    125
4     46
Name: HumorLevel, dtype: int64
0      2
1     69
2    903
3    697
4      3
Name: FuninessLevel, dtype: int64
0    1277
1     397
Name: result, dtype: int64
0    0.762843
1    0.237157
Name: result, dtype: float64
	-0.7019	看              		1.0498	菜              
	-0.5585	叫              		1.0457	覺得             
	-0.5130	買              		0.8503	冷              
	-0.4866	阿              		0.8264	髮              
	-0.4821	時候             		0.8253	隨便             
	-0.4756	聽              		0.8003	最紅             
	-0.4690	答              		0.7731	番茄             
	-0.4137	醫生             		0.7526	衣服             
	-0.4102	著              		0.7437	電話響            
	-0.4039	如果             		0.7389	好煩             

LR, WordLevel TF-IDF: 
[2 3 3 2 2 2 2 3 2 2]
	Precision	Recall	F1	Support
Micro	0.2467	0.2467	0.2467	None
Macro	0.

In [116]:
def Run_SVM():
    
    time_LinearSVM = time.time()

# Use of class_weight='balanced' decrease accuracy, although PCWeb is unbalanced
#accuracy = train_model(svm.SVC(class_weight='balanced'), xtrain_count, train_y, xtest_count)
# LinearSVC() is much much better than SVC()
    predict, clf = train_predict(svm.LinearSVC(), xtrain_count, train_y, xtest_count)
    print("\nSVM, Count Vectors: ")
    show_Result(predict)
    most_informative_feature_for_class(count_vect, clf, train_y, n=10)

    predict, clf = train_predict(svm.LinearSVC(), xtrain_tfidf, train_y, xtest_tfidf)
    print("\nSVM, WordLevel TF-IDF: ")
    show_Result(predict)
    most_informative_feature_for_class(tfidf_vect, clf, train_y, n=10)

    predict, clf = train_predict(svm.LinearSVC(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
    print("\nSVM, N-Gram Vectors: ")
    show_Result(predict)
    most_informative_feature_for_class(tfidf_vect_ngram, clf, train_y, n=10)

    predict, clf = train_predict(svm.LinearSVC(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)
    print("\nSVM, CharLevel Vectors: ")
    show_Result(predict)
    most_informative_feature_for_class(tfidf_vect_ngram_chars, clf, train_y, n=10)

    print("\nIt takes %4.2f seconds for Linear SVM."%(time.time()-time_LinearSVM))

Run_SVM()


SVM, Count Vectors: 
[1 3 2 2 3 2 2 3 2 2]
	Precision	Recall	F1	Support
Micro	0.2401	0.2401	0.2401	None
Macro	0.2025	0.1954	0.1365	None
0    316
1    616
2    571
3    125
4     46
Name: HumorLevel, dtype: int64
0     13
1    125
2    878
3    639
4     19
Name: FuninessLevel, dtype: int64
0    1272
1     402
Name: result, dtype: int64
0    0.759857
1    0.240143
Name: result, dtype: float64
	-0.3721	遠              		0.6954	菜              
	-0.3368	離              		0.6785	最紅             
	-0.3323	最              		0.5201	好煩             
	-0.3140	找              		0.5201	太美             
	-0.2881	買              		0.5095	恰北北            
	-0.2584	看              		0.4271	電話響            
	-0.2262	阿              		0.4048	宵夜             
	-0.2016	一樣             		0.3874	臉書             
	-0.1770	男              		0.3689	覺得             
	-0.1750	阿美             		0.3604	動物園            

SVM, WordLevel TF-IDF: 
[1 3 3 2 3 2 3 3 1 2]
	Precision	Recall	F1	Support
Micro	0.2311	0.2311	0.2311	None
Macro	

In [117]:
def Run_RdnForest():
    
    time_RdnForest = time.time()

# RF on Count Vectors
    predict, clf = train_predict(ensemble.RandomForestClassifier(), xtrain_count, train_y, xtest_count)
    print("\nRF, Count Vectors: ")
    show_Result(predict)
    #most_informative_feature_for_class(count_vect, clf, train_y, n=10)
    #'RandomForestClassifier' object has no attribute 'coef_'

# RF on Word Level TF IDF Vectors
    predict, clf = train_predict(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xtest_tfidf)
    print("\nRF, WordLevel TF-IDF: ")
    show_Result(predict)

    predict, clf = train_predict(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
    print("\nRF, N-Gram Vectors: ")
    show_Result(predict)

    predict, clf = train_predict(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)
    print("\nRF, CharLevel Vectors: ")
    show_Result(predict)

    print("\nIt takes %4.2f seconds for Random Forest."%(time.time()-time_RdnForest))

Run_RdnForest()


RF, Count Vectors: 
[2 2 2 2 2 2 2 2 2 2]
	Precision	Recall	F1	Support
Micro	0.2598	0.2598	0.2598	None
Macro	0.1478	0.1942	0.1255	None
0    316
1    616
2    571
3    125
4     46
Name: HumorLevel, dtype: int64
0       1
1      70
2    1107
3     493
4       3
Name: FuninessLevel, dtype: int64
0    1239
1     435
Name: result, dtype: int64
0    0.740143
1    0.259857
Name: result, dtype: float64

RF, WordLevel TF-IDF: 
[2 2 2 2 2 2 2 2 2 2]
	Precision	Recall	F1	Support
Micro	0.2514	0.2514	0.2514	None
Macro	0.1148	0.1847	0.1136	None
0    316
1    616
2    571
3    125
4     46
Name: HumorLevel, dtype: int64
0       1
1      40
2    1131
3     501
4       1
Name: FuninessLevel, dtype: int64
0    1253
1     421
Name: result, dtype: int64
0    0.748507
1    0.251493
Name: result, dtype: float64

RF, N-Gram Vectors: 
[2 3 3 2 3 2 2 2 2 2]
	Precision	Recall	F1	Support
Micro	0.2747	0.2747	0.2747	None
Macro	0.1505	0.2082	0.1281	None
0    316
1    616
2    571
3    125
4     46
Name: HumorLeve