In [2]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
from tqdm import tqdm
import re
from nltk.tokenize import TweetTokenizer
import string
from sklearn.utils import shuffle
from senticnet.senticnet import Senticnet
import requests
from collections import defaultdict
import nltk
import numpy as np
from sklearn.preprocessing import StandardScaler
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet
from nltk.corpus import sentiwordnet as swn
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from textblob import TextBlob
from textblob.taggers import PatternTagger
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
def get_data_set(filename):
    data = pd.read_csv(filename, encoding="utf-8", engine='python')
    data = data[["text", "sarc"]]
    data = data[data["text"].notnull()]
    data = data[data["sarc"].notnull()]
    data["sarc"] = data["sarc"].apply(lambda x: int(x))
    data = data.drop_duplicates()
    
    
    sarc = data[data["sarc"]==1]
    not_sarc = data[data["sarc"]==0][:len(data[data["sarc"]==1])]
    data_1_1 = pd.concat([sarc, not_sarc], ignore_index=True)
    data_1_1 = shuffle(data_1_1)
    return data_1_1

In [4]:
data = get_data_set("sarcasm_set_small.csv")

In [5]:
def get_anc_dict(f):
    anc = defaultdict(dict)
    anc_file = open(f, "r")
    for line in anc_file:
        l = line.split()
        anc[l[0]][l[2]] = int(l[3]) 
    return anc

In [6]:
anc_all_data = "sp_files/ANC-all-lemma.txt"
anc_spoken_data = "sp_files/ANC-spoken-lemma.txt"
anc_written_data = "sp_files/ANC-written-lemma.txt"

In [7]:
anc_all = get_anc_dict(anc_all_data)
anc_written = get_anc_dict(anc_written_data)
anc_spoken = get_anc_dict(anc_spoken_data)

In [8]:
intens = pd.read_csv("sp_files/wn-asr-multilevel-assess.csv")

In [9]:
keys = list(intens["Word"])
values = list(intens["NormedScore"])
int_scores = dict(zip(keys, values))

In [10]:
def get_tagging(text):
    text_and_tags = PatternTagger().tag(text.decode("utf-8").lower())
    return text_and_tags

In [11]:
def get_mean_and_rarest_fr(tagged_text):
    arr = [int(anc_all[i[0]][i[1]]) for i in tagged_text if (i[0] in anc_all) and 
                    (i[1] in anc_all[i[0]])]
    if len(arr) == 0:
        return (0, 0)
    mean = np.mean(arr)
    minimum = np.min(arr)
    return (mean, minimum)

def get_frequency_f(df):
    df = df.copy()
    df["mean_and_r_fr"] = df["tagged"].apply(lambda x: get_mean_and_rarest_fr(x))
    df["mean_fr"] = df["mean_and_r_fr"].apply(lambda x: x[0])
    df["r_fr"] = df["mean_and_r_fr"].apply(lambda x: x[1])
    df = df.drop("mean_and_r_fr", axis=1)
    df["fr_gap"] = abs(df["mean_fr"] - df["r_fr"])
    return df

In [12]:
def get_mean_written(tagged_text):
    arr = [int(anc_written[i[0]][i[1]]) for i in tagged_text if (i[0] in anc_written) and 
                    (i[1] in anc_written[i[0]])]
    if len(arr) == 0:
        return 0
    return np.mean(arr)

def get_mean_spoken(tagged_text):
    arr = [int(anc_spoken[i[0]][i[1]]) for i in tagged_text if (i[0] in anc_spoken) and 
                    (i[1] in anc_spoken[i[0]])]
    if len(arr) == 0:
        return 0
    return np.mean(arr)

def get_style_f(df):
    df = df.copy()
    df["m_w"] = df["tagged"].apply(lambda x: get_mean_written(x))
    df["m_s"] = df["tagged"].apply(lambda x: get_mean_spoken(x))
    df["w_s_gap"] = abs(df["m_w"] - df["m_s"])
    return df

In [13]:
def get_words_number(text):
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    filtered = [w[0] for w in text if nonPunct.match(w[0])]
    return len(filtered)

def get_words_length_mean(text):
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    filtered = [w[0] for w in text if nonPunct.match(w[0])]
    return np.mean([len(w) for w in filtered] or [0])

def get_verbs_number(text):
    verbs = re.compile('VB*')
    return len([t[1] for t in text if verbs.match(t[1])])

def get_nouns_number(text):
    nouns = re.compile('NN*')
    return len([t[1] for t in text if nouns.match(t[1])])

def get_adjectives_number(text):
    adjectives = re.compile('JJ*')
    return len([t[1] for t in text if adjectives.match(t[1])])
    
def get_adverbs_number(text):
    adverbs = re.compile('RB*')
    return len([t[1] for t in text if adverbs.match(t[1])])

def div(a, b):
    if b == 0:
        return 0
    else:
        return a * 1.0 / b
    
def get_stracture_f(df):
    df = df.copy()
    df["length"] = df["text"].apply(lambda x: len(x))
    df["words_number"] = df["tagged"].apply(lambda x: get_words_number(x))
    df["words_length_mean"] = df["tagged"].apply(lambda x: get_words_length_mean(x))
    df["verbs_number"] = df["tagged"].apply(lambda x: get_verbs_number(x))
    df["nouns_number"] = df["tagged"].apply(lambda x: get_nouns_number(x))
    df["adverbs_number"] = df["tagged"].apply(lambda x: get_adverbs_number(x))
    df["adjectives_number"] = df["tagged"].apply(lambda x: get_adjectives_number(x))
    df["verbs_ratio"] = df[["verbs_number", "words_number"]].apply(lambda x: div(x[0], x[1]), axis=1) 
    df["nouns_ratio"] = df[["nouns_number", "words_number"]].apply(lambda x: div(x[0], x[1]), axis=1) 
    df["adverbs_ratio"] = df[["adverbs_number", "words_number"]].apply(lambda x: div(x[0], x[1]), axis=1) 
    df["adjectives_ratio"] = df[["adjectives_number", "words_number"]].apply(lambda x: div(x[0], x[1]), axis=1) 
    df["laughing_number"] = df["text"].apply(lambda x: x.lower().count("hahah") + x.lower().count("lol")
                                             + x.lower().count("rofl") + x.lower().count("lmao"))
    df["commas_number"] = df["text"].apply(lambda x: x.lower().count(','))
    df["full_stops_number"] = df["text"].apply(lambda x: x.lower().count('.'))
    df["ellipsis_number"] = df["text"].apply(lambda x: x.lower().count('...'))
    df["exclamation_number"] = df["text"].apply(lambda x: x.lower().count('!'))
    df["quatation_number"] = df["text"].apply(lambda x: x.lower().count('?'))
    df["punctuation"] = df["commas_number"] + df["exclamation_number"]
    df["punctuation"] = df["punctuation"] + df["quatation_number"] 
    df["punctuation"] = df["punctuation"] + df["ellipsis_number"] 
    df["punctuation"] = df["punctuation"]+ df["full_stops_number"]
    df["emoticon"] = df["text"].apply(lambda x: x.lower().count(':)') + x.lower().count(':(') + x.lower().count(':D')
                                      +x.lower().count(';)'))
    return df   

In [14]:
def get_adj_total_mean_max(text):
    adjectives = re.compile('JJ*')
    adjs = [t[0] for t in text if adjectives.match(t[1])]
    scores = [int_scores[t+"/a"] for t in adjs if (t+"/a") in int_scores]
    if len(scores) == 0:
        return (0, 0, 0)
    return (np.sum(scores), np.mean(scores), np.max(scores))

def get_adv_total_mean_max(text):
    adverbs = re.compile('RB*')
    advbs = [t[0] for t in text if adverbs.match(t[1])]
    scores = [int_scores[t+"/r"] for t in advbs if (t+"/r") in int_scores]
    if len(scores) == 0:
        return (0, 0, 0)
    return (np.sum(scores), np.mean(scores), np.max(scores))

def get_intensity_f(df):
    df = df.copy()
    df["adj_t_mean_max"] = df["tagged"].apply(lambda x: get_adj_total_mean_max(x))
    df["adj_total"] = df["adj_t_mean_max"].apply(lambda x: x[0])
    df["adj_mean"] = df["adj_t_mean_max"].apply(lambda x: x[1])
    df["adj_max"] = df["adj_t_mean_max"].apply(lambda x: x[2])
    df["adj_gap"] = abs(df["adj_max"] - df["adj_mean"])
    df["adv_t_mean_max"] = df["tagged"].apply(lambda x: get_adv_total_mean_max(x))
    df["adv_total"] = df["adv_t_mean_max"].apply(lambda x: x[0])
    df["adv_mean"] = df["adv_t_mean_max"].apply(lambda x: x[1])
    df["adv_max"] = df["adv_t_mean_max"].apply(lambda x: x[2])
    df["adv_gap"] = abs(df["adv_max"] - df["adv_mean"])
    df = df.drop("adj_t_mean_max", axis=1)
    df = df.drop("adv_t_mean_max", axis=1)
    return df

In [15]:
def get_synonyms(word, tag):
    synonyms = []
    for syn in wordnet.synsets(word, pos=tag):
        for l in syn.lemmas():
            synonyms.append(l.name())
    return list(set(synonyms))

def get_text_syn_f(tagged_text):
    words = {}
    verbs = re.compile('VB*')
    nouns = re.compile('NN*')
    adjectives = re.compile('JJ*')
    adverbs = re.compile('RB*')
    for w in tagged_text:
        if verbs.match(w[1]):
            words[w[0]] = get_synonyms(w[0], wn.VERB)
        if nouns.match(w[1]):
            words[w[0]] = get_synonyms(w[0], wn.NOUN)
        if adjectives.match(w[1]):
            words[w[0]] = get_synonyms(w[0], wn.ADJ)
        if adverbs.match(w[1]):
            words[w[0]] = get_synonyms(w[0], wn.ADV)
    sl_w = [len([s for s in words[w] if anc_all[s]< anc_all[w]]) for w in words.keys()]
    s_mean = 0
    if len(sl_w) != 0:
        s_mean = np.mean(sl_w)
    wls_t = max(sl_w or [0])
    sg_w = [len([s for s in words[w] if anc_all[s] > anc_all[w]]) for w in words.keys()]
    wgs_t = max(sg_w or [0])
    s_l_gap = abs(wls_t - s_mean)
    g_mean = 0
    if len(sg_w) != 0:
        g_mean = np.mean(sg_w)
    s_g_gap =abs(wgs_t - g_mean)
    return (s_mean, s_l_gap, g_mean, s_g_gap)
    
def get_synonyms_f(df):
    df = df.copy()
    df["syn_smean_sgap_g_mean_g_gap"] = df["tagged"].apply(lambda x: get_text_syn_f(x))
    df["s_mean"] = df["syn_smean_sgap_g_mean_g_gap"].apply(lambda x: x[0])
    df["s_l_gap"] = df["syn_smean_sgap_g_mean_g_gap"].apply(lambda x: x[1])
    df["g_mean"] = df["syn_smean_sgap_g_mean_g_gap"].apply(lambda x: x[2])
    df["s_g_gap"] = df["syn_smean_sgap_g_mean_g_gap"].apply(lambda x: x[3])
    df = df.drop("syn_smean_sgap_g_mean_g_gap", axis=1)
    return df

In [16]:
def get_ambiquity_text_f(tagged_text):
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    words = [w[0] for w in tagged_text if nonPunct.match(w[0])]
    synsets = [len(wn.synsets(w)) for w in words]
    if len(synsets) == 0:
        synset_mean = 0
    else:
        synset_mean = sum(synsets or [0]) * 1.0 / len(synsets)
    synset_max = max(synsets or [0])
    synset_gap = abs(synset_max - synset_mean)
    return (synset_mean, synset_max, synset_gap)
        
def get_ambiguity_f(df):
    df = df.copy()
    df["mmg"] = df["tagged"].apply(lambda x: get_ambiquity_text_f(x))
    df["sysnset_mean"] = df["mmg"].apply(lambda x: x[0])
    df["sysnset_max"] = df["mmg"].apply(lambda x: x[1])
    df["sysnset_gap"] = df["mmg"].apply(lambda x: x[2])
    df = df.drop("mmg", axis=1)
    return df

In [17]:
def get_senti_synonyms(word, tag):
    synonyms = []
    for syn in swn.senti_synsets(word, pos=tag):
        for l in syn.lemmas():
            synonyms.append(l.name())
    return list(set(synonyms))

def get_sentiment_f_from_text(tagged_text):
    words = {}
    verbs = re.compile('VB*')
    nouns = re.compile('NN*')
    adjectives = re.compile('JJ*')
    adverbs = re.compile('RB*')
    for w in tagged_text:
        l = []
        if verbs.match(w[1]):
            l = swn.senti_synsets(w[0], wn.VERB)
        if nouns.match(w[1]):
            l = swn.senti_synsets(w[0], wn.NOUN)
        if adjectives.match(w[1]):
            l = swn.senti_synsets(w[0], wn.ADJ)
        if adverbs.match(w[1]):
            l = swn.senti_synsets(w[0], wn.ADV)
        if len(l) > 0:
            words[w[0]] = l[0]
    pos_scores = [words[key].pos_score() for key in words.keys()]
    neg_scores = [words[key].neg_score() for key in words.keys()]
    pos_scores_sum = sum(pos_scores or [0])
    neg_scores_sum = sum(neg_scores or [0])
    avg_pos_scores = (pos_scores_sum + neg_scores_sum) * 1.0 / 2
    pos_neg_gap = pos_scores_sum - neg_scores_sum
    positive_single_gap = abs(pos_scores_sum - max(pos_scores or [0]))
    negative_single_gap = abs(neg_scores_sum - max(neg_scores or [0]))
    return (pos_scores_sum, neg_scores_sum, avg_pos_scores, pos_neg_gap, positive_single_gap, negative_single_gap)
    
    
def get_sentiments_f(df):
    df = df.copy()
    df["s"] = df["tagged"].apply(lambda x: get_sentiment_f_from_text(x))
    df["pos_scores_sum"] = df["s"].apply(lambda x: x[0])
    df["neg_scores_sum"] = df["s"].apply(lambda x: x[1])
    df["avg_pos_scores"] = df["s"].apply(lambda x: x[2])
    df["pos_neg_gap"] = df["s"].apply(lambda x: x[3])
    df["positive_single_gap"] = df["s"].apply(lambda x: x[4])
    df["negative_single_gap"] = df["s"].apply(lambda x: x[5])
    df = df.drop("s", axis=1)
    return df

In [18]:
def create_feature_set_novel(df):
    df = df.copy()
    df["tagged"] = df["text"].apply(lambda x: get_tagging(x))
    df = get_frequency_f(df)
    df = get_style_f(df)
    df = get_stracture_f(df)
    df = get_intensity_f(df)
    df = get_synonyms_f(df)
    df = get_ambiguity_f(df)
    df = get_sentiments_f(df)
    return df

In [18]:
data_f = create_feature_set_novel(data)

In [19]:
def get_trains_tests(data_f):
    skf = KFold(len(data_f), n_folds=10, shuffle=False, random_state=None)
    trains = []
    tests = []
    y_trains = []
    y_tests = []
    data_1_1 = shuffle(data_f)
    for train, test in skf:
        trains.append(data_f.iloc[train]) 
        tests.append(data_f.iloc[test])
        y_trains.append(data_f.sarc.iloc[train])
        y_tests.append(data_f.sarc.iloc[test])
    return trains, tests, y_trains, y_tests

In [20]:
def create_tt_sets(trains, test, columns_f):
    new_trains = []
    new_tests = []
    for train, test in tqdm(zip(trains, test)):
        scaler = StandardScaler()
        s_train = scaler.fit_transform(train[columns_f].astype(float))
        s_test = scaler.transform(test[columns_f].astype(float))
        new_trains.append(s_train)
        new_tests.append(s_test)
    return new_trains, new_tests

In [21]:
def evaluation(clf, trains, tests, ans_trains, ans_tests, proba):
    f_scores = []
    recall_scores = []
    precision_scores = []
    accuracy_scores = []
    auc_scores = []
    predictions = []
    probas = []
    for train, test, y_train, y_test in tqdm(zip(trains, tests, y_trains, y_tests)):
        clf.fit(train, y_train)
        y_pred = clf.predict(test)
        if proba:
            y_proba = clf.predict_proba(test)
            probas.append(y_proba)
            auc_scores.append(roc_auc_score(y_test, y_proba[:,1]))
        predictions.append(y_pred)
        f_scores.append(f1_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        
    return f_scores, recall_scores, precision_scores, accuracy_scores, auc_scores, predictions, probas

In [34]:
trains, tests, y_trains, y_tests = get_trains_tests(data_f)
clf = svm.SVC(C=2.1, probability=True)
svm_features = data_f.columns.difference(set(["text", "sarc", "tagged"]))
svm_trains, svm_tests = create_tt_sets(trains, tests, svm_features)
svm_f1, svm_recall, svm_pr, svm_acc, svm_auc, svm_predict, svm_probas = evaluation(clf, svm_trains, svm_tests,
                                                                       y_trains, y_tests, True)



In [35]:
print "svm avg f1:", np.mean(svm_f1)
print "svm avg recall:", np.mean(svm_recall)
print "svm avg precision:", np.mean(svm_pr)
print "svm avg accuracy:", np.mean(svm_acc)
print "svm avg auc:", np.mean(svm_auc)

svm avg f1: 0.720279569172
svm avg recall: 0.758680090855
svm avg precision: 0.686152664105
svm avg accuracy: 0.705633175631
svm avg auc: 0.769866676989


In [28]:
trains, tests, y_trains, y_tests = get_trains_tests(data_f)
clf = RandomForestClassifier(n_estimators=250, n_jobs=-1)
rf_features = data_f.columns.difference(set(["text", "sarc", "tagged"]))
rf_trains, rf_tests = create_tt_sets(trains, tests, rf_features)
rf_f1, rf_recall, rf_pr, rf_acc, rf_auc, rf_predict, rf_probas = evaluation(clf, rf_trains, rf_tests,
                                                                       y_trains, y_tests, True)



In [36]:
print "rf avg f1:", np.mean(rf_f1)
print "rf avg recall:", np.mean(rf_recall)
print "rf avg precision:", np.mean(rf_pr)
print "rf avg accuracy:", np.mean(rf_acc)
print "rf avg auc:", np.mean(rf_auc)

rf avg f1: 0.743496364085
rf avg recall: 0.802771964579
rf avg precision: 0.693070667372
rf avg accuracy: 0.723319909115
rf avg auc: 0.795802187818


In [28]:
def load_data_sets():
    trains = []
    tests = []
    y_trains = []
    y_tests = []
    for i in tqdm(range(10)):
        train = pd.read_csv("data_train_kf_"+str(i)+".csv", encoding="utf-8")
        test = pd.read_csv("data_test_kf_"+str(i)+".csv", encoding="utf-8")
        x_train = create_feature_set_novel(train)
        trains.append(x_train)
        y_train = x_train.sarc
        y_trains.append(y_train)
        x_test = create_feature_set_novel(test)
        tests.append(x_test)
        y_test = x_test.sarc
        y_tests.append(y_test)
    return trains, tests, y_trains, y_tests  

In [29]:
trains, tests, y_trains, y_tests = load_data_sets()



In [32]:
clf = RandomForestClassifier(n_estimators=250, n_jobs=-1)
rf_features = trains[0].columns.difference(set(["text", "sarc", "tagged"]))
rf_trains, rf_tests = create_tt_sets(trains, tests, rf_features)
rf_f1, rf_recall, rf_pr, rf_acc, rf_auc, rf_predict, rf_probas = evaluation(clf, rf_trains, rf_tests,
                                                                       y_trains, y_tests, True)



In [33]:
def store_answers(alg_name, prd, probas):
    i = 0
    for y_pr, y_proba in zip(prd, probas):
        pd.Series(list(y_pr)).to_csv(alg_name +"_y_prd_kf_"+str(i)+".csv")
        pd.Series(list(y_proba[:,1])).to_csv(alg_name +"_y_prb_kf_"+str(i)+".csv")
        i += 1

In [35]:
store_answers("nov", rf_predict, rf_probas)

In [36]:
print "rf avg f1:", np.mean(rf_f1)
print "rf avg recall:", np.mean(rf_recall)
print "rf avg precision:", np.mean(rf_pr)
print "rf avg accuracy:", np.mean(rf_acc)
print "rf avg auc:", np.mean(rf_auc)

rf avg f1: 0.743029425788
rf avg recall: 0.798624161494
rf avg precision: 0.695059829862
rf avg accuracy: 0.724203127104
rf avg auc: 0.796637798851
