In [1]:
import pandas as pd
from tqdm import tqdm
import re
from scipy.stats import mode
from numpy import mean
from numpy import median
from numpy import std
from nltk.tokenize import TweetTokenizer
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle
from senticnet.senticnet import Senticnet
import requests
from collections import defaultdict
import nltk
import numpy as np
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sps
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob.taggers import PatternTagger
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
def get_data_set(filename):
    data = pd.read_csv(filename, encoding="utf-8", engine='python')
    data = data[["text", "sarc"]]
    data = data[data["text"].notnull()]
    data = data[data["sarc"].notnull()]
    data["sarc"] = data["sarc"].apply(lambda x: int(x))
    data = data.drop_duplicates()
    
    
    sarc = data[data["sarc"]==1]
    not_sarc = data[data["sarc"]==0][:len(data[data["sarc"]==1])]
    data_1_1 = pd.concat([sarc, not_sarc], ignore_index=True)
    data_1_1 = shuffle(data_1_1)
    return data_1_1

In [3]:
def get_strength_dict(f, val):
    s_d = {}
    s_file = open(f, "r")
    for line in s_file:
        l = line.split()
        if val:
            s_d[re.sub(r"\*",r"",l[0])] = int(l[1])
        else:
            s_d[re.sub(r"\*",r"",l[0])] = 0
    return s_d

In [4]:
def get_strength_list(f):
    s_l = []
    s_file = open(f, "r")
    for line in s_file:
        s_l.append(line.strip())
    return s_l

In [5]:
booster_dict = get_strength_dict("sp_files/BoosterWordList.txt", True)
idioms_dict = get_strength_list("sp_files/EC-Idioms-Intermediate-Advanced-3.txt")
slang_dict = get_strength_dict("sp_files/SlangLookupTable.txt", False)
emoticon_dict = get_strength_list("sp_files/EmoticonLookupTable.txt")

In [6]:
sentiments_dict = get_strength_dict("sp_files/SentimentLookupTable.txt", True)

In [7]:
def senti_score(x):
    wl = WordNetLemmatizer()
    ps = PorterStemmer()
    if x in sentiments_dict:
        return sentiments_dict[x]
    lemma  = wl.lemmatize(x)
    if lemma in sentiments_dict:
        return sentiments_dict[lemma]
    stem = ps.stem(x)
    if stem in sentiments_dict:
        return sentiments_dict[stem]
    return 0.0

In [8]:
data = get_data_set("sarcasm_set_small.csv")

In [9]:
def get_tagging(text):
    text_and_tags = PatternTagger().tag(text.decode("utf-8").lower())
    return text_and_tags

In [10]:
def iden_pronouns(t1, t2):
    common_words = list(set(t1).intersection(set(t2)))
    for i in common_words:
        if i[1] != "PRP" or i[1] != "PRP$":
            return True
    return False

def iden_str(t1, t2):
    stop = stopwords.words()
    t1 = [i[0] for i in t1 if i[1] not in stop]
    t2 = [i[0] for i in t2 if i[1] not in stop]
    common_words = list(set(t1).intersection(set(t2)))
    if len(common_words) != 0:
        return True
    return False


def check_coherence(text):
    blob = TextBlob(text)
    sentences = blob.sentences
    if len(sentences) <= 1:
        return 0
    demonstrative = ["this", "that", "these", "those"]
    tagged_set = [TextBlob(str(sentence).lower()).tags for sentence in sentences]
    tagged_set = [i for i in tagged_set if len(i) != 0]
    for i in range(len(tagged_set) - 1):
        if iden_pronouns(tagged_set[i], tagged_set[i+1]) or iden_str(tagged_set[i], tagged_set[i+1])\
           or  (tagged_set[i+1][0][0] == "the") or (tagged_set[i+1][0][0] in demonstrative):
                continue
        else:
            return -1
    return 1
        

In [11]:
def get_score(word):
    scores = []
    ss = senti_score(word)
    sn = TextBlob(word + " ").polarity * 5.0
    if ss != 0:
        scores.append(ss)
    if sn != 0:
        scores.append(sn)
    return np.mean(scores or [0])
        
        
def get_scores(filtered_text):
    scores = [get_score(w[0]) for w in filtered_text]
    pos_scores = [s for s in scores if s > 0]
    neg_scores = [n for n in scores if n < 0]
    return (sum(pos_scores or [0]), sum(neg_scores or [0]))
    

def get_emotional_scores(df):
    df = df.copy()
    df["sn"] = df["tagged_words"].apply(lambda x: get_scores(x))
    df["sum_pos_score"] = df["sn"].apply(lambda x: x[0])
    df["sum_neg_score"] = df["sn"].apply(lambda x: x[1])
    df = df.drop("sn", axis=1)
    return df

In [12]:
def get_contra(coh, pos, neg):
    if coh == 0:
        if (abs(pos) > 0) and (abs(neg) > 0):
            return 1
    return 0

def get_contra_coher(coh, pos, neg):
    if coh == 1:
        if (abs(pos) > 0) and (abs(neg) > 0):
            return 1
    return 0

def get_contraditional_features(df):
    df = df.copy()
    df["contra"] = df[["coherence", "sum_pos_score", 
                       "sum_neg_score"]].apply(lambda x: get_contra(x[0], x[1], x[2]),axis=1)
    df["contra_coher"] =  df[["coherence", "sum_pos_score",
                              "sum_neg_score"]].apply(lambda x: get_contra_coher(x[0], x[1], x[2]),axis=1)
    return df
    

In [13]:
def get_sentiment_features(df):
    df = df.copy()
    df["pos_low"] = df["sum_pos_score"].apply(lambda x: int(x<= 0))
    df["pos_medium"] = df["sum_pos_score"].apply(lambda x: int((x > 0) and (x <= 1)))
    df["pos_high"] = df["sum_pos_score"].apply(lambda x: int(x >= 2))
    df["neg_low"] = df["sum_pos_score"].apply(lambda x: int(abs(x) <= 0))
    df["neg_medium"] = df["sum_pos_score"].apply(lambda x:  int((abs(x) > 0) and (abs(x) <= 1)))
    df["neg_high"] = df["sum_pos_score"].apply(lambda x: int(abs(x) >= 2))
    df = df.drop("sum_pos_score", axis=1)
    df = df.drop("sum_neg_score", axis=1)
    return df

In [14]:
def get_rep_punct(text):
    punct = [',', '.', '?', '!']
    count = 0
    for i in range(len(text)-1):
        if (text[i] in punct) and (text[i] == text[i+1]):
            count+=1
    return count


def get_rep_chars(text):
    count = 0
    for i in range(len(text)-1):
        if (text[i].isalpha()) and (text[i] == text[i+1]):
            count+=1
    return count

In [15]:
def get_punct_3_f(df, col_list):
    df = df.copy()
    for col in col_list:
        df[col + "_low"] = df[col].apply(lambda x: int(x ==0))
        df[col + "_medium"] = df[col].apply(lambda x: int((x >= 1) and (x <= 3)))
        df[col + "_high"] = df[col].apply(lambda x: int(x >= 4))
        df = df.drop(col, axis=1)
    return df

In [16]:
def get_punctuation_and_sp_simbols_feature(df):
    df = df.copy()
    df["idioms_number"] = df["text"].apply(lambda x: len([i for i in idioms_dict if i in x]))
    df["exlamation_number"] = df["text"].apply(lambda x: x.count("!"))
    df["slang_and_booster"] = df["text"].apply(lambda x: len([i for i in slang_dict if i in x.lower()]) + 
                                                             len([i for i in booster_dict if i in x.lower()]))
    df["original_tagged_words"] = df["text"].apply(lambda x: TextBlob(x.decode("utf-8"),pos_tagger=PatternTagger()).tags)
    df["number_capitalized"] = df["text"].apply(lambda x: len([i for i in x if i[0].isupper()]))
    df = df.drop("original_tagged_words", axis=1)
    df["emoticons_number"] = df["text"].apply(lambda x: len([i for i in emoticon_dict if i in x]))
    df["rep_punc"] = df["text"].apply(lambda x: get_rep_punct(x))
    df["rep_chars"] = df["text"].apply(lambda x: get_rep_chars(x))
    df = get_punct_3_f(df, ["idioms_number", "exlamation_number", "slang_and_booster", "number_capitalized",
                            "emoticons_number", "rep_punc", "rep_chars"])
    return df

In [17]:
def create_features(df):
    df = df.copy()
    df["tagged_text"] = df["text"].apply(lambda x: get_tagging(x))
    df["tagged_words"] = df["text"].apply(lambda x: TextBlob(x.encode("utf-8").lower(),pos_tagger=PatternTagger()).tags)
    df["coherence"] = df["text"].apply(lambda x: check_coherence(x))
    df = get_emotional_scores(df)
    df = get_contraditional_features(df)
    df = get_sentiment_features(df)
    df = get_punctuation_and_sp_simbols_feature(df)
    df = df.drop("tagged_text", axis=1)
    df = df.drop("tagged_words", axis=1)
    return df

In [20]:
data_f = create_features(data)

In [18]:
def get_scales_features(train, test, columns):
    scaler = StandardScaler()
    l_train = scaler.fit_transform(train[columns].astype(float))
    l_test = scaler.transform(test[columns].astype(float))
    return l_train, l_test

In [19]:
def get_n_gram_features(train, test, column, n_range):
    trigram_vectorizer = CountVectorizer(ngram_range=(1, n_range), min_df=1)
    X_train = trigram_vectorizer.fit_transform(train[column])
    X_test = trigram_vectorizer.transform(test[column])
    return X_train, X_test

In [20]:
def get_full_feature_set(train, test, columns_f, n_gram_col, r):
    train = train.copy()
    test = test.copy()
    s_train, s_test = get_scales_features(train, test, columns_f)
    ngram_train, ngram_test = get_n_gram_features(train, test, n_gram_col, r)
    s_train = sps.hstack((s_train, ngram_train))
    s_test = sps.hstack((s_test, ngram_test))
    return s_train, s_test

In [21]:
def create_tt_sets(trains, test, columns_f, n_gram_col, r):
    new_trains = []
    new_tests = []
    for train, test in tqdm(zip(trains, test)):
        train, test = get_full_feature_set(train, test, columns_f, n_gram_col, r)
        new_trains.append(train)
        new_tests.append(test)
    return new_trains, new_tests

In [40]:
def get_pred(y_proba, thr):
    y_pr = []
    for i in y_proba[:,1]:
        if i < thr:
            y_pr.append(0)
        else:
            y_pr.append(1)
    return y_pr


def evaluation(clf, trains, tests, ans_trains, ans_tests, proba):
    f_scores = []
    recall_scores = []
    precision_scores = []
    accuracy_scores = []
    auc_scores = []
    predictions = []
    probas = []
    for train, test, y_train, y_test in tqdm(zip(trains, tests, y_trains, y_tests)):
        clf.fit(train, y_train)
        y_pred = clf.predict(test)
        if proba:
            y_proba = clf.predict_proba(test)
            probas.append(y_proba)
            #y_pred = get_pred(y_proba, thr)
            auc_scores.append(roc_auc_score(y_test, y_proba[:,1]))
        predictions.append(y_pred)
        f_scores.append(f1_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        
    return f_scores, recall_scores, precision_scores, accuracy_scores, auc_scores, predictions, probas

In [26]:
def get_trains_tests(data_f):
    skf = KFold(len(data_f), n_folds=10, shuffle=False, random_state=None)
    trains = []
    tests = []
    y_trains = []
    y_tests = []
    data_1_1 = shuffle(data_f)
    for train, test in skf:
        trains.append(data_f.iloc[train]) 
        tests.append(data_f.iloc[test])
        y_trains.append(data_f.sarc.iloc[train])
        y_tests.append(data_f.sarc.iloc[test])
    return trains, tests, y_trains, y_tests

In [27]:
trains, tests, y_trains, y_tests = get_trains_tests(data_f)
clf = LogisticRegression(C=0.6)
log_features = data_f.columns.difference(set(["text", "sarc", "coherence"]))
log_trains, log_tests = create_tt_sets(trains, tests, log_features, "text", 3)
log_f1, log_recall, log_pr, log_acc, log_auc, log_predict, log_probas = evaluation(clf, log_trains, log_tests,
                                                                       y_trains, y_tests, True)



In [28]:
print "log avg f1:", np.mean(log_f1)
print "log avg recall:", np.mean(log_recall)
print "log avg precision:", np.mean(log_pr)
print "log avg accuracy:", np.mean(log_acc)
print "log avg auc:", np.mean(log_auc)

log avg f1: 0.745852386884
log avg recall: 0.755726758129
log avg precision: 0.736679766066
log avg accuracy: 0.742778943527
log avg auc: 0.825570028267


In [32]:
def load_data_sets():
    trains = []
    tests = []
    y_trains = []
    y_tests = []
    for i in tqdm(range(10)):
        train = pd.read_csv("data_train_kf_"+str(i)+".csv", encoding="utf-8")
        test = pd.read_csv("data_test_kf_"+str(i)+".csv", encoding="utf-8")
        x_train = create_features(train)
        trains.append(x_train)
        y_train = x_train.sarc
        y_trains.append(y_train)
        x_test = create_features(test)
        tests.append(x_test)
        y_test = x_test.sarc
        y_tests.append(y_test)
    return trains, tests, y_trains, y_tests    

In [33]:
trains, tests, y_trains, y_tests = load_data_sets()




In [41]:
clf = LogisticRegression(C=0.6)
log_features = trains[0].columns.difference(set(["text", "sarc", "coherence"]))
log_trains, log_tests = create_tt_sets(trains, tests, log_features, "text", 3)
log_f1, log_recall, log_pr, log_acc, log_auc, log_predict, log_probas = evaluation(clf, log_trains, log_tests,
                                                                       y_trains, y_tests, True)



In [47]:
def store_answers(alg_name, prd, probas):
    i = 0
    for y_pr, y_proba in zip(prd, probas):
        pd.Series(list(y_pr)).to_csv(alg_name +"_y_prd_kf_"+str(i)+".csv")
        pd.Series(list(y_proba[:,1])).to_csv(alg_name +"_y_prb_kf_"+str(i)+".csv")
        i += 1

In [48]:
store_answers("con", log_predict, log_probas)

In [44]:
print "log avg f1:", np.mean(log_f1)
print "log avg recall:", np.mean(log_recall)
print "log avg precision:", np.mean(log_pr)
print "log avg accuracy:", np.mean(log_acc)
print "log avg auc:", np.mean(log_auc)

log avg f1: 0.748300806964
log avg recall: 0.756898896539
log avg precision: 0.740420700415
log avg accuracy: 0.745578262324
log avg auc: 0.823564116121
