In [1]:
import pandas as pd
import os
import librosa
import numpy as np
import scipy
import re


#Data handling
from sklearn.model_selection import train_test_split

#Pickling
from six.moves import cPickle as pickle

# Models 
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#Metrics
from sklearn.metrics import recall_score, precision_score, f1_score

#HTTP
import requests
import json

#
import collections

import matplotlib.pyplot as plt
import matplotlib.style as ms
import librosa.display
import IPython.display as ipd
import seaborn
import codecs
ms.use('seaborn-muted')
%matplotlib inline

no_alignment_file = [4764]
wrong_alignment = [3730]

In [2]:
pfief_path = 'Pattern_construction_code/luis_pattern_half/patterns_ignore_5'
# pat_table = pd.read_csv('Pattern_construction_code/luis_pattern_half/patterns_ignore_5',sep='\t')
# pat_table

with codecs.open(pfief_path,'r','utf-8') as content_file:
    content = content_file.read()
len(set(map(lambda x: x.split('\t')[0] ,content.split('\n'))))   

44

# Models

In [3]:
import basic_models

In [4]:
def get_pattern(text):
    text = json.dumps(text)
    url = 'http://192.168.2.101:7878/api/get_patt'
    data = dict(input_tweets = text)
    resp = requests.post(url=url, data=data)
    r = json.loads(resp.text)
    return map(lambda x: x['pattern'],r)
    
def get_deep_emotion(text):
    text = json.dumps(text)
    url = 'http://192.168.2.101:7878/api/get_emo'
    data = dict(input_tweets = text)
    resp = requests.post(url=url, data=data)
    r = json.loads(resp.text)
    return r

def clean_text(text, remove_actions = True):
    punct_str = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~«»“…‘”'
    if(remove_actions):
        text = re.sub(r" ?\[[^)]+\]", "", text)
    for p in punct_str:
        text = text.replace(p,' ')
    text = re.sub(' +', ' ', text)
    return text.lower().strip()

def get_f1_score(precision,recall):
    return 2 * (precision * recall) / (precision + recall)

def get_patterns_load(data,patterns_df, emotion):
    index = data[data.emotion == emotion ]['index']
    patterns = patterns_df.loc[list(index)]
    load = []
    for pat in patterns.iterrows():
        load = load + list(pat[1].dropna())
    return load


def extract_patterns(data,extract=False):
    if(extract):
        patterns = {}
        for index, row in data.iterrows():
            patterns[row['index']] = set(get_pattern([row['text']])[0].values())
            print('Extracted pattern from '+ row['index'] + ' index:'+ str(index))
            print('Size: ', len(patterns[row['index']]), 'Patterns size', len(patterns))
        try:
            print('Saving Pickle')
            with open('pickles/patterns/pattern.pickle','wb') as f:
                save = {
                    'patterns' : patterns
                }
                pickle.dump(save,f,pickle.HIGHEST_PROTOCOL)
                print('Successfully saved in pattern.pickle')
                return patterns
        except Exception as e:
            print('Unable to save data to pickle', e)
            print('Patterns probably not saved.')
            return patterns
    else:
        try:
            with open('pickles/patterns/pattern.pickle','rb') as f:
                save = pickle.load(f)
                patterns = save['patterns']
                del save
                returning = {}
                for key in list(data['index']):
                    returning[key] = patterns[key]
                return returning
        except Exception as e:
            print('Error loading base datasets pickle: ', e)
            
def build_emotions_counter(data,patterns_df):
    emotions_counter ={}
    emotions_list = list(data['emotion'].unique())
    for emotion in emotions_list:
        load = get_patterns_load(data,patterns_df,emotion)
        emotions_counter[emotion] = collections.Counter(load)
    return emotions_counter

def build_frequencyframe(all_patterns,emotions_counter):
    df_patt = {}
    for pattern in all_patterns:
        df_patt[pattern] = {}
        for emotion in emotions_counter:
            df_patt[pattern][emotion] = emotions_counter[emotion][pattern]
    return pd.DataFrame(df_patt).T

def build_pfief(df_patt):
    ief = ((df_patt+1).rdiv(df_patt.sum(axis=1)+1, axis=0)+1).apply(np.log10)
    pf = ((df_patt.sum(axis=0)+1)/(df_patt+1)).apply(np.log10)
    return ief * pf

def balance_data(data):
    min_sample = min(data.groupby('emotion').count()['index'])
    emotions_list = list(data['emotion'].unique())
    samples = []
    for emotion in emotions_list:
        samples.append(data[data.emotion == emotion].sample(n=min_sample))
    result = pd.concat(samples).sample(frac=1)
    return result
        
def two_emotions(data,emotional_mapping,emotion1,emotion2):
    emotion_code = emotional_mapping[emotion1]
    emotion_sample = data[data.emotion_code == emotion_code]
    emotion_code2 = emotional_mapping[emotion2]
    emotion_sample2 = data[data.emotion_code == emotion_code2]
    if(len(emotion_sample2) < len(emotion_sample)):
        emotion_sample = emotion_sample.sample(n=len(emotion_sample2))
    else:
        emotion_sample2 = emotion_sample2.sample(n=len(emotion_sample))
    sample = pd.concat([emotion_sample,emotion_sample2]).sample(frac=1)
    return sample

def filter_word_count(data, n_count):
    return data[list(map(lambda x: len(x.split(' ')) >= n_count,data['text']))]

def remove_empty_patterns(data,patterns):
    empty_patterns = [k for k, v in patterns.items() if len(v) < 1]
    patterns = { k:v for k, v in patterns.items() if len(v) > 1 }
    data = filter(lambda x: x[1]['index'] not in empty_patterns ,data.iterrows())
    data = pd.DataFrame.from_items(data).T
    return data,patterns





In [5]:
def load_data(word_count,emotional_mapping):
    # full = generate_IEMOCAP_df()
    data = pd.read_csv('data/IEMOCAP_sentences_votebased.csv',index_col=0)
    data['emotion_code'] = data['emotion'].map( emotional_mapping ).astype(int)
    # Take away fear, surprise,disgust, xxx and others. Not enough data
    data = data[data.emotion_code < 4]
    # Clean Transcripts
    try:
        data = data.drop(no_alignment_file)
    except Exception as e:
        print('Error at: ',e)
    # Remove rows that have wrong Alignment file
    try:
        data = data.drop(wrong_alignment)
    except Exception as e:
        print('Error at: ',e)
    data['text'] = data['text'].apply(clean_text)
    # Filter Word Count
    data = filter_word_count(data, word_count)
    patterns = extract_patterns(data)
    data,patterns = remove_empty_patterns(data,patterns)
    return data,patterns

def build_model(data,patterns):
    transcript_order = list(data['index'])
    patterns_df = pd.DataFrame.from_dict(patterns, orient='index')
    patterns_df = patterns_df.loc[transcript_order]
    emotions_counter = build_emotions_counter(X_train,patterns_df)
    all_patterns = []
    for pat in patterns_df.iterrows():
        all_patterns = all_patterns + list(pat[1].dropna())
        
    df_patt = build_frequencyframe(all_patterns,emotions_counter)
    em_df = build_pfief(df_patt)
    return em_df

def get_frequency_vectors(data,patterns_list):
    patterns = extract_patterns(data)
    transcript_order = list(data['index'])
    frequency_vectors = []
    for index in patterns:
        frequency_vectors.append(np.isin(patterns_list,np.array(list(patterns[index]))))
    vectors = pd.DataFrame(frequency_vectors,columns=patterns_list,index=patterns.keys())
    vectors = vectors.loc[transcript_order]
    vectors = vectors * 1
    return vectors
    
def calculate_scores(em_df,vectors):
    em_matrix = em_df.T.as_matrix()
    emotional_scores = []
    for index, vector in vectors.iterrows():
        emotional_scores.append(em_matrix.dot(vector))
    emotions_list = list(em_df.columns)
    scores = pd.DataFrame(emotional_scores,columns=emotions_list,index=list(vectors.index))
    scores['pred_emotion'] = list(map(lambda x: x[1].idxmin(),scores.iterrows()))
    scores['pred_code'] = scores['pred_emotion'].map(emotional_mapping).astype(int)
    return scores

In [6]:
emotional_mapping = {'ang': 0, 'sad': 1, 'hap': 2, 'neu': 3,'fru': 4,'exc': 5,'fea': 6,'sur': 7,'dis': 8, 'xxx':9,'oth':10}
data,patterns = load_data(3,emotional_mapping)
# data = two_emotions(data,emotional_mapping,'sad','exc')
# Balance Data
# data = balance_data(data)
y = data.emotion_code
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2)

X_train.groupby('emotion').count()['index'] #  6,453 Total
X_test.groupby('emotion').count()['index']

Error at:  labels [4764] not contained in axis


emotion
ang    235
hap    141
neu    276
sad    190
Name: index, dtype: int64

In [7]:
em_df = build_model(X_train,patterns)
patterns_list = np.array(list(em_df.index))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [8]:
print(len(em_df))
# em_df.head()
# em_df.to_pickle('')

5928


# Score - Training Data

In [9]:
vectors = get_frequency_vectors(X_train,patterns_list)

In [10]:
scores = calculate_scores(em_df,vectors)
pred_y = list(scores['pred_code'])

In [11]:
# pred_y, y_train
precision = precision_score(list(y_train),pred_y,average='macro')
recall = recall_score(list(y_train),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_train),pred_y,average='micro')
recall = recall_score(list(y_train),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

Precision Macro 0.7560774144162135
Recall Macro 0.6190157085313921
F1 Macro 0.6807157836499023
 
Precision Micro 0.6830065359477124
Recall Micro 0.6830065359477124
F1 Micro 0.6830065359477124
 
Precision Weighted 0.7273584910326966
Recall Weighted 0.6830065359477124
F1 Weighted 0.7044851422840877


#  Testing

In [12]:
vectors = get_frequency_vectors(X_test,patterns_list)

In [13]:
scores = calculate_scores(em_df,vectors)
pred_y = list(scores['pred_code'])

In [14]:
precision = precision_score(list(y_test),pred_y,average='macro')
recall = recall_score(list(y_test),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='micro')
recall = recall_score(list(y_test),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

Precision Macro 0.6081145163271687
Recall Macro 0.4898913854942629
F1 Macro 0.5426383636891221
 
Precision Micro 0.5498812351543944
Recall Micro 0.5498812351543944
F1 Micro 0.5498812351543944
 
Precision Weighted 0.5912229312072166
Recall Weighted 0.5498812351543944
F1 Weighted 0.5698031875572036


# Without multiple wild-card patterns

In [15]:
def remove_multiwildcard(patterns):
    for index, patt in patterns.items():
        flt_patt = {p for p in patt if p.split(' ').count('.+') == 1}
        patterns[index] = flt_patt
    return patterns

patterns = remove_multiwildcard(patterns)
# data = two_emotions(data,emotional_mapping,'sad','exc')
# Balance Data
# data = balance_data(data)
y = data.emotion_code
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3)

data.groupby('emotion').count()['index'] #  6,453 Total

emotion
ang    1141
hap     680
neu    1440
sad     947
Name: index, dtype: int64

In [16]:
em_df = build_model(X_train,patterns)
patterns_list = np.array(list(em_df.index))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [17]:
print(len(em_df))
em_df.head()
# em_df.to_pickle('pickles/patterns/pfief_matrix.pickle')

5614


Unnamed: 0,ang,hap,neu,sad
.+ a,1.335632,1.773339,1.233435,1.653975
.+ a big,1.514692,3.875035,2.202922,2.782855
.+ a bit,1.21858,1.937517,2.090332,2.043222
.+ a day,1.387807,3.667312,2.665174,2.600678
.+ a dog,2.075031,1.937517,1.228233,2.043222


# Score - Training Data

In [18]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(em_df,vectors)
pred_y = list(scores['pred_code'])

In [19]:
# pred_y, y_train
# precision = precision_score(list(y_train),pred_y,average='macro')
# recall = recall_score(list(y_train),pred_y,average='macro')
# f1 = get_f1_score(precision,recall)
# print('Precision Macro',precision)
# print('Recall Macro',recall)
# print('F1 Macro',f1)
# print(' ')
# precision = precision_score(list(y_train),pred_y,average='micro')
# recall = recall_score(list(y_train),pred_y,average='micro')
# f1 = get_f1_score(precision,recall)
# print('Precision Micro',precision)
# print('Recall Micro',recall)
# print('F1 Micro',f1)
# print(' ')
precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)


# Precision Macro 0.7664926811354967
# Recall Macro 0.6423609709732143
# F1 Macro 0.6989583086378197
 
# Precision Micro 0.6947368421052632
# Recall Micro 0.6947368421052632
# F1 Micro 0.6947368421052632
 
# Precision Weighted 0.7386741269995074
# Recall Weighted 0.6947368421052632
# F1 Weighted 0.7160320960247799

Precision Weighted 0.7558028038759393
Recall Weighted 0.7256366723259763
F1 Weighted 0.740412605913883


#  Testing

In [20]:
vectors = get_frequency_vectors(X_test,patterns_list)

In [21]:
scores = calculate_scores(em_df,vectors)
pred_y = list(scores['pred_code'])

In [22]:
# precision = precision_score(list(y_test),pred_y,average='macro')
# recall = recall_score(list(y_test),pred_y,average='macro')
# f1 = get_f1_score(precision,recall)
# print('Precision Macro',precision)
# print('Recall Macro',recall)
# print('F1 Macro',f1)
# print(' ')
# precision = precision_score(list(y_test),pred_y,average='micro')
# recall = recall_score(list(y_test),pred_y,average='micro')
# f1 = get_f1_score(precision,recall)
# print('Precision Micro',precision)
# print('Recall Micro',recall)
# print('F1 Micro',f1)
# print(' ')
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)


# Precision Macro 0.6295216882336137
# Recall Macro 0.5249158266314806
# F1 Macro 0.5724794856483989
 
# Precision Micro 0.5874901029295329
# Recall Micro 0.5874901029295329
# F1 Micro 0.5874901029295329
 
# Precision Weighted 0.6153167839850325
# Recall Weighted 0.5874901029295329
# F1 Weighted 0.6010815612885869

Precision Weighted 0.6200686604247473
Recall Weighted 0.5874901029295329
F1 Weighted 0.6033399155241551


# Acoustic Weights

In [99]:
try:
    with open('pickles/patterns/scaledmfcc20_pattern_features4emo.pickle','rb') as f:
        save = pickle.load(f)
        full_feature_table = save['full_feature_table']
        wc_feature_table = save['wc_feature_table']
        cw_feature_table = save['cw_feature_table']
        del save
except Exception as e:
    print('Error loading pattern features pickle: ', e)
    

def calculate_final_matrix(em_df,matrix):
    final = []
    for val in em_df.iterrows():
        final.append(val[1] * matrix.loc[val[0]])
    return pd.DataFrame(final)

def calculate_final_matrix_mul(em_df,matrix):
    final = []
    for val in em_df.iterrows():
        final.append(val[1] + (val[1] * matrix.loc[val[0]]))
    return pd.DataFrame(final)

def calculate_final_matrix_sum(em_df,matrix):
    final = []
    for val in em_df.iterrows():
        final.append(val[1] + matrix.loc[val[0]])
    return pd.DataFrame(final)


def build_acumatrix(data,feature_table,saveToPickle = False, savePath = ''):
    matrix = {}
    emotions_list = list(data['emotion'].unique())
    for index, row in data.iterrows():
        emo = row.emotion
        key = row['index']
        patts = feature_table[key].keys()
        for patt in patts:
            tpatt = patt.split('_')[1]
            if(tpatt not in matrix):
                matrix[tpatt] = {}
            if(emo not in matrix[tpatt]):
                matrix[tpatt][emo] = []
            matrix[tpatt][emo].append(feature_table[key][patt])
    for val in matrix:
        for emo in matrix[val].keys():
            matrix[val][emo] = np.mean(matrix[val][emo])
    matrix = pd.DataFrame(matrix).T
    if(saveToPickle and savePath != ''):
        matrix.to_pickle(savePath)
    return matrix

def build_multiacumatrix(data,feature_table,saveToPickle = False, savePath = '',size = 20):
    multimatrix = []
    for i in range(size):
        multimatrix.append(dict())
    emotions_list = list(data['emotion'].unique())
    for index, row in data.iterrows():
        emo = row.emotion
        key = row['index']
        patts = feature_table[key].keys()
        for patt in patts:
            tpatt = patt.split('_')[1]
            if(tpatt not in multimatrix[0]):
                for matrix in multimatrix:
                    matrix[tpatt] = {}
                    for emotion in emotions_list:
                        matrix[tpatt][emotion] = []
            for i in range(size):
                sub_feature = feature_table[key][patt][i]
                multimatrix[i][tpatt][emo].append(sub_feature)
    for i in range(size):
        for val in multimatrix[i]:
            for emo in multimatrix[i][val].keys():
                if(len(multimatrix[i][val][emo]) > 0):
                    multimatrix[i][val][emo] = np.mean(multimatrix[i][val][emo])
                else:
                    multimatrix[i][val][emo] = np.nan()
        multimatrix[i] = pd.DataFrame(multimatrix[i]).T
    if(saveToPickle and savePath != ''):
        try:
            print('Saving Pickle')
            with open(savePath,'wb') as f:
                save = {
                    'multimatrix' : multimatrix
                }
                pickle.dump(save,f,pickle.HIGHEST_PROTOCOL)
                print('Successfully saved matrix to '+savePath)
        except Exception as e:
            print('Unable to save data to pickle', e)
            print('Patterns probably not saved.')
    return multimatrix

def get_cumatrix(filePath):
    try:
        matrix = pd.read_pickle(filePath)
        return matrix
    except Exception as e:
        print('Error loading matrix: ', e)

def get_multiacumatrix(filePath):
    try:
        with open(filePath,'rb') as f:
            save = pickle.load(f)
            multimatrix = save['multimatrix']
            del save
            return multimatrix
    except Exception as e:
        print('Error loading matrix: ', e)

In [108]:
# full_matrix = build_acumatrix(X_train,full_feature_table)
# full_matrix = full_matrix.fillna(np.max(full_matrix))

# wc_matrix = build_acumatrix(X_train,wc_feature_table)
# wc_matrix = wc_matrix.fillna(np.max(wc_matrix))

# cw_matrix = build_acumatrix(X_train,cw_feature_table)
# cw_matrix = cw_matrix.fillna(np.max(cw_matrix))

#FOR MFCC 20 

# full_matrices = build_multiacumatrix(X_train,full_feature_table,saveToPickle=True,savePath = 'pickles/patterns/full_mfcc20_matrix.pickle')
# for i in range(20):
#     full_matrices[i] = full_matrices[i].fillna(np.max(full_matrices[i]))
# full_matrices[0]
# wc_matrices = build_multiacumatrix(X_train,wc_feature_table,saveToPickle=True,savePath = 'pickles/patterns/wc_mfcc20_matrix.pickle')
# for i in wc_matrices:
#     wc_matrices[i] = wc_matrices[i].fillna(np.max(wc_matrices[i]))
# # wc_matrices = wc_matrix.fillna(np.max(wc_matrix))

# cw_matrices = build_multiacumatrix(X_train,cw_feature_table,saveToPickle=True,savePath= 'pickles/patterns/cw_mfcc20_matrix.pickle')
# for i in cw_matrices:
#     cw_matrices[i] = cw_matrices[i].fillna(np.max(cw_matrices[i]))
# cw_matrix = cw_matrix.fillna(np.max(cw_matrix))


nan

In [101]:
# full_matrix

np.mean([])

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


nan

In [432]:
summatrix = calculate_final_matrix_sum(em_df,full_matrix)
mulmatrix = calculate_final_matrix(em_df,full_matrix)
mul2matrix = calculate_final_matrix_mul(em_df,full_matrix)
em_df.head()

Unnamed: 0,ang,hap,neu,sad
.+ a,1.330367,1.728037,1.196536,1.769234
.+ a big,1.623602,4.072397,1.932672,2.933993
.+ a bit,1.603251,2.451827,1.622258,2.56715
.+ a day,1.380523,3.677741,2.662913,2.588625
.+ a few,3.026475,1.799399,1.435312,2.980369


In [433]:
set1 =set(full_matrix.index)
set2 = set(em_df.index)

In [434]:
set1.difference(set2)

{'.+ pants'}

In [435]:
mulmatrix.head()

Unnamed: 0,ang,hap,neu,sad
.+ a,0.263903,0.226839,0.142655,0.161814
.+ a big,0.409879,0.0,0.097358,0.226219
.+ a bit,0.117136,0.0,0.213028,0.0
.+ a day,0.250383,0.0,0.179725,0.385733
.+ a few,0.023776,0.223846,0.140013,0.0


In [436]:
print(summatrix.shape,mulmatrix.shape,em_df.shape,full_matrix.shape,cw_matrix.shape,wc_matrix.shape)

(5611, 4) (5611, 4) (5611, 4) (5612, 4) (5612, 4) (5612, 4)


# Full Pattern Feature

# Train

In [437]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

In [438]:
precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.24978847049199454
0.2601018675721562
0.2548408660561287


In [439]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(summatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.754382442165381
0.7140916808149406
0.7336843294314903


In [440]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(mul2matrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.74539261834061
0.700169779286927
0.7220738252768559


## Testing

In [441]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

In [442]:
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.2355038243640479
0.2596991290577989
0.24701039302980007


  'precision', 'predicted', average, warn_for)


In [443]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(mul2matrix,vectors)
pred_y = list(scores['pred_code'])
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.6018295286493484
0.5827395091053048
0.5921306954889984


In [444]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(summatrix,vectors)
pred_y = list(scores['pred_code'])
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.604586204950291
0.5922406967537609
0.5983497776623551


# WildCard Weights

## Train

In [445]:
summatrix = calculate_final_matrix_sum(em_df,wc_matrix)
mulmatrix = calculate_final_matrix(em_df,wc_matrix)
mul2matrix = calculate_final_matrix_mul(em_df,wc_matrix)
em_df.head()

# mulmatrix = em_df + matrix


Unnamed: 0,ang,hap,neu,sad
.+ a,1.330367,1.728037,1.196536,1.769234
.+ a big,1.623602,4.072397,1.932672,2.933993
.+ a bit,1.603251,2.451827,1.622258,2.56715
.+ a day,1.380523,3.677741,2.662913,2.588625
.+ a few,3.026475,1.799399,1.435312,2.980369


In [446]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(summatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.7553726542222787
0.7147707979626485
0.7345110628696748


In [447]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.2352463709263634
0.2614601018675722
0.2476615203357769


In [448]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(mul2matrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.7480388196605868
0.7052631578947368
0.7260214715583349


## Test

In [449]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(summatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.6004792694299194
0.5914489311163895
0.5959298922520592


In [450]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.2094549375087102
0.26128266033254155
0.23251570957167425


In [451]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(mul2matrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.598300604043489
0.5851148060174188
0.5916342459272065


# ContentWord Weights

In [452]:
summatrix = calculate_final_matrix_sum(em_df,cw_matrix)
mulmatrix = calculate_final_matrix(em_df,cw_matrix)
mul2matrix = calculate_final_matrix_mul(em_df,cw_matrix)
em_df.head()

Unnamed: 0,ang,hap,neu,sad
.+ a,1.330367,1.728037,1.196536,1.769234
.+ a big,1.623602,4.072397,1.932672,2.933993
.+ a bit,1.603251,2.451827,1.622258,2.56715
.+ a day,1.380523,3.677741,2.662913,2.588625
.+ a few,3.026475,1.799399,1.435312,2.980369


## Train

In [453]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(summatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.7556017582404628
0.7171477079796265
0.7358727081508172


In [454]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.2547370178861727
0.26553480475382
0.2600238620063823


In [455]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(mul2matrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.7444354201785992
0.702546689303905
0.7228847356434576


## Test

In [456]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(summatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.6038484179750068
0.5922406967537609
0.5979882324675635


In [457]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.26036006843556897
0.26920031670625494
0.26470640496171755


In [458]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(mul2matrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print(precision)
print(recall)
print(f1)

0.6013929020878835
0.5890736342042755
0.5951695265974676
