In [34]:
import pandas as pd
import os
import librosa
import numpy as np
import scipy
import re


#Data handling
from sklearn.model_selection import train_test_split

#Pickling
from six.moves import cPickle as pickle

# Models 
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

#Metrics
from sklearn.metrics import recall_score, precision_score, f1_score

#HTTP
import requests
import json

#
import collections

import matplotlib.pyplot as plt
import matplotlib.style as ms
import librosa.display
import IPython.display as ipd
import seaborn
import codecs
ms.use('seaborn-muted')
%matplotlib inline

no_alignment_file = [4764]
wrong_alignment = [3730]

In [35]:
pfief_path = 'Pattern_construction_code/luis_pattern_half/patterns_ignore_5'
# pat_table = pd.read_csv('Pattern_construction_code/luis_pattern_half/patterns_ignore_5',sep='\t')
# pat_table

with codecs.open(pfief_path,'r','utf-8') as content_file:
    content = content_file.read()
len(set(map(lambda x: x.split('\t')[0] ,content.split('\n'))))   

44

# Models

In [36]:
import basic_models

In [37]:
def get_pattern(text):
    text = json.dumps(text)
    url = 'http://192.168.2.101:7878/api/get_patt'
    data = dict(input_tweets = text)
    resp = requests.post(url=url, data=data)
    r = json.loads(resp.text)
    return map(lambda x: x['pattern'],r)
    
def get_deep_emotion(text):
    text = json.dumps(text)
    url = 'http://192.168.2.101:7878/api/get_emo'
    data = dict(input_tweets = text)
    resp = requests.post(url=url, data=data)
    r = json.loads(resp.text)
    return r

def clean_text(text, remove_actions = True):
    punct_str = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~«»“…‘”'
    if(remove_actions):
        text = re.sub(r" ?\[[^)]+\]", "", text)
    for p in punct_str:
        text = text.replace(p,' ')
    text = re.sub(' +', ' ', text)
    return text.lower().strip()

def get_f1_score(precision,recall):
    return 2 * (precision * recall) / (precision + recall)

def get_patterns_load(data,patterns_df, emotion):
    index = data[data.emotion == emotion ]['index']
    patterns = patterns_df.loc[list(index)]
    load = []
    for pat in patterns.iterrows():
        load = load + list(pat[1].dropna())
    return load


def extract_patterns(data,extract=False):
    if(extract):
        patterns = {}
        for index, row in data.iterrows():
            patterns[row['index']] = set(get_pattern([row['text']])[0].values())
            print('Extracted pattern from '+ row['index'] + ' index:'+ str(index))
            print('Size: ', len(patterns[row['index']]), 'Patterns size', len(patterns))
        try:
            print('Saving Pickle')
            with open('pickles/patterns/pattern.pickle','wb') as f:
                save = {
                    'patterns' : patterns
                }
                pickle.dump(save,f,pickle.HIGHEST_PROTOCOL)
                print('Successfully saved in pattern.pickle')
                return patterns
        except Exception as e:
            print('Unable to save data to pickle', e)
            print('Patterns probably not saved.')
            return patterns
    else:
        try:
            with open('pickles/patterns/pattern.pickle','rb') as f:
                save = pickle.load(f)
                patterns = save['patterns']
                del save
                returning = {}
                for key in list(data['index']):
                    returning[key] = patterns[key]
                return returning
        except Exception as e:
            print('Error loading base datasets pickle: ', e)
            
def build_emotions_counter(data,patterns_df):
    emotions_counter ={}
    emotions_list = list(data['emotion'].unique())
    for emotion in emotions_list:
        load = get_patterns_load(data,patterns_df,emotion)
        emotions_counter[emotion] = collections.Counter(load)
    return emotions_counter

def build_frequencyframe(all_patterns,emotions_counter):
    df_patt = {}
    for pattern in all_patterns:
        df_patt[pattern] = {}
        for emotion in emotions_counter:
            df_patt[pattern][emotion] = emotions_counter[emotion][pattern]
    return pd.DataFrame(df_patt).T

def build_pfief(df_patt):
    ief = ((df_patt+1).rdiv(df_patt.sum(axis=1)+1, axis=0)+1).apply(np.log10)
    pf = ((df_patt.sum(axis=0)+1)/(df_patt+1)).apply(np.log10)
    return ief * pf

def balance_data(data):
    min_sample = min(data.groupby('emotion').count()['index'])
    emotions_list = list(data['emotion'].unique())
    samples = []
    for emotion in emotions_list:
        samples.append(data[data.emotion == emotion].sample(n=min_sample))
    result = pd.concat(samples).sample(frac=1)
    return result
        
def two_emotions(data,emotional_mapping,emotion1,emotion2):
    emotion_code = emotional_mapping[emotion1]
    emotion_sample = data[data.emotion_code == emotion_code]
    emotion_code2 = emotional_mapping[emotion2]
    emotion_sample2 = data[data.emotion_code == emotion_code2]
    if(len(emotion_sample2) < len(emotion_sample)):
        emotion_sample = emotion_sample.sample(n=len(emotion_sample2))
    else:
        emotion_sample2 = emotion_sample2.sample(n=len(emotion_sample))
    sample = pd.concat([emotion_sample,emotion_sample2]).sample(frac=1)
    return sample

def filter_word_count(data, n_count):
    return data[list(map(lambda x: len(x.split(' ')) >= n_count,data['text']))]

def remove_empty_patterns(data,patterns):
    empty_patterns = [k for k, v in patterns.items() if len(v) < 1]
    patterns = { k:v for k, v in patterns.items() if len(v) > 1 }
    data = filter(lambda x: x[1]['index'] not in empty_patterns ,data.iterrows())
    data = pd.DataFrame.from_items(data).T
    return data,patterns





In [38]:
def load_data(word_count,emotional_mapping):
    # full = generate_IEMOCAP_df()
    data = pd.read_csv('data/IEMOCAP_sentences_votebased.csv',index_col=0)
    data['emotion_code'] = data['emotion'].map( emotional_mapping ).astype(int)
    # Take away fear, surprise,disgust, xxx and others. Not enough data
    data = data[data.emotion_code < 4]
    # Clean Transcripts
    try:
        data = data.drop(no_alignment_file)
    except Exception as e:
        print('Error at: ',e)
    # Remove rows that have wrong Alignment file
    try:
        data = data.drop(wrong_alignment)
    except Exception as e:
        print('Error at: ',e)
    data['text'] = data['text'].apply(clean_text)
    # Filter Word Count
    data = filter_word_count(data, word_count)
    patterns = extract_patterns(data)
    data,patterns = remove_empty_patterns(data,patterns)
    return data,patterns

def build_model(data,patterns):
    transcript_order = list(data['index'])
    patterns_df = pd.DataFrame.from_dict(patterns, orient='index')
    patterns_df = patterns_df.loc[transcript_order]
    emotions_counter = build_emotions_counter(X_train,patterns_df)
    all_patterns = []
    for pat in patterns_df.iterrows():
        all_patterns = all_patterns + list(pat[1].dropna())
        
    df_patt = build_frequencyframe(all_patterns,emotions_counter)
    em_df = build_pfief(df_patt)
    return em_df

def get_frequency_vectors(data,patterns_list):
    patterns = extract_patterns(data)
    transcript_order = list(data['index'])
    frequency_vectors = []
    for index in patterns:
        frequency_vectors.append(np.isin(patterns_list,np.array(list(patterns[index]))))
    vectors = pd.DataFrame(frequency_vectors,columns=patterns_list,index=patterns.keys())
    vectors = vectors.loc[transcript_order]
    vectors = vectors * 1
    return vectors
    
def calculate_scores(em_df,vectors):
    em_matrix = em_df.T.as_matrix()
    emotional_scores = []
    for index, vector in vectors.iterrows():
        emotional_scores.append(em_matrix.dot(vector))
    emotions_list = list(em_df.columns)
    scores = pd.DataFrame(emotional_scores,columns=emotions_list,index=list(vectors.index))
    scores['pred_emotion'] = list(map(lambda x: x[1].idxmin(),scores.iterrows()))
    scores['pred_code'] = scores['pred_emotion'].map(emotional_mapping).astype(int)
    return scores

In [39]:
emotional_mapping = {'ang': 0, 'sad': 1, 'hap': 2, 'neu': 3,'fru': 4,'exc': 5,'fea': 6,'sur': 7,'dis': 8, 'xxx':9,'oth':10}
data,patterns = load_data(3,emotional_mapping)
# data = two_emotions(data,emotional_mapping,'sad','exc')
# Balance Data
# data = balance_data(data)
y = data.emotion_code
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2)

X_train.groupby('emotion').count()['index'] #  6,453 Total
X_test.groupby('emotion').count()['index']

Error at:  labels [4764] not contained in axis


emotion
ang    220
hap    127
neu    274
sad    221
Name: index, dtype: int64

Unnamed: 0,index,start_time,end_time,text,wav_path,alignment_path,emotion,valence,arousal,dominance,gender,emotion_code
9076,Ses05M_impro02_M007,42.91,47.78,i don't have a choice i have to go if i don't ...,data/IEMOCAP_full_release/Session5/sentences/w...,data/IEMOCAP_full_release/Session5/sentences/F...,sad,2,3.6667,2.3333,M,1
280,Ses01F_impro06_F022,217.5,222.41,she used to have these poetry parties on memor...,data/IEMOCAP_full_release/Session1/sentences/w...,data/IEMOCAP_full_release/Session1/sentences/F...,sad,4,3,3.5,F,1
5055,Ses03M_impro06_M029,209.011,212.856,he actually wanted to do something,data/IEMOCAP_full_release/Session3/sentences/w...,data/IEMOCAP_full_release/Session3/sentences/F...,sad,2.5,3,3.5,M,1
5014,Ses03M_impro06_F009,100.1,103.191,how are they,data/IEMOCAP_full_release/Session3/sentences/w...,data/IEMOCAP_full_release/Session3/sentences/F...,sad,2,2,3,F,1
8565,Ses05F_script01_1_M003,31.97,38.22,i don't know but when it cracked he ran back i...,data/IEMOCAP_full_release/Session5/sentences/w...,data/IEMOCAP_full_release/Session5/sentences/F...,sad,2.5,3,3.5,M,1
4390,Ses03F_script02_2_F051,433.999,437.005,i'm sorry augie,data/IEMOCAP_full_release/Session3/sentences/w...,data/IEMOCAP_full_release/Session3/sentences/F...,sad,3,2.3333,1.6667,F,1
5034,Ses03M_impro06_M008,60.8439,68.7049,you know i was always the one to tell him to b...,data/IEMOCAP_full_release/Session3/sentences/w...,data/IEMOCAP_full_release/Session3/sentences/F...,sad,3,2.5,3.5,M,1
5431,Ses03M_script01_3_M041,370.332,375.114,otherwise what you have is loot and there's bl...,data/IEMOCAP_full_release/Session3/sentences/w...,data/IEMOCAP_full_release/Session3/sentences/F...,sad,2,3,3.5,M,1
3896,Ses03F_impro06_M002,48.6398,55.4484,i didn't know him tricia but uh i heard that h...,data/IEMOCAP_full_release/Session3/sentences/w...,data/IEMOCAP_full_release/Session3/sentences/F...,sad,2.5,1.5,3,M,1
2764,Ses02M_impro02_F008,64.7625,67.7895,i'll send you lots of pictures,data/IEMOCAP_full_release/Session2/sentences/w...,data/IEMOCAP_full_release/Session2/sentences/F...,sad,4,3,3,F,1


In [15]:
em_df = build_model(X_train,patterns)
patterns_list = np.array(list(em_df.index))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [17]:
print(len(em_df))
# em_df.head()
# em_df.to_pickle('')

5732


# Score - Training Data

In [18]:
vectors = get_frequency_vectors(X_train,patterns_list)

In [19]:
scores = calculate_scores(em_df,vectors)
pred_y = list(scores['pred_code'])

In [20]:
# pred_y, y_train
precision = precision_score(list(y_train),pred_y,average='macro')
recall = recall_score(list(y_train),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_train),pred_y,average='micro')
recall = recall_score(list(y_train),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

Precision Macro 0.03846249145519912
Recall Macro 0.07496104767939242
F1 Macro 0.05083933507700077
 
Precision Micro 0.050933786078098474
Recall Micro 0.050933786078098474
F1 Micro 0.050933786078098474
 
Precision Weighted 0.03823185184053815
Recall Weighted 0.050933786078098474
F1 Weighted 0.043678103100488655


#  Testing

In [318]:
vectors = get_frequency_vectors(X_test,patterns_list)

In [319]:
scores = calculate_scores(em_df,vectors)
pred_y = list(scores['pred_code'])

In [320]:
precision = precision_score(list(y_test),pred_y,average='macro')
recall = recall_score(list(y_test),pred_y,average='macro')
f1 = get_f1_score(precision,recall)
print('Precision Macro',precision)
print('Recall Macro',recall)
print('F1 Macro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='micro')
recall = recall_score(list(y_test),pred_y,average='micro')
f1 = get_f1_score(precision,recall)
print('Precision Micro',precision)
print('Recall Micro',recall)
print('F1 Micro',f1)
print(' ')
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

Precision Macro 0.626538733670343
Recall Macro 0.5235144012549827
F1 Macro 0.5704119923846513
 
Precision Micro 0.5930324623911323
Recall Micro 0.5930324623911323
F1 Micro 0.5930324623911323
 
Precision Weighted 0.6143713968148123
Recall Weighted 0.5930324623911323
F1 Weighted 0.6035133638141245


# Without multiple wild-card patterns

In [321]:
def remove_multiwildcard(patterns):
    for index, patt in patterns.items():
        flt_patt = {p for p in patt if p.split(' ').count('.+') == 1}
        patterns[index] = flt_patt
    return patterns

patterns = remove_multiwildcard(patterns)
# data = two_emotions(data,emotional_mapping,'sad','exc')
# Balance Data
# data = balance_data(data)
y = data.emotion_code
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3)

data.groupby('emotion').count()['index'] #  6,453 Total

emotion
ang    1141
hap     680
neu    1440
sad     947
Name: index, dtype: int64

In [322]:
em_df = build_model(X_train,patterns)
patterns_list = np.array(list(em_df.index))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [10]:
print(len(em_df))
em_df.head()
# em_df.to_pickle('pickles/patterns/pfief_matrix.pickle')

5749


# Score - Training Data

In [11]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(em_df,vectors)
pred_y = list(scores['pred_code'])

In [12]:
# pred_y, y_train
# precision = precision_score(list(y_train),pred_y,average='macro')
# recall = recall_score(list(y_train),pred_y,average='macro')
# f1 = get_f1_score(precision,recall)
# print('Precision Macro',precision)
# print('Recall Macro',recall)
# print('F1 Macro',f1)
# print(' ')
# precision = precision_score(list(y_train),pred_y,average='micro')
# recall = recall_score(list(y_train),pred_y,average='micro')
# f1 = get_f1_score(precision,recall)
# print('Precision Micro',precision)
# print('Recall Micro',recall)
# print('F1 Micro',f1)
# print(' ')
precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)


# Precision Macro 0.7664926811354967
# Recall Macro 0.6423609709732143
# F1 Macro 0.6989583086378197
 
# Precision Micro 0.6947368421052632
# Recall Micro 0.6947368421052632
# F1 Micro 0.6947368421052632
 
# Precision Weighted 0.7386741269995074
# Recall Weighted 0.6947368421052632
# F1 Weighted 0.7160320960247799

Precision Weighted 0.7311524992723947
Recall Weighted 0.699151103565365
F1 Weighted 0.7147938042338186


#  Testing

In [326]:
vectors = get_frequency_vectors(X_test,patterns_list)

In [327]:
scores = calculate_scores(em_df,vectors)
pred_y = list(scores['pred_code'])

In [328]:
# precision = precision_score(list(y_test),pred_y,average='macro')
# recall = recall_score(list(y_test),pred_y,average='macro')
# f1 = get_f1_score(precision,recall)
# print('Precision Macro',precision)
# print('Recall Macro',recall)
# print('F1 Macro',f1)
# print(' ')
# precision = precision_score(list(y_test),pred_y,average='micro')
# recall = recall_score(list(y_test),pred_y,average='micro')
# f1 = get_f1_score(precision,recall)
# print('Precision Micro',precision)
# print('Recall Micro',recall)
# print('F1 Micro',f1)
# print(' ')
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)


# Precision Macro 0.6295216882336137
# Recall Macro 0.5249158266314806
# F1 Macro 0.5724794856483989
 
# Precision Micro 0.5874901029295329
# Recall Micro 0.5874901029295329
# F1 Micro 0.5874901029295329
 
# Precision Weighted 0.6153167839850325
# Recall Weighted 0.5874901029295329
# F1 Weighted 0.6010815612885869

Precision Weighted 0.622545876774059
Recall Weighted 0.5827395091053048
F1 Weighted 0.6019853602757568


# Acoustic Weights

In [329]:
try:
    with open('pickles/patterns/scaled_pattern_features4emo.pickle','rb') as f:
        save = pickle.load(f)
        full_feature_table = save['full_feature_table']
        wc_feature_table = save['wc_feature_table']
        cw_feature_table = save['cw_feature_table']
        del save
except Exception as e:
    print('Error loading pattern features pickle: ', e)
    

def calculate_final_matrix(em_df,matrix):
    final = []
    for val, val2 in zip(em_df.iterrows(),matrix.iterrows()):
        final.append(val[1] + (val[1]*val2[1]))
    return pd.DataFrame(final)

def build_acumatrix(data,feature_table,saveToPickle = False, savePath = ''):
    matrix = {}
    emotions_list = list(data['emotion'].unique())
    for index, row in data.iterrows():
        emo = row.emotion
        key = row['index']
        patts = feature_table[key].keys()
        for patt in patts:
            tpatt = patt.split('_')[1]
            if(tpatt not in matrix):
                matrix[tpatt] = {}
            if(emo not in matrix[tpatt]):
                matrix[tpatt][emo] = []
            matrix[tpatt][emo].append(feature_table[key][patt])
    for val in matrix:
        for emo in matrix[val].keys():
            matrix[val][emo] = np.mean(matrix[val][emo])
    matrix = pd.DataFrame(matrix).T
    if(saveToPickle and savePath != ''):
        matrix.to_pickle(savePath)
    return matrix


In [330]:
matrix = build_acumatrix(X_train,full_feature_table)
matrix = matrix.fillna(np.min(matrix))

summatrix = em_df + matrix
mulmatrix = calculate_final_matrix(em_df,matrix)
em_df.head()

Unnamed: 0,ang,hap,neu,sad
.+ a,1.335948,1.900873,1.183167,1.73459
.+ a big,1.737349,4.226652,1.756493,3.078385
.+ a bit,1.607381,2.443553,1.623349,2.563002
.+ a day,1.645587,3.158246,2.21946,2.152346
.+ a dog,2.070842,1.93647,1.228016,2.031131


In [331]:
matrix.head()

Unnamed: 0,ang,hap,neu,sad
.+ a,0.089627,0.032742,0.027693,0.018379
.+ a big,0.060808,0.002155,0.015315,0.019379
.+ a bit,0.010465,0.002155,0.025979,0.000531
.+ a day,0.117548,0.002155,0.008552,0.072628
.+ a dog,0.003351,0.002155,0.028417,0.000531


In [332]:
mulmatrix.head()

Unnamed: 0,ang,hap,neu,sad
.+ a,1.455686,1.963112,1.215933,1.76647
.+ a big,1.842994,4.235759,1.783394,3.138041
.+ a bit,1.624203,2.448818,1.665521,2.564363
.+ a day,1.839023,3.165051,2.238441,2.308667
.+ a dog,2.077782,1.940642,1.262913,2.03221


# Full Pattern Feature

# Train

In [333]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

In [334]:
# pred_y, y_train
# precision = precision_score(list(y_train),pred_y,average='macro')
# recall = recall_score(list(y_train),pred_y,average='macro')
# f1 = get_f1_score(precision,recall)
# print('Precision Macro',precision)
# print('Recall Macro',recall)
# print('F1 Macro',f1)
# print(' ')
# precision = precision_score(list(y_train),pred_y,average='micro')
# recall = recall_score(list(y_train),pred_y,average='micro')
# f1 = get_f1_score(precision,recall)
# print('Precision Micro',precision)
# print('Recall Micro',recall)
# print('F1 Micro',f1)
# print(' ')
precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

Precision Weighted 0.7471977150105737
Recall Weighted 0.7042444821731749
F1 Weighted 0.7250855306668259


## Testing

In [335]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

In [336]:
precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

Precision Weighted 0.6327815855434922
Recall Weighted 0.5827395091053048
F1 Weighted 0.6067304502634466


# WildCard Weights

## Train

In [337]:
matrix = build_acumatrix(X_train,wc_feature_table)
matrix = matrix.fillna(np.min(matrix))
mulmatrix = calculate_final_matrix(em_df,matrix)
# mulmatrix = em_df + matrix


In [338]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

Precision Weighted 0.7464544072310809
Recall Weighted 0.7049235993208829
F1 Weighted 0.7250948065891478


## Test

In [339]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

Precision Weighted 0.6334115933210114
Recall Weighted 0.5835312747426762
F1 Weighted 0.6074491813662626


# ContentWord Weights

In [340]:
matrix = build_acumatrix(X_train,cw_feature_table)
matrix = matrix.fillna(np.min(matrix))

mulmatrix = calculate_final_matrix(em_df,matrix)
# mulmatrix = em_df + matrix

## Train

In [341]:
vectors = get_frequency_vectors(X_train,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_train),pred_y,average='weighted')
recall = recall_score(list(y_train),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

Precision Weighted 0.7477816420949193
Recall Weighted 0.7066213921901529
F1 Weighted 0.7266190904931528


## Test

In [342]:
vectors = get_frequency_vectors(X_test,patterns_list)
scores = calculate_scores(mulmatrix,vectors)
pred_y = list(scores['pred_code'])

precision = precision_score(list(y_test),pred_y,average='weighted')
recall = recall_score(list(y_test),pred_y,average='weighted')
f1 = get_f1_score(precision,recall)
print('Precision Weighted',precision)
print('Recall Weighted',recall)
print('F1 Weighted',f1)

Precision Weighted 0.6330080569132552
Recall Weighted 0.5851148060174188
F1 Weighted 0.6081199158140121
