In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from rapidfuzz  import fuzz
from rapidfuzz  import process

from tqdm import tqdm_notebook


## Reading in Frequency Lists

In [2]:
## Load the Spanish (CREA) frequency list.
## 'ansi' is a codec alias that Python only provides on Windows, so it raises
## LookupError on other platforms; an explicit code page keeps the cell portable.
## NOTE(review): 'cp1252' assumes the file was saved with the Western-European
## Windows code page -- confirm against the source file.
SpanishFrequencyDF = (
    pd.read_csv('FrequencyLists/CREA_total.TXT', sep = '\t', encoding = 'cp1252')
    ## Swap in a 0-based positional index *before* dropping rows, so the rows
    ## that survive dropna() keep their original row position as index label.
    .reset_index(drop = True)
    .dropna()
    ## The raw header of this column carries leading spaces.
    .rename({'     Orden': 'Orden'}, axis = 1)
)

SpanishFrequencyDF.head()

Unnamed: 0,Orden,Frec.absoluta,Frec.normalizada
0,de,9999518,65545.55
1,la,6277560,41148.59
2,que,4681839,30688.85
3,el,4569652,29953.48
4,en,4234281,27755.16


In [3]:
## Load the English unigram frequency list with pandas' default CSV settings.
## NOTE(review): read_csv's default NA parsing turns cells such as "null" or
## "nan" into NaN -- confirm whether the word column is affected.
englishFrequencyPath = 'FrequencyLists/unigram_freq.csv'
EnglishFrequencyDF = pd.read_csv(englishFrequencyPath)

EnglishFrequencyDF.head()

Unnamed: 0,word,count
0,the,23135851162
1,of,13151942776
2,and,12997637966
3,to,12136980858
4,a,9081174698


## Defining Word Frequency Methods

In [4]:
def getWordFrequency(word, lang, fuzzy = True, fuzzylim = 95):
    """Look up `word` in the frequency list of `lang` and return the matching row's index label.

    NOTE(review): despite the name, the value returned is the index label of the
    matching row in the frequency DataFrame, not a count -- presumably the word's
    position in the list; confirm against how the lists are loaded.

    Parameters
    ----------
    word : token to look up.
    lang : 'Spn' (searches SpanishFrequencyDF['Orden']) or
           'Eng' (searches EnglishFrequencyDF['word']).
    fuzzy : when True, take the best match from rapidfuzz's process.extractOne;
            when False, use exact string equality.
    fuzzylim : minimum score a fuzzy match needs to be accepted.

    Returns
    -------
    The index label of the matched row, or -1 when nothing matched.

    Raises
    ------
    ValueError : if `lang` is neither 'Spn' nor 'Eng'.
    """

    if (lang == 'Spn'):
        ## Spanish Frequency
        DatabaseDF, colName = SpanishFrequencyDF, 'Orden'

    elif (lang == 'Eng'):
        ## English Frequency
        DatabaseDF, colName = EnglishFrequencyDF, 'word'

    else:
        raise ValueError("Language not in frequency lists")

    if (fuzzy):
        ## Uses fuzzysearch to get similar word
        try:
            ## score_cutoff lets the matcher discard candidates below the limit
            ## instead of fully scoring every entry.
            result = process.extractOne(word, DatabaseDF[colName], score_cutoff = fuzzylim)
        except TypeError:
            ## The matcher rejected its input (e.g. a non-string token): no match.
            return -1

        ## extractOne returns None when no candidate reaches the cutoff
        ## (or when there are no candidates at all).
        if (result is None):
            return -1

        ## result is (matched choice, score, key of the choice in the Series)
        matchScore, matchKey = result[1], result[2]
        return matchKey if (matchScore >= fuzzylim) else -1

    ## Uses direct string matching
    result = DatabaseDF[DatabaseDF[colName] == word].index

    return result[0] if (len(result) > 0) else -1

def searchWordFrequency(word, lang, fuzzylim = 95):
    """Look up `word` for `lang`: exact match first, fuzzy match as fallback.

    Returns whatever getWordFrequency returns for the exact lookup unless that
    is -1, in which case the result of the fuzzy lookup (with `fuzzylim` as the
    score threshold) is returned instead.
    """

    exactHit = getWordFrequency(word, lang, fuzzy = False)

    ## Only pay for the fuzzy search when the exact lookup came back empty.
    if (exactHit == -1):
        return getWordFrequency(word, lang, fuzzylim = fuzzylim)

    return exactHit


## Reading in Text

In [5]:
## Load the tagged token list.
## 'ansi' is a codec alias that Python only provides on Windows, so it raises
## LookupError on other platforms; an explicit code page keeps the cell portable.
## NOTE(review): 'cp1252' assumes the file was saved with the Western-European
## Windows code page -- confirm against the source file.
## quoting = 3 is csv.QUOTE_NONE: quote characters are read as ordinary data.
KillerCronicasDF = pd.read_csv('CodeSwitching_Text/Killer_Cronicas-output.txt', sep = ',', encoding = 'cp1252', quoting = 3)

KillerCronicasDF.tail()

Unnamed: 0,Token,Tag
48904,"""",Punct
48905,es,Spn
48906,otra,Spn
48907,historia,Spn
48908,.,Punct


## Calculating Hit Rate

In [6]:
## Counts how many language-tagged tokens are found in the frequency lists,
## first by exact match and then, for the misses, by fuzzy match.
lanTags = ['Eng', 'Spn']

wordCount = 0       ## tokens tagged with one of lanTags
searchCount = 0     ## tokens found by exact match
fuzzyAttempts = 0   ## tokens that needed the fuzzy fallback
fuzzyCount = 0      ## tokens found by the fuzzy fallback

notFoundList = []   ## rows found by neither lookup

for index, row in tqdm_notebook(KillerCronicasDF.iterrows(), total=KillerCronicasDF.shape[0]):
    if (row['Tag'] in lanTags):
        wordCount += 1

        searchFrequency = getWordFrequency(row['Token'],row['Tag'], fuzzy = False)

        if (searchFrequency != -1):
            searchCount += 1
        else:
            fuzzyAttempts += 1

            fuzzyfrequency = getWordFrequency(row['Token'],row['Tag'])

            if (fuzzyfrequency != -1):
                fuzzyCount += 1
            else:
                notFoundList.append(row)

## Guard the ratios: with no language-tagged tokens (or no fuzzy attempts)
## the plain divisions would raise ZeroDivisionError.
searchPercent = (searchCount/wordCount) if wordCount else np.nan
fuzzyPercent = (fuzzyCount/fuzzyAttempts) if fuzzyAttempts else np.nan
totalPercent = ((searchCount + fuzzyCount)/wordCount) if wordCount else np.nan

print("Total Words: ", wordCount)
print("---------")
print("SearchHits: ", searchCount)
print("SearchPercent", searchPercent)
print("---------")
print("fuzzyAttempts: ", fuzzyAttempts)
print("fuzzyHits: ", fuzzyCount)
print("fuzzyPercent: ", fuzzyPercent)
print("---------")
print("Total Percent: ", totalPercent)

HBox(children=(IntProgress(value=0, max=48909), HTML(value='')))


Total Words:37424
---------
SearchHits:25604
SearchPercent0.6841598973920479
---------
fuzzyAttempts:11820
fuzzyHits:7452
fuzzyPercent:0.6304568527918781
---------
Total Percent:0.8832834544677213


## Calculating Sentence Complexity for Database

In [7]:
## Force both translation columns to be read as strings.
sentenceColumnTypes = {'EnglishTrans': str, 'SpanishTrans': str}
SentenceDF = pd.read_csv('SentenceDatabase.csv', dtype = sentenceColumnTypes)

In [8]:
## Distinct tokens that were tagged as punctuation or as a named entity.
nonWordMask = KillerCronicasDF['Tag'].isin(['Punct', 'NamedEnt'])
nonWordList = KillerCronicasDF.loc[nonWordMask, 'Token'].unique()

In [9]:
lanTags = ['Eng', 'Spn']


def getSentenceFrequency(Sentence = None, tags = None, tag = None, singletag = False, seperator = ' '):
    """Average the searchWordFrequency results over the words of a sentence.

    Parameters
    ----------
    Sentence : string of tokens joined by `seperator`.
    tags : string of per-token tags joined by `seperator`; used when
           `singletag` is False. Tokens and tags are paired positionally and
           only pairs whose tag is in lanTags are looked up.
    tag : language passed to every lookup when `singletag` is True.
    singletag : when True, every token not present in nonWordList is looked up
                in the single language `tag`.
    seperator : delimiter used to split `Sentence` and `tags`.

    Returns
    -------
    The mean of all lookup results that are not -1, or np.nan when no token
    produced a result.
    """
    tokens = Sentence.split(seperator)

    ## Work out which (token, language) pairs have to be looked up.
    if (singletag):
        lookups = [(token, tag) for token in tokens if token not in nonWordList]
    else:
        tokenTags = tags.split(seperator)
        lookups = [
            (token, tokenTag)
            for token, tokenTag in zip(tokens, tokenTags)
            if tokenTag in lanTags
        ]

    scores = [searchWordFrequency(word = token, lang = language) for token, language in lookups]

    ## -1 marks a token that was not found; those do not count towards the mean.
    found = [score for score in scores if score != -1]

    if (len(found) == 0):
        return np.nan

    return (sum(found)/len(found))


In [11]:
## Fills in the missing frequency scores of SentenceDF row by row and writes
## the table back to 'SentenceDatabase.csv'. Scores that are already present
## are left untouched, so the cell can be re-run to continue an earlier run.
lanTags = ['Eng', 'Spn']
errorLog = []   ## index labels of the rows whose scoring raised an error

for index, row in tqdm_notebook(SentenceDF.iterrows(), total=SentenceDF.shape[0]):

    try:

        ## Original Sentence Score

        if ((pd.isna(row['SentenceFrequency'])) and (not (pd.isna(row['Sentence'])))):

            freq = getSentenceFrequency(Sentence = row['Sentence'], tags = row['Tags'])

            SentenceDF.at[index, 'SentenceFrequency'] = freq

        ## EnglishTrans Score

        if ((pd.isna(row['EnglishFreq'])) and (not (pd.isna(row['EnglishTrans'])))):

            freq = getSentenceFrequency(Sentence = row['EnglishTrans'], tag = 'Eng', singletag= True)

            SentenceDF.at[index, 'EnglishFreq'] = freq

        ## SpanishTrans Score

        if ((pd.isna(row['SpanishFreq'])) and (not (pd.isna(row['SpanishTrans'])))):

            freq = getSentenceFrequency(Sentence = row['SpanishTrans'], tag = 'Spn', singletag= True)

            SentenceDF.at[index, 'SpanishFreq'] = freq

        ## Checkpoint once every 300 rows. The previous test `index % 300` was
        ## truthy for every index NOT divisible by 300, so the whole file was
        ## rewritten on almost every row.
        if (index % 300 == 0):
            SentenceDF.to_csv('SentenceDatabase.csv', index = False)

    except Exception:
        ## Best effort: remember the failing row, save progress and carry on.
        ## Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        errorLog.append(index)
        SentenceDF.to_csv('SentenceDatabase.csv', index = False)


SentenceDF.to_csv('SentenceDatabase.csv', index = False)

HBox(children=(IntProgress(value=0, max=2890), HTML(value='')))




## Analysis