In [1]:
# Laura Burdick (lburdick@umich.edu)
# Make WALS values binary

In [2]:
import pandas as pd

In [3]:
# SET THESE VARIABLES

# Location of WALS values (from Getting WALS Data.ipynb)
# Should be formatted as a csv file, with a separate column for each
# WALS value, as well as a column called "language" with the Bible
# language codes
wals_path = '../corpus/wals/wals_values_bible.csv'

# Location of output binary WALS features
# Will be formatted as a csv file, with a separate column for each
# binary WALS value, as well as a column called "language" with the
# Bible language codes
binary_wals_path = '../corpus/wals/wals_bible_binary_values.csv'

In [4]:
# Read in WALS values
wals = pd.read_csv(wals_path)
wals = wals.drop("Unnamed: 0",axis=1)

In [5]:
# From https://thispointer.com/how-to-find-drop-duplicate-columns-in-a-dataframe-python-pandas/
def getDuplicateColumns(df):
    '''
    Get a list of duplicate columns.
    It will iterate over all the columns in dataframe and find the columns whose contents are duplicate.
    :param df: Dataframe object
    :return: List of columns whose contents are duplicates.
    '''
    duplicateColumnNames = set()
    # Iterate over all the columns in dataframe
    for x in range(df.shape[1]):
        # Select column at xth index.
        col = df.iloc[:, x]
        # Iterate over all the columns in DataFrame from (x+1)th index till end
        for y in range(x + 1, df.shape[1]):
            # Select column at yth index.
            otherCol = df.iloc[:, y]
            # Check if two columns at x 7 y index are equal
            if col.equals(otherCol):
                duplicateColumnNames.add(df.columns.values[y])
 
    return list(duplicateColumnNames)

In [6]:
# Drop duplicate columns
duplicated = getDuplicateColumns(wals) #all nan in the column right now
wals = wals.drop(duplicated,axis=1)

In [7]:
# List of WALS category names
wals_value_names = wals.columns.values[:-4]

In [14]:
wals_value_names

array(['1A: Consonant Inventories', '2A: Vowel Quality Inventories',
       '3A: Consonant-Vowel Ratio',
       '4A: Voicing in Plosives and Fricatives',
       '5A: Voicing and Gaps in Plosive Systems', '6A: Uvular Consonants',
       '7A: Glottalized Consonants', '8A: Lateral Consonants',
       '9A: The Velar Nasal', '10A: Vowel Nasalization',
       '10B: Nasal Vowels in West Africa', '11A: Front Rounded Vowels',
       '12A: Syllable Structure', '13A: Tone',
       '14A: Fixed Stress Locations', '15A: Weight-Sensitive Stress',
       '16A: Weight Factors in Weight-Sensitive Stress Systems',
       '17A: Rhythm Types', '18A: Absence of Common Consonants',
       '19A: Presence of Uncommon Consonants',
       '20A: Fusion of Selected Inflectional Formatives',
       '21A: Exponence of Selected Inflectional Formatives',
       '21B: Exponence of Tense-Aspect-Mood Inflection',
       '22A: Inflectional Synthesis of the Verb',
       '23A: Locus of Marking in the Clause',
       '24A: 

In [8]:
# Replace NaN with '' in WALS
wals = wals.fillna('')

In [16]:
# For each WALS category, get list of values and attach them to WALS
# category name (to get binary features)
new_value_names = []
for name in wals_value_names:
    print(name)
    values = [i for i in list(set(list(wals[name])))]
    print(values)
    for value in values:
        wals[name+'__'+str(value)] = [1 if i==value else 0 for i in wals[name] ]
        new_value_names.append(name+'__'+str(value))

1A: Consonant Inventories
['', 1.0, 2.0, 3.0, 4.0, 5.0]
2A: Vowel Quality Inventories
['', 2.0, 3.0]
3A: Consonant-Vowel Ratio
['', 1.0, 2.0, 3.0, 4.0, 5.0]
4A: Voicing in Plosives and Fricatives
['', 1.0, 2.0, 3.0, 4.0]
5A: Voicing and Gaps in Plosive Systems
['', 1.0, 2.0, 3.0, 4.0]
6A: Uvular Consonants
['', 1.0, 2.0, 3.0]
7A: Glottalized Consonants
['', 1.0, 2.0, 3.0, 5.0]
8A: Lateral Consonants
['', 1.0, 2.0, 3.0]
9A: The Velar Nasal
['', 1.0, 2.0, 3.0]
10A: Vowel Nasalization
['', 1.0, 2.0]
10B: Nasal Vowels in West Africa
['']
11A: Front Rounded Vowels
['', 1.0, 2.0, 3.0]
12A: Syllable Structure
['', 1.0, 2.0, 3.0]
13A: Tone
['', 1.0, 2.0, 3.0]
14A: Fixed Stress Locations
['', 1.0, 2.0, 6.0, 7.0]
15A: Weight-Sensitive Stress
['', 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
16A: Weight Factors in Weight-Sensitive Stress Systems
['', 1.0, 2.0, 4.0, 5.0, 6.0, 7.0]
17A: Rhythm Types
['', 1.0, 2.0, 4.0, 5.0]
18A: Absence of Common Consonants
['', 1.0, 3.0]
19A: Presence of Uncommon Consonants
['',

  wals[name+'__'+str(value)] = [1 if i==value else 0 for i in wals[name] ]


In [19]:
new_value_names

['1A: Consonant Inventories__',
 '1A: Consonant Inventories__1.0',
 '1A: Consonant Inventories__2.0',
 '1A: Consonant Inventories__3.0',
 '1A: Consonant Inventories__4.0',
 '1A: Consonant Inventories__5.0',
 '2A: Vowel Quality Inventories__',
 '2A: Vowel Quality Inventories__2.0',
 '2A: Vowel Quality Inventories__3.0',
 '3A: Consonant-Vowel Ratio__',
 '3A: Consonant-Vowel Ratio__1.0',
 '3A: Consonant-Vowel Ratio__2.0',
 '3A: Consonant-Vowel Ratio__3.0',
 '3A: Consonant-Vowel Ratio__4.0',
 '3A: Consonant-Vowel Ratio__5.0',
 '4A: Voicing in Plosives and Fricatives__',
 '4A: Voicing in Plosives and Fricatives__1.0',
 '4A: Voicing in Plosives and Fricatives__2.0',
 '4A: Voicing in Plosives and Fricatives__3.0',
 '4A: Voicing in Plosives and Fricatives__4.0',
 '5A: Voicing and Gaps in Plosive Systems__',
 '5A: Voicing and Gaps in Plosive Systems__1.0',
 '5A: Voicing and Gaps in Plosive Systems__2.0',
 '5A: Voicing and Gaps in Plosive Systems__3.0',
 '5A: Voicing and Gaps in Plosive Systems_

In [12]:
# Save binary features
wals_binary = wals[new_value_names+['language']]
wals_binary.to_csv(binary_wals_path)