## Fuzzy Matching School Names

Matches school name in educator data with their ids.

In [206]:
import os
import pandas as pd
from fuzzywuzzy import fuzz, process
from collections import defaultdict

In [207]:
input_data_dir = "data-to-match"

educators_inexperienced_path = os.path.join(input_data_dir, "Educators_Inexperienced_2018_JAN_24th_2019.csv")
educators_out_of_field_path = os.path.join(input_data_dir, "Educators_OUT_OF_FIELD_2018_JAN_24th_2019.csv")
school_id_data_path = os.path.join(input_data_dir, "ga_public_school_contact_list_Ethan.xlsx")

educators_inexperienced = pd.read_csv(educators_inexperienced_path)
educators_out_of_field = pd.read_csv(educators_out_of_field_path)
school_id_data = pd.read_excel(school_id_data_path, sheet_name="Sheet_1")


In [208]:
# Cleaning Public School Data.
school_id_data["SCHOOL_NAME"] = school_id_data["SCHOOL_NAME"].str.upper()
school_id_data["SCHOOL_NAME"] = school_id_data["SCHOOL_NAME"].str.strip()
school_id_data["SYSTEM_NAME"] = school_id_data["SYSTEM_NAME"].str.upper()
school_id_data["SYSTEM_NAME"] = school_id_data["SYSTEM_NAME"].str.strip()
# Must take care of duplicates separately. Some schools have the same name but are differet schools with different IDs.
school_id_data_no_duplicates = school_id_data.drop_duplicates(subset="SCHOOL_NAME", keep=False)

In [209]:
# Cleaning Educators Inexperienced.
educators_inexperienced["INSTN_NAME"] = educators_inexperienced["INSTN_NAME"].str.upper()
educators_inexperienced["INSTN_NAME"] = educators_inexperienced["INSTN_NAME"].str.strip()
educators_inexperienced["SCHOOL_DSTRCT_NM"] = educators_inexperienced["SCHOOL_DSTRCT_NM"].str.upper()
educators_inexperienced["SCHOOL_DSTRCT_NM"] = educators_inexperienced["SCHOOL_DSTRCT_NM"].str.strip()
educators_inexperienced = educators_inexperienced[~educators_inexperienced["INSTN_NAME"].str.endswith("ALL SCHOOLS")]

In [210]:
# Cleaning Educators Out of Field.
educators_out_of_field["INSTN_NAME"] = educators_out_of_field["INSTN_NAME"].str.upper()
educators_out_of_field["INSTN_NAME"] = educators_out_of_field["INSTN_NAME"].str.strip()
educators_out_of_field["SCHOOL_DSTRCT_NM"] = educators_out_of_field["SCHOOL_DSTRCT_NM"].str.upper()
educators_out_of_field["SCHOOL_DSTRCT_NM"] = educators_out_of_field["SCHOOL_DSTRCT_NM"].str.strip()
educators_out_of_field = educators_out_of_field[~educators_out_of_field["INSTN_NAME"].str.endswith("ALL SCHOOLS")]

### Matching for Inexperienced Educators

In [211]:
inexperienced_with_perfect_match = educators_inexperienced[educators_inexperienced["INSTN_NAME"].isin(school_id_data["SCHOOL_NAME"])]
inexperienced_no_perfect_match = educators_inexperienced[~educators_inexperienced["INSTN_NAME"].isin(school_id_data["SCHOOL_NAME"])]

print("Shape of inexperienced_with_perfect_match: {}".format(inexperienced_with_perfect_match.shape))
print("Shape of inexperienced_no_perfect_match: {}".format(inexperienced_no_perfect_match.shape))

Shape of inexperienced_with_perfect_match: (5587, 8)
Shape of inexperienced_no_perfect_match: (97, 8)


In [212]:
print("Number of schools in inexperienced data: {}".format(len(educators_inexperienced)))
print("Number of schools in inexperienced data with perfect match: {}".format(len(inexperienced_with_perfect_match)))
print("Number of schools in inexperienced data with no perfect match: {}".format(len(inexperienced_no_perfect_match)))

correct_size = inexperienced_with_perfect_match.shape[0] + inexperienced_no_perfect_match.shape[0] == educators_inexperienced.shape[0]
print("Sizes of perfect and non-perfect subsets is correct: {}".format(correct_size))

no_overlap = len(set(inexperienced_with_perfect_match["INSTN_NAME"]).intersection(set(inexperienced_no_perfect_match["INSTN_NAME"]))) == 0
print("No overlap between perfect and non-perfect matching subsets: {}".format(no_overlap))

Number of schools in inexperienced data: 5684
Number of schools in inexperienced data with perfect match: 5587
Number of schools in inexperienced data with no perfect match: 97
Sizes of perfect and non-perfect subsets is correct: True
No overlap between perfect and non-perfect matching subsets: True


In [213]:
merged_inexperienced = inexperienced_with_perfect_match.merge(
    school_id_data, # We can use the full dataset here because we are matching on both school name and county.
    left_on=["INSTN_NAME", "SCHOOL_DSTRCT_NM"],
    right_on=["SCHOOL_NAME", "SYSTEM_NAME"],
    how="left",
)

merged_inexperienced["MATCH_TYPE"] = "PERFECT"

print("Shape of merged_inexperienced: {}".format(merged_inexperienced.shape))

Shape of merged_inexperienced: (5587, 31)


In [214]:
unmatched_inexperienced = merged_inexperienced[merged_inexperienced["Full School_ID"].isnull()]
unmatched_inexperienced_compliment = merged_inexperienced[~merged_inexperienced["Full School_ID"].isnull()]

print("Number of unmatched schools by county: {}".format(len(unmatched_inexperienced)))

dictionary = defaultdict(lambda : "")
for index, row in school_id_data_no_duplicates.iterrows():
    dictionary[row["SCHOOL_NAME"]] = row["Full School_ID"]

unmatched_inexperienced["Full School_ID"] = unmatched_inexperienced["INSTN_NAME"].map(dictionary)
# Set SCHOOL_NAME and SYSTEM_NAME to the same values as INSTN_NAME and SCHOOL_DSTRCT_NM from the school_id_data_no_duplicates df using the Full School_ID as the key.
unmatched_inexperienced["SCHOOL_NAME"] = unmatched_inexperienced["Full School_ID"].map(school_id_data_no_duplicates.set_index("Full School_ID")["SCHOOL_NAME"])
unmatched_inexperienced["SYSTEM_NAME"] = unmatched_inexperienced["Full School_ID"].map(school_id_data_no_duplicates.set_index("Full School_ID")["SYSTEM_NAME"])


# set the score to 100 for all perfect matches where we have a school ID.
unmatched_inexperienced.loc[unmatched_inexperienced["Full School_ID"] != "", "SCORE"] = 100
unmatched_inexperienced.loc[unmatched_inexperienced["Full School_ID"] == "", "SCORE"] = 0
unmatched_inexperienced["MATCH_TYPE"] = "NON_MATCHING_COUNTY"

unmatched_inexperienced_compliment["SCORE"] = 100

merged_inexperienced = pd.concat([unmatched_inexperienced_compliment, unmatched_inexperienced])

Number of unmatched schools by county: 272


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_inexperienced["Full School_ID"] = unmatched_inexperienced["INSTN_NAME"].map(dictionary)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_inexperienced["SCHOOL_NAME"] = unmatched_inexperienced["Full School_ID"].map(school_id_data_no_duplicates.set_index("Full School_ID")["SCHOOL_NAME"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_

In [215]:
def fuzzy_match_school_name(name):
    """Matches school name to school id using fuzzy matching.
    
    Parameters
    ----------
    name : str
        School name to match.
        
    Returns
    -------
    name : str
        Tuple of matched name, match confidence score, school id, system name, and school name.
    """
    match = process.extractOne(name, school_id_data_no_duplicates["SCHOOL_NAME"])
    name = match[0]
    score = match[1]
    row = match[2]
    school_id = school_id_data_no_duplicates.loc[row]["Full School_ID"]
    system_name = school_id_data_no_duplicates.loc[row]["SYSTEM_NAME"]
    school_name = school_id_data_no_duplicates.loc[row]["SCHOOL_NAME"]
    return (name, score, school_id, system_name, school_name)

In [216]:
inexperienced_no_perfect_match["SCHOOL_ID"] = inexperienced_no_perfect_match["INSTN_NAME"].apply(fuzzy_match_school_name)
# Splitting the tuple into separate columns.
inexperienced_no_perfect_match[["SCHOOL_NAME", "SCORE", "Full School_ID", "SYSTEM_NAME", "SCHOOL_NAME"]] = pd.DataFrame(inexperienced_no_perfect_match["SCHOOL_ID"].tolist(), index=inexperienced_no_perfect_match.index)
inexperienced_no_perfect_match = inexperienced_no_perfect_match.drop("SCHOOL_ID", axis=1)

inexperienced_no_perfect_match["MATCH_TYPE"] = "FUZZY"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inexperienced_no_perfect_match["SCHOOL_ID"] = inexperienced_no_perfect_match["INSTN_NAME"].apply(fuzzy_match_school_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inexperienced_no_perfect_match[["SCHOOL_NAME", "SCORE", "Full School_ID", "SYSTEM_NAME", "SCHOOL_NAME"]] = pd.DataFrame(inexperienced_no_perfect_match["SCHOOL_ID"].tolist(), index=inexperienced_no_perfect_match.index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

### Matching for Out of Field Educators

In [217]:
out_of_field_with_perfect_match = educators_out_of_field[educators_out_of_field["INSTN_NAME"].isin(school_id_data["SCHOOL_NAME"])]
out_of_field_no_perfect_match = educators_out_of_field[~educators_out_of_field["INSTN_NAME"].isin(school_id_data["SCHOOL_NAME"])]

print("Shape of out_of_field_with_perfect_match: {}".format(out_of_field_with_perfect_match.shape))
print("Shape of out_of_field_no_perfect_match: {}".format(out_of_field_no_perfect_match.shape))

Shape of out_of_field_with_perfect_match: (3343, 8)
Shape of out_of_field_no_perfect_match: (59, 8)


In [218]:
print("Number of schools in out of field data: {}".format(len(educators_out_of_field)))
print("Number of schools in out of field data with perfect match: {}".format(len(out_of_field_with_perfect_match)))
print("Number of schools in out of field data with no perfect match: {}".format(len(out_of_field_no_perfect_match)))

correct_size = out_of_field_with_perfect_match.shape[0] + out_of_field_no_perfect_match.shape[0] == educators_out_of_field.shape[0]
print("Sizes of out of field data subsets add up to the size of the original data: {}".format(correct_size))

no_overlap = len(set(out_of_field_with_perfect_match["INSTN_NAME"]).intersection(set(out_of_field_no_perfect_match["INSTN_NAME"]))) == 0
print("There is no overlap between the schools in the out of field data subsets: {}".format(no_overlap))

Number of schools in out of field data: 3402
Number of schools in out of field data with perfect match: 3343
Number of schools in out of field data with no perfect match: 59
Sizes of out of field data subsets add up to the size of the original data: True
There is no overlap between the schools in the out of field data subsets: True


In [219]:
merged_out_of_field = pd.merge(
    out_of_field_with_perfect_match,
    school_id_data, # We can use the full dataset here because we are matching on both school name and county.
    left_on=["INSTN_NAME", "SCHOOL_DSTRCT_NM"],
    right_on=["SCHOOL_NAME", "SYSTEM_NAME"],
    how="left"
)

merged_out_of_field["MATCH_TYPE"] = "PERFECT"

print("Shape of merged_out_of_field: {}".format(merged_out_of_field.shape))

Shape of merged_out_of_field: (3343, 31)


In [220]:
unmatched_out_of_field = merged_out_of_field[merged_out_of_field["Full School_ID"].isnull()]
unmatched_out_of_field_compliment = merged_out_of_field[~merged_out_of_field["Full School_ID"].isnull()]

print("Number of unmatched schools by county: {}".format(len(unmatched_out_of_field)))

# using old dictionary from before
unmatched_out_of_field["Full School_ID"] = unmatched_out_of_field["INSTN_NAME"].map(dictionary)
# Set SCHOOL_NAME and SYSTEM_NAME to the same values as INSTN_NAME and SCHOOL_DSTRCT_NM from the school_id_data_no_duplicates df using the Full School_ID as the key.
unmatched_out_of_field["SCHOOL_NAME"] = unmatched_out_of_field["Full School_ID"].map(school_id_data_no_duplicates.set_index("Full School_ID")["SCHOOL_NAME"])
unmatched_out_of_field["SYSTEM_NAME"] = unmatched_out_of_field["Full School_ID"].map(school_id_data_no_duplicates.set_index("Full School_ID")["SYSTEM_NAME"])


# set the score to 100 for all perfect matches where we have a school ID.
unmatched_out_of_field.loc[unmatched_out_of_field["Full School_ID"] != "", "SCORE"] = 100
unmatched_out_of_field.loc[unmatched_out_of_field["Full School_ID"] == "", "SCORE"] = 0
unmatched_out_of_field["MATCH_TYPE"] = "NON_MATCHING_COUNTY"

unmatched_out_of_field_compliment["SCORE"] = 100

merged_out_of_field = pd.concat([unmatched_out_of_field_compliment, unmatched_out_of_field])

Number of unmatched schools by county: 156


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_out_of_field["Full School_ID"] = unmatched_out_of_field["INSTN_NAME"].map(dictionary)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_out_of_field["SCHOOL_NAME"] = unmatched_out_of_field["Full School_ID"].map(school_id_data_no_duplicates.set_index("Full School_ID")["SCHOOL_NAME"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guid

In [221]:
out_of_field_no_perfect_match["SCHOOL_ID"] = out_of_field_no_perfect_match["INSTN_NAME"].apply(fuzzy_match_school_name)
# Splitting the tuple into separate columns.
out_of_field_no_perfect_match[["SCHOOL_NAME", "SCORE", "Full School_ID", "SYSTEM_NAME", "SCHOOL_NAME"]] = pd.DataFrame(out_of_field_no_perfect_match["SCHOOL_ID"].tolist(), index=out_of_field_no_perfect_match.index)
out_of_field_no_perfect_match = out_of_field_no_perfect_match.drop("SCHOOL_ID", axis=1)

out_of_field_no_perfect_match["MATCH_TYPE"] = "FUZZY"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out_of_field_no_perfect_match["SCHOOL_ID"] = out_of_field_no_perfect_match["INSTN_NAME"].apply(fuzzy_match_school_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out_of_field_no_perfect_match[["SCHOOL_NAME", "SCORE", "Full School_ID", "SYSTEM_NAME", "SCHOOL_NAME"]] = pd.DataFrame(out_of_field_no_perfect_match["SCHOOL_ID"].tolist(), index=out_of_field_no_perfect_match.index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cav

### Saving Output

In [222]:
output_dir = "matched-data"

In [None]:
# rememeber to remove non-alphanumeric characters from school names.

inexperienced_matched = pd.concat([merged_inexperienced, inexperienced_no_perfect_match])
inexperienced_matched[
    [
        "LONG_SCHOOL_YEAR",
        "SCHOOL_DSTRCT_NM",
        "INSTN_NAME",
        "SYSTEM_NAME",
        "SCHOOL_NAME",
        "LABEL_LVL_3_DESC",
        "LABEL_LVL_2_DESC",
        "FTE",
        "INEXPERIENCED_FTE",
        "INEXPERIENCED_FTE_PCT",
        "Full School_ID",
        "MATCH_TYPE",
        "SCORE"
    ]
].to_csv(os.path.join(output_dir, "inexperienced-matched.csv"), index=False)

# Cleaned by hand.



In [223]:
out_of_field_matched = pd.concat([merged_out_of_field, out_of_field_no_perfect_match])
out_of_field_matched[
    [
        "LONG_SCHOOL_YEAR",
        "SCHOOL_DSTRCT_NM",
        "INSTN_NAME",
        "SYSTEM_NAME",
        "SCHOOL_NAME",
        "LABEL_LVL_3_DESC",
        "LABEL_LVL_2_DESC",
        "FTE",
        "OUTOFFIELD_FTE",
        "OUTOFFIELD_FTE_PCT",
        "Full School_ID",
        "MATCH_TYPE",
        "SCORE"
    ]
].to_csv(os.path.join(output_dir, "out-of-field-matched.csv"), index=False)

# Cleaned by hand.