## Fuzzy Matching School Names

Matches school name in educator data with their ids.

In [14]:
import os
import pandas as pd
from fuzzywuzzy import fuzz, process

In [15]:
input_data_dir = "data-to-match"

educators_inexperienced_path = os.path.join(input_data_dir, "Educators_Inexperienced_2018_JAN_24th_2019.csv")
educators_out_of_field_path = os.path.join(input_data_dir, "Educators_OUT_OF_FIELD_2018_JAN_24th_2019.csv")
school_id_data_path = os.path.join(input_data_dir, "ga_public_school_contact_list_Ethan.xlsx")

educators_inexperienced = pd.read_csv(educators_inexperienced_path)
educators_out_of_field = pd.read_csv(educators_out_of_field_path)
school_id_data = pd.read_excel(school_id_data_path, sheet_name="Sheet_1")


In [16]:
school_id_data["SCHOOL_NAME"] = school_id_data["SCHOOL_NAME"].str.upper()
school_id_data["SCHOOL_NAME"] = school_id_data["SCHOOL_NAME"].str.strip()
school_id_data_no_duplicates = school_id_data.drop_duplicates(subset="SCHOOL_NAME", keep=False)

In [17]:
educators_inexperienced["INSTN_NAME"] = educators_inexperienced["INSTN_NAME"].str.upper()
educators_inexperienced["INSTN_NAME"] = educators_inexperienced["INSTN_NAME"].str.strip()

In [18]:
educators_out_of_field["INSTN_NAME"] = educators_out_of_field["INSTN_NAME"].str.upper()
educators_out_of_field["INSTN_NAME"] = educators_out_of_field["INSTN_NAME"].str.strip()

### Matching for Inexperienced Educators

In [19]:
# Get rid of the rows ending with "ALL SCHOOLS"
educators_inexperienced = educators_inexperienced[~educators_inexperienced["INSTN_NAME"].str.endswith("ALL SCHOOLS")]

In [20]:
inexperienced_duplicates = educators_inexperienced[educators_inexperienced.duplicated(subset=["INSTN_NAME"], keep=False)]
inexperienced_no_duplicates = educators_inexperienced[~educators_inexperienced.duplicated(subset=["INSTN_NAME"], keep=False)]
inexperienced_with_perfect_match = inexperienced_no_duplicates[inexperienced_no_duplicates["INSTN_NAME"].isin(school_id_data["SCHOOL_NAME"])]
inexperienced_no_perfect_match = inexperienced_no_duplicates[~inexperienced_no_duplicates["INSTN_NAME"].isin(school_id_data["SCHOOL_NAME"])]

In [21]:
inexperienced_with_perfect_match.shape[0] + inexperienced_no_perfect_match.shape[0] + inexperienced_duplicates.shape[0] == educators_inexperienced.shape[0]

True

In [22]:
# add column to inexperienced_with_perfect_match with the school_id
inexperienced_with_perfect_match["Full School_ID"] = inexperienced_with_perfect_match["INSTN_NAME"].map(school_id_data.set_index("SCHOOL_NAME")["Full School_ID"])


InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [None]:
def fuzzy_match_school_name(row):
    """Matches row of data with school_id_data on school name."""
    result = process.extractOne(row["INSTN_NAME"], school_id_data["SCHOOL_NAME"])
    matching_name = result[0]
    confidence = result[1]
    row_index = result[2]
    school_id = school_id_data.loc[row_index]["Full School_ID"]
    return school_id, matching_name, confidence

In [None]:
inexperienced_no_perfect_match["Full School_ID"] = inexperienced_no_perfect_match.apply(lambda row: fuzzy_match_school_name(row)[0], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inexperienced_no_perfect_match["Full School_ID"] = inexperienced_no_perfect_match.apply(lambda row: fuzzy_match_school_name(row)[0], axis=1)


In [None]:
inexperienced_fuzzy_matched = pd.merge(inexperienced_no_perfect_match, school_id_data, on="Full School_ID", how="left")
inexperienced_fuzzy_matched["MATCH_TYPE"] = "FUZZY"

In [None]:
inexperienced_matched = pd.concat([inexperienced_fuzzy_matched, inexperienced_matched])
# check dims   

### Matching for Out of Field Educators

### Saving Output

In [None]:
output_data_dir = "matched-data"
inexperienced_output_path = os.path.join(output_data_dir, "inexperienced_matched.csv")
inexperienced_matched.to_csv(inexperienced_output_path, index=False)