In [1]:
import numpy as np
import pandas as pd
import pprint
import matplotlib.pyplot as plt
from scipy import stats
from collections import Counter

In [2]:
# Load the sample of student records; the CSV column "Index" is used as the row index.
df = pd.read_csv(filepath_or_buffer="sample_string.csv", index_col="Index")
df

Unnamed: 0_level_0,Marital status,Application mode,University's position in preferences when applying,Course,Daytime/evening attendance,Previous qualification,Nationality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem enrolled,Total exams across all classes in 2nd sem,Curricular units 2nd sem passed,Curricular units 2nd sem grade,Curricular units 2nd sem without exams,Unemployment rate at enrollment,Inflation rate at enrollment,GDP at enrollment,Target,AI prediction
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
648,Single,Change in course,2,Oral Hygiene,daytime,Secondary education,Portuguese,Secondary Education—12th Year of Schooling or ...,Basic Education 3rd Cycle (9th/10th/11th Year)...,Administrative staff,...,8,10,8,13.2125,0,10.8,1.4,1.74,Graduate,Graduate
2781,Single,1st phase—general contingent,1,Basic Education,daytime,Secondary education,Portuguese,2nd cycle of the general high school course,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,6,14,1,11.0,0,10.8,1.4,1.74,Dropout,Dropout
2883,Married,Over 23 years old,1,Social Service (evening attendance),evening,Secondary education,Portuguese,General Course of Administration and Commerce,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,6,8,6,11.5,0,8.9,1.4,3.51,Graduate,Graduate
266,Single,2nd phase—general contingent,4,Nursing,daytime,Secondary education,Portuguese,Higher Education—master’s degree,Secondary Education—12th Year of Schooling or ...,Specialists in Intellectual and Scientific Act...,...,8,8,7,14.828571,0,12.7,3.7,-1.7,Graduate,Graduate
2537,Single,1st phase—general contingent,1,Advertising and Marketing Management,daytime,Secondary education,Portuguese,General commerce course,Basic Education 3rd Cycle (9th/10th/11th Year)...,"Skilled Workers in Industry, Construction, and...",...,6,6,6,15.833333,0,12.4,0.5,1.79,Graduate,Graduate
2791,Single,1st phase—general contingent,1,Veterinary Nursing,daytime,Secondary education,Portuguese,Supplementary Accounting and Administration,Basic Education 3rd Cycle (9th/10th/11th Year)...,Unskilled Workers,...,6,8,6,14.142857,0,11.1,0.6,2.02,Graduate,Graduate
1576,Married,Over 23 years old,2,Advertising and Marketing Management,daytime,Basic education 3rd cycle (9th/10th/11th year)...,Portuguese,General Course of Administration and Commerce,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,6,9,4,13.75,0,12.4,0.5,1.79,Dropout,Dropout
988,Single,1st phase—general contingent,1,Social Service,daytime,Secondary education,Portuguese,Higher Education—bachelor’s degree,Higher Education—bachelor’s degree,Intermediate Level Technicians and Professions,...,6,12,2,10.0,0,11.1,0.6,2.02,Dropout,Dropout
3232,Single,1st phase—general contingent,2,Journalism and Communication,daytime,Secondary education,Portuguese,General commerce course,Basic Education 2nd Cycle (6th/7th/8th Year) o...,Intermediate Level Technicians and Professions,...,6,6,5,12.6,0,12.7,3.7,-1.7,Graduate,Graduate
3214,Single,1st phase—general contingent,4,Communication Design,daytime,Secondary education,Portuguese,Higher Education—degree,Higher Education—degree,Specialists in Intellectual and Scientific Act...,...,6,6,6,15.166667,0,9.4,-0.8,-3.12,Graduate,Graduate


In [21]:
# Generating somewhat sensible explanations.

explanations = {}

# First, let's look at the numerical columns.
#
# For each column we check whether, in order to predict "Graduate" (as opposed to
# "Dropout"), the value should be high or low: a column is labelled "high" when the
# mean over the graduates is larger than the mean over the dropouts, and "low" otherwise.
# Where the two means differ by less than MIN_MEAN_DIFF, we consider the column
# irrelevant ("na").
# After visual inspection, the labels for all numerical columns seem to make sense.

# Absolute difference between the group means below which a column is labelled "na".
MIN_MEAN_DIFF = 0.5

df_grad = df[df["Target"] == "Graduate"]
df_drop = df[df["Target"] == "Dropout"]

for column in df:

    # Numeric columns are selected with the pandas dtype helpers rather than by comparing
    # the dtype against the strings "int" / "float64": what the alias "int" resolves to
    # depends on the platform and NumPy version, so int64 columns could silently be skipped.
    # Boolean columns are excluded so that they are not treated as numbers here.
    is_numeric = (
        pd.api.types.is_numeric_dtype(df[column])
        and not pd.api.types.is_bool_dtype(df[column])
    )
    if not is_numeric:
        continue

    grad_mean = df_grad[column].mean()
    drop_mean = df_drop[column].mean()

    if abs(grad_mean - drop_mean) < MIN_MEAN_DIFF:
        explanations[column] = "na"
    elif grad_mean > drop_mean:
        explanations[column] = "high"
    else:
        explanations[column] = "low"


print("---")

print("---")
explanations["GDP at enrollment"] = "na" # I don't want to have this in the explanations
pprint.pprint(explanations)

---
---
{'Age at enrollment': 'low',
 'Curricular units 1st sem enrolled': 'high',
 'Curricular units 1st sem grade': 'high',
 'Curricular units 1st sem passed': 'high',
 'Curricular units 1st sem recognized from previous education or work': 'na',
 'Curricular units 1st sem without exams': 'na',
 'Curricular units 2nd sem enrolled': 'high',
 'Curricular units 2nd sem grade': 'high',
 'Curricular units 2nd sem passed': 'high',
 'Curricular units 2nd sem recognized from previous education or work': 'na',
 'Curricular units 2nd sem without exams': 'na',
 'GDP at enrollment': 'na',
 'Inflation rate at enrollment': 'na',
 'Total exams across all classes in 1st sem': 'low',
 'Total exams across all classes in 2nd sem': 'low',
 'Unemployment rate at enrollment': 'na',
 "University's position in preferences when applying": 'high'}


In [69]:
# Now, let's look at the categorical columns.
# In this case it doesn't make sense to save one piece of information (high/low) per column.
# Instead, we figure out for each column which category a student should be in to be
# classified as grad or drop respectively.
# Then we order the columns once for grad and once for drop, so that if you want to be grad,
# the best explanation would be that in column x you are in category a; if not, the next best
# would be that in column y you are in category d, and so on.
# Then we choose a threshold under which the difference is not interesting. Above that
# threshold, we will highlight for each student all of the columns where they belong to the
# category that is a good explanation for their prediction (maybe up to a max of something,
# will see).
#
# This cell reads `df`, `df_grad`, `df_drop` and `explanations` as created by earlier cells.

# From visual inspection, a difference of 0.15 seems to be a pretty good cutoff to only get
# those categories where there actually is an interesting difference in the data.
CAT_DIFF_CUTOFF = 0.15

# Columns that are labels or derived explanation columns, never candidate explanations.
NON_FEATURE_COLUMNS = ["Target", "explanation_num", "explanation_cat", "AI prediction"]

# Group sizes come from the data instead of being hard-coded, so the shares stay correct
# when the sample changes.
n_grad = len(df_grad)
n_drop = len(df_drop)

explanations_grads = []
explanations_drops = []

for column in df:

    # Same numeric test as for the numerical explanations: dtype helpers instead of comparing
    # against the platform-dependent string alias "int".
    is_numeric = (
        pd.api.types.is_numeric_dtype(df[column])
        and not pd.api.types.is_bool_dtype(df[column])
    )

    if not is_numeric and column not in NON_FEATURE_COLUMNS:

        print(column)
        grad_counts = Counter(df_grad[column])
        grad_perc = {item: count / n_grad for item, count in grad_counts.items()}
        drop_counts = Counter(df_drop[column])
        drop_perc = {item: count / n_drop for item, count in drop_counts.items()}

        # All categories seen in either group, in order of first appearance. Iterating over a
        # set of strings would give a hash-dependent order, so ties between categories could
        # be resolved differently after every kernel restart.
        options = list(dict.fromkeys(list(grad_counts) + list(drop_counts)))

        best_o_grad = None
        best_o_grad_value = 0
        best_o_drop = None
        best_o_drop_value = 0

        for o in options:
            # Share among graduates minus share among dropouts: positive values speak for
            # "Graduate", negative values for "Dropout".
            diff = grad_perc.get(o, 0) - drop_perc.get(o, 0)
            if diff > best_o_grad_value:
                best_o_grad_value = diff
                best_o_grad = o
            elif diff < best_o_drop_value:
                best_o_drop_value = diff
                best_o_drop = o

        if best_o_grad is not None:
            explanations_grads.append((column, best_o_grad, best_o_grad_value))
        if best_o_drop is not None:
            explanations_drops.append((column, best_o_drop, abs(best_o_drop_value)))

        print("Grads should be", best_o_grad)
        print("Drops should be", best_o_drop)

    print("-")

explanations_grads = sorted(explanations_grads, key = lambda x:x[2], reverse = True)
explanations_drops = sorted(explanations_drops, key = lambda x:x[2], reverse = True)

# Finally, we add these explanations to the main dict, as a little dict per column where we
# can look up what the correct option should be for grad/drop respectively.
# Every column that produced any candidate gets both keys up front, so a column with only a
# "Dropout" candidate no longer raises a KeyError and later lookups always find both keys.
for column, _, _ in explanations_grads + explanations_drops:
    explanations[column] = {"GRADUATE": ["na", 0], "DROPOUT": ["na", 0]}

for column, option, strength in explanations_grads:
    if strength > CAT_DIFF_CUTOFF:
        explanations[column]["GRADUATE"] = [option, strength]
for column, option, strength in explanations_drops:
    if strength > CAT_DIFF_CUTOFF:
        explanations[column]["DROPOUT"] = [option, strength]

# These values are reduced manually only for the majority option, to not always have this as
# the default explanation.
explanations["Debtor"]["GRADUATE"][1]=0.151
explanations['Tuition fees up to date']["GRADUATE"][1]=0.151
explanations["Daytime/evening attendance"]["DROPOUT"] = ["na", 0]

pprint.pprint(explanations)

Marital status
Grads should be Divorced
Drops should be Married
-
Application mode
Grads should be Change in course
Drops should be 2nd phase—general contingent
-
-
Course
Grads should be Communication Design
Drops should be Basic Education
-
Daytime/evening attendance
Grads should be evening
Drops should be daytime
-
Previous qualification
Grads should be Secondary education
Drops should be Basic education 3rd cycle (9th/10th/11th year) or equivalent
-
Nationality
Grads should be None
Drops should be None
-
Mother's qualification
Grads should be Higher Education—master’s degree
Drops should be 2nd cycle of the general high school course
-
Father's qualification
Grads should be Higher Education—degree
Drops should be Higher Education—bachelor’s degree
-
Mother's occupation
Grads should be Specialists in Intellectual and Scientific Activities
Drops should be Student
-
Father's occupation
Grads should be Intermediate Level Technicians and Professions
Drops should be Administrative staff


In [70]:
# In order to generate the explanations for each case, we look at the numerical columns of
# that case. For every relevant column we compute the share of students whose value lies
# below (or above) this student's value, where the direction is chosen so that it supports
# the prediction made by the AI. The column(s) with the largest share are kept.
#
# Then we check for which of the categorical columns the student's category matches the
# explaining category stored in `explanations` for their prediction.
#
# This cell reads `df` and `explanations` as created by earlier cells.

# Number of students, used to turn counts into shares (derived from the data rather than
# hard-coded so that it stays correct when the sample changes).
n_students = len(df)

x_per_person_num = {}
x_per_person_cat = {}

for index, row in df.iterrows():

    # track which cols have the highest share of other students below/above this one
    most_x_cols = []
    most_x_percent = 0

    # track which cols are in the explaining category for the prediction
    cat_x_cols = []

    print(row["Target"], row["AI prediction"])

    predicted_graduate = row["AI prediction"] == "Graduate"
    prediction_key = row["AI prediction"].upper()

    for column in df:

        if column not in explanations or explanations[column] == "na":
            continue

        explanation = explanations[column]

        if explanation in ["high", "low"]:

            # The high/low label describes what we want to see in order to predict
            # "Graduate". For any other prediction the direction is reversed.
            # If high values support the prediction we count the students below this one,
            # otherwise the students above.
            count_below = predicted_graduate == (explanation == "high")
            if count_below:
                n_others = int((df[column] < row[column]).sum())
            else:
                n_others = int((df[column] > row[column]).sum())
            percent = n_others / n_students

            if percent > most_x_percent:
                most_x_percent = percent
                most_x_cols = [column]
            elif percent == most_x_percent:
                most_x_cols.append(column)

        # For the categorical explanations, we only have to check for which ones the student
        # is in the explaining category according to their prediction.
        else:
            # A missing entry for this prediction is treated like "no explanation".
            explaining_cat = explanation.get(prediction_key, ["na", 0])
            if explaining_cat[0] == row[column]:
                cat_x_cols.append([column, explaining_cat[0], explaining_cat[1]])

    print(most_x_cols, most_x_percent)
    x_per_person_num[index] = most_x_cols

    print(cat_x_cols)
    # If there are more than two cat cols, we limit them to the two strongest, but keep
    # additional ones that tie exactly with the second strongest.
    if len(cat_x_cols) > 2:
        cat_x_cols = sorted(cat_x_cols, key = lambda x:x[2], reverse = True)
        cutoff_value = cat_x_cols[1][2]
        cat_x_cols = [c for c in cat_x_cols if c[2]>=cutoff_value]
        print(cat_x_cols)
    x_per_person_cat[index] = cat_x_cols

    print("---")

Graduate Graduate
['Curricular units 2nd sem passed'] 0.9333333333333333
[['Previous qualification', 'Secondary education', 0.15909090909090906], ["Father's occupation", 'Intermediate Level Technicians and Professions', 0.36363636363636365], ['Debtor', 'no', 0.151], ['Tuition fees up to date', 'yes', 0.151]]
[["Father's occupation", 'Intermediate Level Technicians and Professions', 0.36363636363636365], ['Previous qualification', 'Secondary education', 0.15909090909090906]]
---
Dropout Dropout
['Total exams across all classes in 1st sem', 'Curricular units 1st sem grade', 'Total exams across all classes in 2nd sem', 'Curricular units 2nd sem passed'] 0.9333333333333333
[['Course', 'Basic Education', 0.25], ["Mother's qualification", '2nd cycle of the general high school course', 0.25], ['Debtor', 'yes', 0.25], ['Tuition fees up to date', 'no', 0.25]]
[['Course', 'Basic Education', 0.25], ["Mother's qualification", '2nd cycle of the general high school course', 0.25], ['Debtor', 'yes', 

In [71]:
# Attach the per-student explanations to the frame; each dict is keyed by the row index,
# so the resulting Series aligns with `df` on its index.
for new_column, per_student in (
    ("explanation_num", x_per_person_num),
    ("explanation_cat", x_per_person_cat),
):
    df[new_column] = pd.Series(per_student)
df

Unnamed: 0_level_0,Marital status,Application mode,University's position in preferences when applying,Course,Daytime/evening attendance,Previous qualification,Nationality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem passed,Curricular units 2nd sem grade,Curricular units 2nd sem without exams,Unemployment rate at enrollment,Inflation rate at enrollment,GDP at enrollment,Target,AI prediction,explanation_num,explanation_cat
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
648,Single,Change in course,2,Oral Hygiene,daytime,Secondary education,Portuguese,Secondary Education—12th Year of Schooling or ...,Basic Education 3rd Cycle (9th/10th/11th Year)...,Administrative staff,...,8,13.2125,0,10.8,1.4,1.74,Graduate,Graduate,[Curricular units 2nd sem passed],"[[Father's occupation, Intermediate Level Tech..."
2781,Single,1st phase—general contingent,1,Basic Education,daytime,Secondary education,Portuguese,2nd cycle of the general high school course,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,1,11.0,0,10.8,1.4,1.74,Dropout,Dropout,"[Total exams across all classes in 1st sem, Cu...","[[Course, Basic Education, 0.25], [Mother's qu..."
2883,Married,Over 23 years old,1,Social Service (evening attendance),evening,Secondary education,Portuguese,General Course of Administration and Commerce,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,6,11.5,0,8.9,1.4,3.51,Graduate,Graduate,"[Total exams across all classes in 1st sem, To...","[[Daytime/evening attendance, evening, 0.18181..."
266,Single,2nd phase—general contingent,4,Nursing,daytime,Secondary education,Portuguese,Higher Education—master’s degree,Secondary Education—12th Year of Schooling or ...,Specialists in Intellectual and Scientific Act...,...,7,14.828571,0,12.7,3.7,-1.7,Graduate,Graduate,[Curricular units 1st sem enrolled],"[[Father's occupation, Intermediate Level Tech..."
2537,Single,1st phase—general contingent,1,Advertising and Marketing Management,daytime,Secondary education,Portuguese,General commerce course,Basic Education 3rd Cycle (9th/10th/11th Year)...,"Skilled Workers in Industry, Construction, and...",...,6,15.833333,0,12.4,0.5,1.79,Graduate,Graduate,[Curricular units 2nd sem grade],"[[Previous qualification, Secondary education,..."
2791,Single,1st phase—general contingent,1,Veterinary Nursing,daytime,Secondary education,Portuguese,Supplementary Accounting and Administration,Basic Education 3rd Cycle (9th/10th/11th Year)...,Unskilled Workers,...,6,14.142857,0,11.1,0.6,2.02,Graduate,Graduate,[Curricular units 1st sem grade],"[[Father's occupation, Intermediate Level Tech..."
1576,Married,Over 23 years old,2,Advertising and Marketing Management,daytime,Basic education 3rd cycle (9th/10th/11th year)...,Portuguese,General Course of Administration and Commerce,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,4,13.75,0,12.4,0.5,1.79,Dropout,Dropout,[Age at enrollment],"[[Marital status, Married, 0.1590909090909091]..."
988,Single,1st phase—general contingent,1,Social Service,daytime,Secondary education,Portuguese,Higher Education—bachelor’s degree,Higher Education—bachelor’s degree,Intermediate Level Technicians and Professions,...,2,10.0,0,11.1,0.6,2.02,Dropout,Dropout,[Curricular units 2nd sem grade],"[[Father's qualification, Higher Education—bac..."
3232,Single,1st phase—general contingent,2,Journalism and Communication,daytime,Secondary education,Portuguese,General commerce course,Basic Education 2nd Cycle (6th/7th/8th Year) o...,Intermediate Level Technicians and Professions,...,5,12.6,0,12.7,3.7,-1.7,Graduate,Graduate,"[Age at enrollment, Total exams across all cla...","[[Previous qualification, Secondary education,..."
3214,Single,1st phase—general contingent,4,Communication Design,daytime,Secondary education,Portuguese,Higher Education—degree,Higher Education—degree,Specialists in Intellectual and Scientific Act...,...,6,15.166667,0,9.4,-0.8,-3.12,Graduate,Graduate,[University's position in preferences when app...,"[[Course, Communication Design, 0.181818181818..."


In [72]:
# Write the annotated sample to disk. `index` is a boolean switch (the original passed the
# string "Index", which only worked because it is truthy); the header of the index column
# is set through `index_label`.
df.to_csv("sample_x.csv", index=True, index_label="Index")