In [1]:
import numpy as np
import pandas as pd
import pprint
import matplotlib.pyplot as plt
from scipy import stats
from collections import Counter

In [2]:
df = pd.read_csv("sample_string.csv", index_col = "Index")
df

Unnamed: 0_level_0,Marital status,Application mode,University's position in preferences when applying,Course,Daytime/evening attendance,Previous qualification,Nationality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem enrolled,Total exams across all classes in 2nd sem,Curricular units 2nd sem passed,Curricular units 2nd sem grade,Curricular units 2nd sem without exams,Unemployment rate at enrollment,Inflation rate at enrollment,GDP at enrollment,Target,AI prediction
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
648,Single,Change in course,2,Oral Hygiene,daytime,Secondary education,Portuguese,Secondary Education—12th Year of Schooling or ...,Basic Education 3rd Cycle (9th/10th/11th Year)...,Administrative staff,...,8,10,8,13.2125,0,10.8,1.4,1.74,Graduate,Graduate
2781,Single,1st phase—general contingent,1,Basic Education,daytime,Secondary education,Portuguese,2nd cycle of the general high school course,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,6,14,1,11.0,0,10.8,1.4,1.74,Dropout,Dropout
2883,Married,Over 23 years old,1,Social Service (evening attendance),evening,Secondary education,Portuguese,General Course of Administration and Commerce,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,6,8,6,11.5,0,8.9,1.4,3.51,Graduate,Graduate
266,Single,2nd phase—general contingent,4,Nursing,daytime,Secondary education,Portuguese,Higher Education—master’s degree,Secondary Education—12th Year of Schooling or ...,Specialists in Intellectual and Scientific Act...,...,8,8,7,14.828571,0,12.7,3.7,-1.7,Graduate,Graduate
2537,Single,1st phase—general contingent,1,Advertising and Marketing Management,daytime,Secondary education,Portuguese,General commerce course,Basic Education 3rd Cycle (9th/10th/11th Year)...,"Skilled Workers in Industry, Construction, and...",...,6,6,6,15.833333,0,12.4,0.5,1.79,Graduate,Graduate
2791,Single,1st phase—general contingent,1,Veterinary Nursing,daytime,Secondary education,Portuguese,Supplementary Accounting and Administration,Basic Education 3rd Cycle (9th/10th/11th Year)...,Unskilled Workers,...,6,8,6,14.142857,0,11.1,0.6,2.02,Graduate,Graduate
1576,Married,Over 23 years old,2,Advertising and Marketing Management,daytime,Basic education 3rd cycle (9th/10th/11th year)...,Portuguese,General Course of Administration and Commerce,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,6,9,4,13.75,0,12.4,0.5,1.79,Dropout,Dropout
988,Single,1st phase—general contingent,1,Social Service,daytime,Secondary education,Portuguese,Higher Education—bachelor’s degree,Higher Education—bachelor’s degree,Intermediate Level Technicians and Professions,...,6,12,2,10.0,0,11.1,0.6,2.02,Dropout,Dropout
3232,Single,1st phase—general contingent,2,Journalism and Communication,daytime,Secondary education,Portuguese,General commerce course,Basic Education 2nd Cycle (6th/7th/8th Year) o...,Intermediate Level Technicians and Professions,...,6,6,5,12.6,0,12.7,3.7,-1.7,Graduate,Graduate
3214,Single,1st phase—general contingent,4,Communication Design,daytime,Secondary education,Portuguese,Higher Education—degree,Higher Education—degree,Specialists in Intellectual and Scientific Act...,...,6,6,6,15.166667,0,9.4,-0.8,-3.12,Graduate,Graduate


In [7]:
# generating somewhat sensible explanations

explanations = {}

# firstly, lets look at numerical values.

# We will check for each column, if in order to predict "graduate" (as opposodes to "dropout") 
# the value should be high or low. 
# where there is no or barely any difference, we will consider the column irrelevant ("na"). 
# after visual explanations, the reasoning for all numerical columns seems to make sense.

df_grad = df[df["Target"] == "Graduate"]
df_drop = df[df["Target"] == "Dropout"]

for column in df:
    
    if df.dtypes[column] in ["int", "float64"]:
        
        grad_mean = np.mean(df_grad[column])
        drop_mean = np.mean(df_drop[column])
        
        if np.abs(grad_mean - drop_mean) < 0.5:
            explanations[column] = "na"
        else:
            if grad_mean > drop_mean:
                explanations[column] = "high"
            else:
                explanations[column] = "low"
        
        print(column)
        print("Graduates:", grad_mean)
        print("Dropouts:", drop_mean)
        print(explanations[column])
        print()

        
print("---")
        
for column in df:
    
    if not df.dtypes[column] in ["int", "float64"]:
        
        print(column)
        grad_counts = Counter(df_grad[column])
        drop_counts = Counter(df_drop[column])
        print("Graduates:", [item:grad_counts[item] for item in grad_counts])
        print("Dropouts:", )

    print()

print("---")
explanations["GDP at enrollment"] = "na" # I don't want to have this in the explanations
pprint.pprint(explanations)

University's position in preferences when applying
Graduates: 2.1818181818181817
Dropouts: 1.25
high

Age at enrollment
Graduates: 23.272727272727273
Dropouts: 26.0
low

Curricular units 1st sem recognized from previous education or work
Graduates: 0.0
Dropouts: 0.0
na

Curricular units 1st sem enrolled
Graduates: 6.2727272727272725
Dropouts: 5.75
high

Total exams across all classes in 1st sem
Graduates: 7.2727272727272725
Dropouts: 9.5
low

Curricular units 1st sem passed
Graduates: 6.0
Dropouts: 4.0
high

Curricular units 1st sem grade
Graduates: 13.401298701298701
Dropouts: 11.583333333333334
high

Curricular units 1st sem without exams
Graduates: 0.0
Dropouts: 0.0
na

Curricular units 2nd sem recognized from previous education or work
Graduates: 0.09090909090909091
Dropouts: 0.0
na

Curricular units 2nd sem enrolled
Graduates: 6.363636363636363
Dropouts: 5.75
high

Total exams across all classes in 2nd sem
Graduates: 7.818181818181818
Dropouts: 11.0
low

Curricular units 2nd sem p

TypeError: unhashable type: 'list'

In [4]:
# in order to generate the explanations for each case, we look at the numerical columns of that case.
# we check if this person is particularly high or low compared to the mean over all people, 
# and whether the direction of high/low corresponds to the prediction by the AI.

x_per_person_num = {}

for index, row in df.iterrows():
    
    # track which rows have the highest percent of other peoples lower/higher this one
    most_x_cols = []
    most_x_percent = 0
    
    print(row["Target"], row["AI prediction"])
    
    for column in df:
        
        # the high/low goal in the explanations dict generated above corresponds to what we want to see
        # in order to predict "graduate". That means if the prediction of the AI is "dropout", it should be
        # the opposite.
        
        # check how many others are below this person if the goal is high, or above if goal is low
        if column in explanations and explanations[column] in ["high", "low"]:
            
            #print(column, ":", row[column])
            #print(np.mean(df[column]))

            if row["AI prediction"] == "Graduate":
                if explanations[column] == "high":
                    #print("is this high? G")
                    percent = len([o for o in df[column] if o < row[column]]) / 15
                else:
                    #print("is this low? G")
                    percent = len([o for o in df[column] if o > row[column]]) / 15
            else:
                if explanations[column] == "high":
                    #print("is this low? D")
                    percent = len([o for o in df[column] if o > row[column]]) / 15
                else:
                    #print("is this high? D")
                    percent = len([o for o in df[column] if o < row[column]]) / 15
                    
            #print(percent)
            
            if percent > most_x_percent:
                most_x_percent = percent
                most_x_cols = [column]
            elif percent == most_x_percent:
                most_x_cols.append(column)
                
        #print("-")
                
    print(most_x_cols, most_x_percent)
    x_per_person_num[index] = most_x_cols
            
    print("---")

Graduate Graduate
['Curricular units 2nd sem passed'] 0.9333333333333333
---
Dropout Dropout
['Total exams across all classes in 1st sem', 'Curricular units 1st sem grade', 'Total exams across all classes in 2nd sem', 'Curricular units 2nd sem passed'] 0.9333333333333333
---
Graduate Graduate
['Total exams across all classes in 1st sem', 'Total exams across all classes in 2nd sem'] 0.4666666666666667
---
Graduate Graduate
['Curricular units 1st sem enrolled'] 0.9333333333333333
---
Graduate Graduate
['Curricular units 2nd sem grade'] 0.9333333333333333
---
Graduate Graduate
['Curricular units 1st sem grade'] 0.9333333333333333
---
Dropout Dropout
['Age at enrollment'] 0.9333333333333333
---
Dropout Dropout
['Curricular units 2nd sem grade'] 0.9333333333333333
---
Graduate Graduate
['Age at enrollment', 'Total exams across all classes in 2nd sem'] 0.6666666666666666
---
Graduate Graduate
["University's position in preferences when applying", 'Curricular units 2nd sem grade'] 0.8
---
Gra

In [5]:
df["explanation_num"] = pd.Series(x_per_person_num)
df

Unnamed: 0_level_0,Marital status,Application mode,University's position in preferences when applying,Course,Daytime/evening attendance,Previous qualification,Nationality,Mother's qualification,Father's qualification,Mother's occupation,...,Total exams across all classes in 2nd sem,Curricular units 2nd sem passed,Curricular units 2nd sem grade,Curricular units 2nd sem without exams,Unemployment rate at enrollment,Inflation rate at enrollment,GDP at enrollment,Target,AI prediction,explanation_num
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
648,Single,Change in course,2,Oral Hygiene,daytime,Secondary education,Portuguese,Secondary Education—12th Year of Schooling or ...,Basic Education 3rd Cycle (9th/10th/11th Year)...,Administrative staff,...,10,8,13.2125,0,10.8,1.4,1.74,Graduate,Graduate,[Curricular units 2nd sem passed]
2781,Single,1st phase—general contingent,1,Basic Education,daytime,Secondary education,Portuguese,2nd cycle of the general high school course,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,14,1,11.0,0,10.8,1.4,1.74,Dropout,Dropout,"[Total exams across all classes in 1st sem, Cu..."
2883,Married,Over 23 years old,1,Social Service (evening attendance),evening,Secondary education,Portuguese,General Course of Administration and Commerce,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,8,6,11.5,0,8.9,1.4,3.51,Graduate,Graduate,"[Total exams across all classes in 1st sem, To..."
266,Single,2nd phase—general contingent,4,Nursing,daytime,Secondary education,Portuguese,Higher Education—master’s degree,Secondary Education—12th Year of Schooling or ...,Specialists in Intellectual and Scientific Act...,...,8,7,14.828571,0,12.7,3.7,-1.7,Graduate,Graduate,[Curricular units 1st sem enrolled]
2537,Single,1st phase—general contingent,1,Advertising and Marketing Management,daytime,Secondary education,Portuguese,General commerce course,Basic Education 3rd Cycle (9th/10th/11th Year)...,"Skilled Workers in Industry, Construction, and...",...,6,6,15.833333,0,12.4,0.5,1.79,Graduate,Graduate,[Curricular units 2nd sem grade]
2791,Single,1st phase—general contingent,1,Veterinary Nursing,daytime,Secondary education,Portuguese,Supplementary Accounting and Administration,Basic Education 3rd Cycle (9th/10th/11th Year)...,Unskilled Workers,...,8,6,14.142857,0,11.1,0.6,2.02,Graduate,Graduate,[Curricular units 1st sem grade]
1576,Married,Over 23 years old,2,Advertising and Marketing Management,daytime,Basic education 3rd cycle (9th/10th/11th year)...,Portuguese,General Course of Administration and Commerce,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,...,9,4,13.75,0,12.4,0.5,1.79,Dropout,Dropout,[Age at enrollment]
988,Single,1st phase—general contingent,1,Social Service,daytime,Secondary education,Portuguese,Higher Education—bachelor’s degree,Higher Education—bachelor’s degree,Intermediate Level Technicians and Professions,...,12,2,10.0,0,11.1,0.6,2.02,Dropout,Dropout,[Curricular units 2nd sem grade]
3232,Single,1st phase—general contingent,2,Journalism and Communication,daytime,Secondary education,Portuguese,General commerce course,Basic Education 2nd Cycle (6th/7th/8th Year) o...,Intermediate Level Technicians and Professions,...,6,5,12.6,0,12.7,3.7,-1.7,Graduate,Graduate,"[Age at enrollment, Total exams across all cla..."
3214,Single,1st phase—general contingent,4,Communication Design,daytime,Secondary education,Portuguese,Higher Education—degree,Higher Education—degree,Specialists in Intellectual and Scientific Act...,...,6,6,15.166667,0,9.4,-0.8,-3.12,Graduate,Graduate,[University's position in preferences when app...


In [6]:
df.to_csv("sample_x-num.csv", index = "Index")