In [1]:
# from aif360.datasets import LawSchoolGPADataset

# # Load the dataset
# dataset = LawSchoolGPADataset()

# # Convert to pandas DataFrame
# df, _ = dataset.convert_to_dataframe()

In [7]:
import pandas as pd
df = pd.read_csv("./data/lsac.csv")

In [8]:
# List of columns to use (including the target variable 'pass_bar')
data_headers = [
    "decile1b", "decile3", "decile1", "sex", "race", "cluster",
    "lsat", "ugpa", "zfygpa", "dob_yr", "grad", "zgpa",
    "fulltime", "fam_inc", "parttime", "tier", "indxgrp2", "bar"
]

In [9]:
# Remove rows with missing values in any of the specified columns
df = df.dropna(subset=data_headers)

# Remove rows with blank or empty strings in any of the specified columns
df = df[~df[data_headers].isin(['', ' ']).any(axis=1)]

In [10]:
# Map 'sex' from {'1': 0, '2': 1}
df['sex'] = df['sex'].map({'1': 0, '2': 1})

# Map 'grad' from {'N': 0, 'O': 1, 'X': 2, 'Y': 3}
df['grad'] = df['grad'].map({'N': 0, 'O': 1, 'X': 2, 'Y': 3})

# Map 'indxgrp2' categories to integers
df['indxgrp2'] = df['indxgrp2'].map({
    'a under 400': 0, 'b 400-460': 1, 'c 460-520': 2,
    'd 520-580': 3, 'e 580-640': 4, 'f 640-700': 5,
    'g 700-760': 6, 'h 760-820': 7, 'i 820+': 8
})

# Map 'bar' (target variable) to 'pass_bar'
df['pass_bar'] = df['bar'].map({'0': 0, '1': 1, '': 0})

In [11]:
# Updated list of columns to use (19 features + target variable 'pass_bar')
final_columns = [
    "decile1b", "decile3", "decile1", "sex", "race", "cluster",
    "lsat", "ugpa", "zfygpa", "dob_yr", "grad", "zgpa",
    "fulltime", "fam_inc", "parttime", "tier", "indxgrp2",
    "s_public", "b_public",  # Additional features to reach 19
    "pass_bar"  # Target variable
]

# Select the updated set of columns
df = df[final_columns]

KeyError: "['s_public', 'b_public'] not in index"

In [None]:
# Verify the number of features
num_features = df.drop('pass_bar', axis=1).shape[1]
print(f"Number of features (excluding target): {num_features}")  # Should be 19

# Verify the dataset size
print(f"Dataset size: {len(df)}")

In [6]:
# Dataset size
print(f"Dataset size: {len(df)}")

# Positive and negative examples
positive_examples = df[df['pass_bar'] == 1].shape[0]
negative_examples = df[df['pass_bar'] == 0].shape[0]
print(f"Positive examples: {positive_examples}")
print(f"Negative examples: {negative_examples}")

Dataset size: 22329
Positive examples: 0
Negative examples: 0


In [4]:
# Display the first few rows of the dataset
print(df.head())

   decile1b  decile3  id  decile1 sex race cluster lsat ugpa zfygpa  ...  \
0       NaN      NaN   1      1.0   1    7       2   30  3.1  -1.79  ...   
1      10.0     10.0   2     10.0   1    7       1   44  3.5   1.33  ...   
2       5.0      4.0   3      5.0   1    7       2   29  3.5  -0.11  ...   
3       9.0      9.0   4      9.0   1    7       3   35    3   1.22  ...   
4       NaN      NaN   5      9.0   1    7       2   39  2.9   0.88  ...   

  other asian black hisp pass_bar                bar tier           index604  \
0     0     0     0    0                         NaN    2          625.78946   
1     0     0     0    0        1  a Passed 1st time    4         886.842082   
2     0     0     0    0        1  a Passed 1st time    2  649.9999869999999   
3     0     0     0    0            d Never took bar    3         694.736825   
4     0     0     0    0                         NaN    2  747.8947169999999   

     indxgrp   indxgrp2  
0  e 580-640  e 580-640  
1     g 70

In [5]:
df.shape

(27478, 36)

In [12]:
# Check data types of each column (this helps identify categorical vs numerical features)
print(df.dtypes)

decile1b    float64
decile3     float64
id            int64
decile1     float64
sex          object
race         object
cluster      object
lsat         object
ugpa         object
zfygpa       object
dob_yr       object
grad         object
zgpa         object
bar1         object
bar1_yr      object
bar2         object
bar2_yr      object
fulltime     object
fam_inc      object
age          object
gender       object
parttime     object
male         object
race1        object
race2        object
dropout      object
other         int64
asian         int64
black         int64
hisp          int64
pass_bar     object
bar          object
tier         object
index604     object
indxgrp      object
indxgrp2     object
dtype: object


In [17]:
# Check for missing (NaN) values in each column
print(df.isna().sum())

decile1b    4509
decile3     4509
id             0
decile1     2324
sex            0
race           0
cluster        0
lsat           0
ugpa           0
zfygpa         0
dob_yr         0
grad          15
zgpa           0
bar1        4375
bar1_yr        0
bar2        4375
bar2_yr        0
fulltime       0
fam_inc        0
age            0
gender         6
parttime       0
male           0
race1         20
race2         20
dropout        0
other          0
asian          0
black          0
hisp           0
pass_bar       0
bar          108
tier           0
index604       0
indxgrp        0
indxgrp2       0
dtype: int64


In [18]:
# Get summary statistics of numerical features
print(df.describe())

           decile1b       decile3            id       decile1         other  \
count  22969.000000  22969.000000  27478.000000  25154.000000  27478.000000   
mean       5.501023      5.501023  13739.500000      5.500875      0.019215   
std        2.872404      2.872404   7932.359685      2.872311      0.137284   
min        1.000000      1.000000      1.000000      1.000000      0.000000   
25%        3.000000      3.000000   6870.250000      3.000000      0.000000   
50%        6.000000      6.000000  13739.500000      6.000000      0.000000   
75%        8.000000      8.000000  20608.750000      8.000000      0.000000   
max       10.000000     10.000000  27478.000000     10.000000      1.000000   

              asian         black          hisp  
count  27478.000000  27478.000000  27478.000000  
mean       0.041924      0.068200      0.005277  
std        0.200420      0.252093      0.072452  
min        0.000000      0.000000      0.000000  
25%        0.000000      0.000000     

In [19]:
# Check unique values in each column (this can help identify categorical features)
for col in df.columns:
    print(f"Unique values in {col}: {df[col].unique()}")

Unique values in decile1b: [nan 10.  5.  9.  8.  3.  1.  4.  2.  7.  6.]
Unique values in decile3: [nan 10.  4.  9.  7.  2.  1.  3.  5.  8.  6.]
Unique values in id: [    1     2     3 ... 27476 27477 27478]
Unique values in decile1: [ 1. 10.  5.  9.  8.  3. nan  4.  2.  6.  7.]
Unique values in sex: ['1' '2' ' ']
Unique values in race: ['7' '2' '3' '4' '6' '8' '1' '5' ' ']
Unique values in cluster: ['2' '1' '3' '4' '5' '6' ' ']
Unique values in lsat: ['30' '44' '29' '35' '39' '37' '43' '41' '24.5' '34' '40' '31' '17' '33'
 '28' '26' '32.5' '23' '22' '48' '36' '20' '45' '27' '47' '46' '42' '38'
 '32' '25' '31.5' '33.5' '34.5' '19.5' ' ' '41.5' '28.3' '23.5' '25.5'
 '24' '35.5' '37.7' '32.3' '39.5' '21' '30.7' '40.5' '36.5' '30.5' '18'
 '26.5' '22.5' '35.3' '29.3' '37.5' '38.5' '29.5' '27.5' '24.7' '19'
 '18.3' '45.5' '42.5' '31.3' '33.3' '30.3' '28.5' '43.5' '17.5' '29.7'
 '21.5' '19.7' '32.7' '23.3' '28.7' '18.5' '24.3' '14' '38.3' '34.3'
 '26.7' '14.5' '22.7' '22.3' '15' '16' '38.7' 