In [95]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [66]:
df = pd.read_csv('/Users/toddhendricks/Downloads/2013_01_master.csv')
df.sample(5)

Unnamed: 0,caseCode,caseName,trialCourt,trialDocket,injReliefOnly,probability,consCase,numCompl,eeocComplAvailable,privPartyComplAvailable,...,sysGoals,sysBenPersEEOC,sysNumCompl,sysNumRecMon,sysDamages,damages_sum,source_damages_sum,cpi,damages_adj,criteria
819,EE-KS-0008,EEOC v. GMRI INC (d/b/a RED LOBSTER),D. Kan.,2:03-cv-02489-KHV-DJW,no,0.450718,no,1.0,yes,yes,...,0,0,0,0,0,60000.0,amtDefPays,0.91124,65844.359375,0
1563,EE-NY-0188,EEOC v. ZALE CORPORATION et al,E.D. N.Y.,2:01-cv-01541-RJD-ETB,no,1.0,no,6.0,no,no,...,0,0,0,0,0,250000.0,amtDefPays,0.867824,288076.6875,0
1796,EE-TN-0029,"EEOC v. CROSSGATE SERVICES, INC.",W.D. Tenn.,04 2644-BA,no,1.0,no,2.0,no,no,...,0,0,0,0,0,45000.0,amtDefPays,0.942113,47764.976562,0
2117,EE-TX-0384,"EEOC v. UNITED PARCEL SERVICE, INC.",W.D. Tex.,M098-CA-157,no,0.450718,no,,no,no,...,0,0,0,0,0,20000.0,amtReliefEEOC,0.803666,24885.955078,0
893,EE-LA-0084,"EEOC v. TRAC-WORK, INC.",W.D. La.,2:04-cv-01981-PM-APW,no,0.450718,no,1.0,yes,yes,...,0,0,0,0,0,11000.0,amtDefPays,0.942113,11675.883789,0


In [67]:
df.columns.values

array(['caseCode', 'caseName', 'trialCourt', 'trialDocket',
       'injReliefOnly', 'probability', 'consCase', 'numCompl',
       'eeocComplAvailable', 'privPartyComplAvailable',
       'fullDocMonAvailable', 'fullDocInjAvailable', 'judge1Id',
       'judge2Id', 'judge3Id', 'judge4Id', 'judge5Id', 'dateFiled',
       'secondFiling', 'numDocketEntries', 'firstAnswerDate',
       'answerEEOCDate', 'finalResDate', 'finalResType', 'sepPrivResDate',
       'sepPrivResolution', 'appealNotice', 'appealDate',
       'appealRemandDate', 'appealOutcome', 'remandResult', 'eeocOffice1',
       'eeocOffice2', 'regionalAtty1', 'regionalAtty2', 'eeocIntervened',
       'ppComplainants', 'plaintiffPrivCounsel', 'plaintiffProSeCounsel',
       'plaintiffPubIntCounsel', 'plaintiffFirmOrg1', 'plaintiffFirmOrg2',
       'plaintiffFirmOrg3', 'ADABasis', 'ADEABasis', 'EPABasis',
       'titleVIIBasis', 'allegedRaceDiscrim', 'discrimRace',
       'typeRaceOther', 'allegedColorDiscrim', 'allegedNatOrigDiscrim

In [68]:
df['titleVIIBasis'].value_counts(normalize=True)

yes    0.827042
no     0.172958
Name: titleVIIBasis, dtype: float64

In [69]:
civil_rights = df[df['titleVIIBasis'] == 'yes']

In [70]:
civil_rights['finalResType'].value_counts()

consent judgment                                  1087
voluntary dismissal-settlement                     122
none as of 04/22/08                                 71
defendants summary judgment                         55
default judgment                                    33
plaintiffs jury verdict                             32
defendants jury verdict                             27
voluntary dismissal-non-settlement                  21
plaintiffs bench verdict                             6
plaintiffs summary judgment                          5
defendants judgment as a matter of law               3
involuntary dismissal-other                          3
involuntary dismissal-failure to prosecute           1
involuntary dismissal-failure to state a claim       1
judgment on pleadings                                1
Name: finalResType, dtype: int64

In [71]:
def create_target_variable(row):
    val = 'N/A'
    if row == 'defendants summary judgment':
        val = 0
    elif row == 'defendants jury verdict':
        val = 0
    elif row == 'defendants judgment as a matter of law':
        val = 0
    elif row == 'plaintiffs jury verdict':
        val = 1
    elif row == 'plaintiffs bench verdict':
        val = 1
    elif row == 'plaintiffs summary judgment':
        val = 1
    elif row == 'consent judgment':
        val = 1
    return val

In [72]:
civil_rights['outcome'] = civil_rights.loc[:,'finalResType'].apply(create_target_variable)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [73]:
civil_rights = civil_rights[civil_rights['outcome'] != 'N/A']
civil_rights.shape

(1215, 238)

In [75]:
liability = civil_rights[['retaliationComplaint',
       'retaliationComplaint3p', 'retaliationCooperating',
       'impactAlleged', 'practiceAlleged']]
liability = pd.get_dummies(liability)

In [76]:
eeoc_issues = civil_rights[['hiringIssue', 'testingIssue',
       'trainingIssue', 'medExamIssue', 'payIssue', 'harassmentIssue',
       'disciplineIssue', 'leaveAccomIssue', 'empCondIssue',
       'promotionIssue', 'demotionIssue', 'dischargeIssue', 'otherIssue']]
eeoc_issues = pd.get_dummies(eeoc_issues)

In [77]:
eeoc_relief = civil_rights[['payRelief', 'pecunRelief', 'nonPecunRelief',
       'punitiveRelief', 'liquidatedRelief', 'injunctRelief',
       'declRelief']]
eeoc_relief = pd.get_dummies(eeoc_relief)

In [78]:
private_issues = civil_rights[['s1981PCause', 'otherFedPCause', 'otherFed',
       'stateDiscPCause', 'otherStatePCause']]
private_issues = pd.get_dummies(private_issues)

In [79]:
private_relief = civil_rights[['payPRelief',
       'pecunPRelief', 'nonPecunPRelief', 'punitivePRelief',
       'liquidatedPRelief', 'injunctPRelief', 'declPRelief']]
private_relief = pd.get_dummies(private_relief)

In [108]:
features = pd.concat([liability, eeoc_issues, eeoc_relief, private_issues, private_relief], axis=1)
X = features
y = civil_rights['outcome']
y = y.astype('int')

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [113]:
rf = RandomForestClassifier(n_estimators=150, criterion='entropy', random_state=42)

In [114]:
rf.fit(X_train, y_train)
print("The score for Random Forest is")
print("Training: {:6.2f}%".format(100*rf.score(X_train, y_train)))
print("Test set: {:6.2f}%".format(100*rf.score(X_test, y_test)))

The score for Random Forest is
Training:  95.70%
Test set:  93.27%
