In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split , GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

from datetime import datetime
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report, precision_recall_curve
import warnings
warnings.filterwarnings('ignore')

### Download Data

In [2]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [3]:
train.drop(['WCB Decision'], axis = 1, inplace = True)

Missing Values

In [4]:
def convert_to_datetime(df, date_cols):
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

date_cols = ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date']
train = convert_to_datetime(train, date_cols)
test = convert_to_datetime(test, date_cols)

In [5]:
# median
# for metric columns
# fill with training median as it does not get influenced by extreme outliers
def NaN_median (train, test, cols):

    for col in cols:
        if pd.api.types.is_numeric_dtype(train[col]): # Numeric
            median_to_fill = train[col].median()
            train[col].fillna(median_to_fill, inplace=True)
            test[col].fillna(median_to_fill, inplace=True)
            # print(df[col].isna().sum())
            
        elif pd.api.types.is_datetime64_any_dtype(train[col]):  # Date  
            median_date_to_fill = train[col].median() 
            train[col].fillna(median_date_to_fill, inplace=True)
            test[col].fillna(median_date_to_fill, inplace=True) 
            # print(df[col].isna().sum())
            
    return train, test

In [6]:
def convert_to_int(df, int_cols):

    for col in int_cols:
        df[col] = df[col].astype('Int64')
    return df

int_cols = ['Age at Injury', 'Birth Year', 'IME-4 Count', 'Number of Dependents']
convert_to_int(train, int_cols)
convert_to_int(test, int_cols)

#we will store this as int for KNN imputer and later store as objects
float_to_object = ['Industry Code', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code']
convert_to_int(train, float_to_object)
convert_to_int(test, float_to_object)

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents
0,2022-12-24,19,N,2023-01-02,N,,2003,2023-01-02,NaT,INDEMNITY INSURANCE CO OF,...,IV,,31,"FALL, SLIP OR TRIP, NOC",10,CONTUSION,54,LOWER LEG,10466,1
1,2022-11-20,19,N,2023-01-02,N,,2003,2023-01-02,NaT,A I U INSURANCE COMPANY,...,IV,,75,FALLING OR FLYING OBJECT,10,CONTUSION,10,MULTIPLE HEAD INJURY,11691,1
2,2022-12-26,59,N,2023-01-02,N,0.0,1963,2022-12-31,NaT,AMGUARD INSURANCE COMPANY,...,III,,68,STATIONARY OBJECT,49,SPRAIN OR TEAR,62,BUTTOCKS,10604,0
3,2022-12-28,55,N,2023-01-02,N,0.0,0,2023-01-02,NaT,INDEMNITY INS. OF N AMERICA,...,IV,,25,FROM DIFFERENT LEVEL (ELEVATION),10,CONTUSION,53,KNEE,11411,6
4,2022-12-20,25,N,2023-01-02,N,0.0,1997,2022-12-31,NaT,NEW HAMPSHIRE INSURANCE CO,...,IV,,79,OBJECT BEING LIFTED OR HANDLED,40,LACERATION,37,THUMB,11212,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387970,2012-09-12,52,N,2024-06-05,N,,1960,2012-10-23,NaT,STATE INSURANCE FUND,...,I,,,,,,,,13619,5
387971,2024-05-22,59,N,2024-06-05,Y,0.0,1965,NaT,2024-05-28,NYC TRANSIT AUTHORITY,...,IV,,,,,,,,11776,1
387972,2024-05-06,45,N,2024-06-05,Y,0.0,1979,NaT,NaT,STATE INSURANCE FUND,...,IV,,,,,,,,11368,5
387973,2024-02-24,42,N,2024-06-05,Y,0.0,1981,NaT,2024-05-21,NEW YORK BLACK CAR OPERATORS',...,IV,,,,,,,,11432,5


In [7]:
NaN_median(train, test, ['Accident Date'])

NaN_median(train, test, ['Average Weekly Wage'])

NaN_median(train, test, ['C-2 Date'])

NaN_median(train, test, ['C-3 Date'])

NaN_median(train, test, ['First Hearing Date'])

(       Accident Date  Age at Injury Alternative Dispute Resolution  \
 0         2019-12-30             31                              N   
 1         2019-08-30             46                              N   
 2         2019-12-06             40                              N   
 3         2021-06-27           <NA>                            NaN   
 4         2019-12-30             61                              N   
 ...              ...            ...                            ...   
 593466    2021-06-27           <NA>                            NaN   
 593467    2022-12-13             72                              N   
 593468    2021-06-27           <NA>                            NaN   
 593469    2021-06-27           <NA>                            NaN   
 593470    2021-06-27           <NA>                            NaN   
 
        Assembly Date Attorney/Representative  Average Weekly Wage  Birth Year  \
 0         2020-01-01                       N                 0.

#### Birth Year 

In [8]:
# Replace 0 values ​​with NaN
train['Birth Year'].replace(0, np.nan, inplace=True)
test['Birth Year'].replace(0, np.nan, inplace=True)

# Replace NaN values with subtracting the Accident Year by the Age at Injury 
train['Accident Year'] = train['Accident Date'].dt.year
train['Birth_Year_2'] = train['Accident Year'] - train['Age at Injury']
train['Birth Year'] = train['Birth Year'].fillna(train['Birth_Year_2'])

test['Accident Year'] = test['Accident Date'].dt.year
test['Birth_Year_2'] = test['Accident Year'] - test['Age at Injury']
test['Birth Year'] = test['Birth Year'].fillna(test['Birth_Year_2'])

# The rest of the missing values ​​were filled in with the median
NaN_median(train, test, ['Birth Year'])

train.drop(columns=['Accident Year', 'Birth_Year_2'], inplace=True)
test.drop(columns=['Accident Year', 'Birth_Year_2'], inplace=True)

#### IME-4 Count

In [9]:
train['IME-4 Count'] = train['IME-4 Count'].fillna(0)
test['IME-4 Count'] = test['IME-4 Count'].fillna(0)

#### Zip Code

In [10]:
train['Zip Code'] = train['Zip Code'].apply(
    lambda x: x[:2] if isinstance(x, str) and len(x) == 5 and x.isdigit() else '0'
)

test['Zip Code'] = test['Zip Code'].apply(
    lambda x: x[:2] if isinstance(x, str) and len(x) == 5 and x.isdigit() else '0'
)

# Map the zip codes to integers
zip_code_map = {str(i): i for i in range(10, 100)}

train['Zip Code'] = train['Zip Code'].map(zip_code_map)
test['Zip Code'] = test['Zip Code'].map(zip_code_map)

train['Zip Code'] = train['Zip Code'].fillna(0)
test['Zip Code'] = test['Zip Code'].fillna(0)

print(train['Zip Code'].value_counts())
print()
print('NaN:', train['Zip Code'].isna().sum())

Zip Code
11.0    190002
10.0    105395
14.0     82984
0.0      81251
12.0     68102
         ...  
82.0         4
62.0         4
88.0         3
51.0         2
69.0         1
Name: count, Length: 91, dtype: int64

NaN: 0


In [11]:
train[float_to_object] = train[float_to_object].astype('object')
test[float_to_object] = test[float_to_object].astype('object')

In [12]:
train.dropna(subset=['Agreement Reached'], inplace=True)

In [13]:
#fillna just not to waste time with knn for now
train.fillna('u', inplace = True)
test.fillna('u', inplace = True)

In [14]:
train.isna().sum()

Accident Date                         0
Age at Injury                         0
Alternative Dispute Resolution        0
Assembly Date                         0
Attorney/Representative               0
Average Weekly Wage                   0
Birth Year                            0
C-2 Date                              0
C-3 Date                              0
Carrier Name                          0
Carrier Type                          0
Claim Identifier                      0
Claim Injury Type                     0
County of Injury                      0
COVID-19 Indicator                    0
District Name                         0
First Hearing Date                    0
Gender                                0
IME-4 Count                           0
Industry Code                         0
Industry Code Description             0
Medical Fee Region                    0
OIICS Nature of Injury Description    0
WCIO Cause of Injury Code             0
WCIO Cause of Injury Description      0


### Pre-Processing

In [15]:
train['Zip Code'] = train['Zip Code'].apply(
    lambda x: x[:2] if isinstance(x, str) and len(x) == 5 and x.isdigit() else '0'
)

test['Zip Code'] = test['Zip Code'].apply(
    lambda x: x[:2] if isinstance(x, str) and len(x) == 5 and x.isdigit() else '0'
)

# Map the zip codes to integers
zip_code_map = {str(i): i for i in range(10, 100)}

train['Zip Code'] = train['Zip Code'].map(zip_code_map)
test['Zip Code'] = test['Zip Code'].map(zip_code_map)

train['Zip Code'] = train['Zip Code'].fillna(0)
test['Zip Code'] = test['Zip Code'].fillna(0)

print(train['Zip Code'].value_counts())
print()
print('NaN:', train['Zip Code'].isna().sum())

Zip Code
0.0    574026
Name: count, dtype: int64

NaN: 0


In [16]:
# Convert categorical columns to string type
def convert_to_string(df, categorical_columns):
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].astype(str)
    return df

In [17]:
import statistics

def feature_engineering(df, feature_set='basic'):
    df = df.copy()
    print(f"Initial shape: {df.shape}")
    
    # Date feature engineering
    df['Assembly_to_Accident'] = (df['Assembly Date'] - df['Accident Date']).dt.days
    df['C2_to_Accident'] = (df['C-2 Date'] - df['Accident Date']).dt.days
    df['C3_to_Accident'] = (df['C-3 Date'] - df['Accident Date']).dt.days
    df['Hearing_to_Accident'] = (df['First Hearing Date'] - df['Accident Date']).dt.days
    
    # Additional feature engineering
    if 'Average Weekly Wage' in df.columns:
        # Fill missing values with the median wage
        df['Average Weekly Wage'] = df['Average Weekly Wage'].fillna(df['Average Weekly Wage'].median())
    
        # Create a temporary column for wages greater than 0
        positive_wages = df['Average Weekly Wage'] > 0
    
        # Apply qcut to positive wages only
        wage_groups = pd.qcut(
            df.loc[positive_wages, 'Average Weekly Wage'], 
            q=10, 
            labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        )
    
    # Assign the results back to the dataframe
    df['Wage_Group'] = 0  # Default to 0 (avg weekly wage = 0)
    df.loc[positive_wages, 'Wage_Group'] = wage_groups.astype(int)  # Overwrite for positive wages

    # Append additional features for 'interactions' and 'full' feature sets
    if feature_set in ['interactions', 'full']:
        # Create interaction features
        if 'Carrier Type' in df.columns and 'Industry Code' in df.columns:
            df['Carrier_Industry'] = df['Carrier Type'].astype(str) + '_' + df['Industry Code'].astype(str)
        if 'District Name' in df.columns and 'County of Injury' in df.columns:
            df['District_County'] = df['District Name'].astype(str) + '_' + df['County of Injury'].astype(str)
        
        # Create missing value indicators
        for col in df.columns:
            if df[col].isnull().sum() > 0:
                df[f'{col}_missing'] = df[col].isnull().astype(int)
    
    # assume 'df' is your DataFrame with the county names
    df['Distance to NYC'] = df['District Name'].map({
        'NYC': 0,
        'ALBANY': 155,
        'HAUPPAUGE': 45,
        'BUFFALO': 373,
        'SYRACUSE': 198,
        'ROCHESTER': 338,
        'BINGHAMTON': 173
    })

    mean_distance = df[df['District Name'] != 'STATEWIDE']['Distance to NYC'].mean()
    df.loc[df['District Name'] == 'STATEWIDE', 'Distance to NYC'] = mean_distance

    df['Modified Industry Code'] = df['Industry Code'].apply(lambda x: '32' if x in ['31', '32', '33'] 
                                                  else '44' if x in ['44', '45'] 
                                                  else '48' if x in ['48', '49'] 
                                                  else x)

    # Assuming df_train is your DataFrame
    counties = { "SUFFOLK": 45.4, "QUEENS": 8.5, "KINGS": 7.5, "NASSAU": 20.1,
            "BRONX": 10.3, "ERIE": 371.1, "NEW YORK": 0, "WESTCHESTER": 20.5,
            "MONROE": 334.8, "ORANGE": 59.5, "ONONDAGA": 194.8, "RICHMOND": 17.1,
            "ALBANY": 155.1, "DUTCHESS": 76.3, "ROCKLAND": 30.8, "SARATOGA": 143.1, 
            "NIAGARA": 373.9, "BROOME": 173.1, "ONEIDA": 203.1, "RENSSELAER": 145.9, 
            "ULSTER": 86.3, "CAYUGA": 221.9, "HERKIMER": 213.9, "CHAUTAUQUA": 407.9, 
            "ONTARIO": 264.9, "CHEMUNG": 201.9, "OSWEGO": 243.9, "FULTON": 223.1, 
            "PUTNAM": 51.9, "ST. LAWRENCE": 314.9, "JEFFERSON": 341.1, "CLINTON": 304.9, 
            "CATTARAUGUS": 371.9, "SULLIVAN": 97.3, "GENESEE": 344.9, "COLUMBIA": 120.1,
            "MADISON": 193.9, "WARREN": 194.9, "LIVINGSTON": 276.9, "DELAWARE": 137.1,
            "WASHINGTON": 204.9, "GREENE": 124.9, "ALLEGANY": 346.9, "WAYNE": 294.9,
            "CHENANGO": 181.9, "TOMPKINS": 209.9, "ORLEANS": 323.9, "SCHENECTADY": 156.1,
            "FRANKLIN": 294.9, "SENECA": 234.9, "LEWIS": 266.9, "TIOGA": 187.1, "STEUBEN": 246.9, 
            "ESSEX": 214.9, "SCHUYLER": 206.1, "OTSEGO": 165.1, "CORTLAND": 193.9, 
            "WYOMING": 313.9, "MONTGOMERY": 173.9, "SCHOHARIE": 146.1, "YATES": 243.9,"HAMILTON": 221.9
    }

    # Create a list of distances
    distances = list(counties.values())

    # Calculate the mean distance
    mean_distance = statistics.mean(distances)

    # Add the "UNKNOWN" county to the dictionary with the mean distance
    counties["UNKNOWN"] = mean_distance

    # Create a new column in the df_train DataFrame called distance_of_county
    df['distance_of_county'] = df['County of Injury'].map(counties).fillna(mean_distance)

    print(f"Shape after feature engineering: {df.shape}")
    return df

train = feature_engineering(train, 'basic')
test = feature_engineering(test, 'basic')

Initial shape: (574026, 32)
Shape after feature engineering: (574026, 40)
Initial shape: (387975, 30)
Shape after feature engineering: (387975, 38)


#### Partition

In [18]:
# Prepare the target
le = LabelEncoder()
y = le.fit_transform(train['Agreement Reached'])

In [19]:
train.drop(['Claim Injury Type'], axis = 1, inplace = True)
X = train.drop(['Agreement Reached'], axis = 1)
X_train, X_val,y_train, y_val = train_test_split(X,y,
                                                train_size = 0.7,
                                                random_state=0,
                                                shuffle = True, 
                                                stratify = y)

In [20]:
train.drop(['Agreement Reached'], axis = 1, inplace = True)

X_train_num = X_train.select_dtypes(include=np.number).set_index(X_train.index)
X_train_cat = X_train.select_dtypes(exclude=np.number).set_index(X_train.index)

# For test
X_val_num = X_val.select_dtypes(include=np.number).set_index(X_val.index)
X_val_cat = X_val.select_dtypes(exclude=np.number).set_index(X_val.index)

test_num = test.select_dtypes(include=np.number).set_index(test.index)
test_cat = test.select_dtypes(exclude=np.number).set_index(test.index)

print(X_train_num.shape)
print(X_train_cat.shape)

(401818, 14)
(401818, 24)


#### Encoding

In [21]:
# Convert all categorical columns to strings
X_train_cat = X_train_cat.astype(str)
X_val_cat = X_val_cat.astype(str)
test_cat =test_cat.astype(str)

enc1 = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train_cat_enc = enc1.fit_transform(X_train_cat)
X_val_cat_enc = enc1.transform(X_val_cat)
test_cat_enc = enc1.transform(test_cat)

X_train_cat = pd.DataFrame(X_train_cat_enc, columns=X_train_cat.columns, index=X_train_cat.index)
X_val_cat = pd.DataFrame(X_val_cat_enc, columns=X_val_cat.columns, index=X_val_cat.index)
test_cat = pd.DataFrame(test_cat_enc, columns= test_cat.columns, index= test_cat.index)

<hr>
<a class="anchor" id="scaling">
    
## 4.7 Scaling
    
</a>

In [22]:
X_train_combined = pd.concat([X_train_num, X_train_cat], axis=1)
X_val_combined = pd.concat([X_val_num, X_val_cat], axis=1)
test = pd.concat([test_num, test_cat], axis = 1)
X_train_combined.shape

(401818, 38)

In [23]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def scaler_method(method='minmax'):
    
    if method == 'minmax':
        return MinMaxScaler()
    elif method == 'standard':
        return StandardScaler()
    elif method == 'robust':
        return RobustScaler()
    else:
        raise ValueError("Invalid scaling method. Options are 'minmax' and 'standard'.")


scaler = scaler_method('standard')

X_train_combined_scaled = scaler.fit_transform(X_train_combined)
X_train_combined_scaled = pd.DataFrame(X_train_combined_scaled, index=X_train_combined.index, columns=X_train_combined.columns)

In [24]:
X_val_combined_scaled = scaler.transform(X_val_combined)
X_val_combined_scaled = pd.DataFrame(X_val_combined_scaled, index=X_val_combined.index, columns=X_val_combined.columns)

test_scaled = scaler.transform(test)
test = pd.DataFrame(test_scaled, index=test.index, columns=test.columns)
#X_test_num_scaled

#### Balancing

In [25]:
'''X_preprocessed = preprocessor.fit_transform(X_train)
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train, y_train = smote.fit_resample(X_preprocessed, y_train)
test = preprocessor.transform(test)'''

"X_preprocessed = preprocessor.fit_transform(X_train)\nsmote = SMOTE(sampling_strategy='auto', random_state=42)\nX_train, y_train = smote.fit_resample(X_preprocessed, y_train)\ntest = preprocessor.transform(test)"

In [26]:
X_train.shape

(401818, 38)

### Model

In [27]:
'''model = Pipeline([
    ('classifier', RandomForestClassifier(
        n_estimators=100,      # Number of trees
        max_depth=None,         # Maximum depth of trees
        min_samples_split=2,    # Minimum samples required to split an internal node
        min_samples_leaf=1,     # Minimum samples required at a leaf node
        max_features='auto',    # The number of features to consider for best split
        bootstrap=True,         # Whether bootstrap samples are used when building trees
        n_jobs=-1               # Use all available CPUs for training
    ))
])
'''

"model = Pipeline([\n    ('classifier', RandomForestClassifier(\n        n_estimators=100,      # Number of trees\n        max_depth=None,         # Maximum depth of trees\n        min_samples_split=2,    # Minimum samples required to split an internal node\n        min_samples_leaf=1,     # Minimum samples required at a leaf node\n        max_features='auto',    # The number of features to consider for best split\n        bootstrap=True,         # Whether bootstrap samples are used when building trees\n        n_jobs=-1               # Use all available CPUs for training\n    ))\n])\n"

In [28]:
'''# Define the pipeline
pipeline = Pipeline([
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])

# Simplified parameter grid
param_grid = {
    'classifier__n_estimators': [100, 200],  # Number of trees
    'classifier__max_depth': [None, 10],  # Maximum tree depth
}

# GridSearchCV to tune the pipeline
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='f1_macro', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best pipeline
best_pipeline = grid_search.best_estimator_

# Predict and evaluate with default threshold
y_pred = best_pipeline.predict(X_val)
y_pred_proba = best_pipeline.predict_proba(X_val)[:, 1]  # Probabilities for class 1

print("\nBest Parameters:", grid_search.best_params_)
print("\nClassification Report (Default Threshold 0.5):")
print(classification_report(y_val, y_pred))

# Adjust threshold
threshold = 0.6
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

print(f"\nClassification Report (Adjusted Threshold {threshold}):")
print(classification_report(y_val, y_pred_adjusted))

y_prob_test = model.predict_proba(test)[:, 1]  # Probabilities for class 1
test_predictions = (y_prob_test > 0.7).astype(int)
test_predictions = le.inverse_transform(test_predictions)
np.savetxt('test_Agreement_Reached.csv', test_predictions, delimiter=',', fmt='%s')'''

'# Define the pipeline\npipeline = Pipeline([\n    (\'classifier\', RandomForestClassifier(class_weight=\'balanced\', random_state=42))\n])\n\n# Simplified parameter grid\nparam_grid = {\n    \'classifier__n_estimators\': [100, 200],  # Number of trees\n    \'classifier__max_depth\': [None, 10],  # Maximum tree depth\n}\n\n# GridSearchCV to tune the pipeline\ngrid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=\'f1_macro\', cv=3, verbose=1, n_jobs=-1)\ngrid_search.fit(X_train, y_train)\n\n# Best pipeline\nbest_pipeline = grid_search.best_estimator_\n\n# Predict and evaluate with default threshold\ny_pred = best_pipeline.predict(X_val)\ny_pred_proba = best_pipeline.predict_proba(X_val)[:, 1]  # Probabilities for class 1\n\nprint("\nBest Parameters:", grid_search.best_params_)\nprint("\nClassification Report (Default Threshold 0.5):")\nprint(classification_report(y_val, y_pred))\n\n# Adjust threshold\nthreshold = 0.6\ny_pred_adjusted = (y_pred_proba >= threshold

In [29]:
'''#Logistic Regression
pipeline = Pipeline([
    ('classifier', LogisticRegression(class_weight='balanced', 
                                      random_state=42))
])

param_grid = {
    'classifier__C': [500, 5000],  # Regularization strength
    'classifier__solver': ['liblinear'],  # Solvers to use
    'classifier__penalty': ['l2'],  # Only use 'l2' for regularization
    'classifier__max_iter': [100, 200],  # Reduce iterations
    'classifier__class_weight': ['balanced'],  # Focus on class balancing
}
# GridSearchCV to tune the Logistic Regression pipeline
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='f1_macro', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best pipeline
model = grid_search.best_estimator_

# Predict probabilities for the test set
y_prob = model.predict_proba(X_val)[:, 1]  # Probabilities for class 1

# Default threshold (0.5)
y_pred_default = (y_prob > 0.5).astype(int)
print("\nBest Parameters:", grid_search.best_params_)
print("\nClassification Report (Default Threshold 0.5):")
print(classification_report(y_val, y_pred_default))

# Adjust threshold
threshold = 0.7
y_pred_adjusted = (y_prob > threshold).astype(int)
print(f"\nClassification Report (Adjusted Threshold {threshold}):")
print(classification_report(y_val, y_pred_adjusted))


y_prob_test = model.predict_proba(test)[:, 1]  # Probabilities for class 1
test_predictions = (y_prob_test > 0.7).astype(int)
test_predictions = le.inverse_transform(test_predictions)
np.savetxt('test_Agreement_Reached.csv', test_predictions, delimiter=',', fmt='%s')'''

'#Logistic Regression\npipeline = Pipeline([\n    (\'classifier\', LogisticRegression(class_weight=\'balanced\', \n                                      random_state=42))\n])\n\nparam_grid = {\n    \'classifier__C\': [500, 5000],  # Regularization strength\n    \'classifier__solver\': [\'liblinear\'],  # Solvers to use\n    \'classifier__penalty\': [\'l2\'],  # Only use \'l2\' for regularization\n    \'classifier__max_iter\': [100, 200],  # Reduce iterations\n    \'classifier__class_weight\': [\'balanced\'],  # Focus on class balancing\n}\n# GridSearchCV to tune the Logistic Regression pipeline\ngrid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=\'f1_macro\', cv=3, verbose=1, n_jobs=-1)\ngrid_search.fit(X_train, y_train)\n\n# Best pipeline\nmodel = grid_search.best_estimator_\n\n# Predict probabilities for the test set\ny_prob = model.predict_proba(X_val)[:, 1]  # Probabilities for class 1\n\n# Default threshold (0.5)\ny_pred_default = (y_prob > 0.5).astype(i

In [30]:
'''#Decision Tree
model = DecisionTreeClassifier(class_weight='balanced')'''

"#Decision Tree\nmodel = DecisionTreeClassifier(class_weight='balanced')"

In [31]:
X_train_combined_scaled.shape

(401818, 38)

In [32]:
np.unique(y_train)

array([0, 1], dtype=int64)

In [33]:
from xgboost import XGBClassifier
#XGBoost
model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=2,
    tree_method='hist',
    enable_categorical=False,
    objective='binary:logistic',
    eval_metric=['logloss', 'error'],
    use_label_encoder=True
)

In [34]:
model.fit(X_train_combined_scaled, y_train)

In [46]:
def evaluate_model(model, X_train, y_train, X_test, y_test, threshold=0.5):
    model.fit(X_train, y_train)
    y_test_pred_proba = model.predict_proba(X_test)
    y_test_pred = (y_test_pred_proba[:, 1] >= threshold).astype(int)
    
    test_metrics = {
        'accuracy': accuracy_score(y_test, y_test_pred),
        'f1_macro': f1_score(y_test, y_test_pred, average='macro')
    }
    
    print("\nTest Results:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))
    print("\nMetrics:")
    for metric, value in test_metrics.items():
        print(f"{metric}: {value:.4f}")
    
    return test_metrics

test_metrics = evaluate_model(model, X_train_combined_scaled, y_train, X_val_combined_scaled, y_val, threshold=0.3)


Test Results:

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97    164172
           1       0.46      0.45      0.45      8036

    accuracy                           0.95    172208
   macro avg       0.72      0.71      0.71    172208
weighted avg       0.95      0.95      0.95    172208


Metrics:
accuracy: 0.9495
f1_macro: 0.7128


In [42]:
Agreement_Reached = model.predict(test)
Agreement_Reached = le.inverse_transform(Agreement_Reached)
np.savetxt('test_Agreement_Reached.csv', Agreement_Reached, delimiter=',',fmt='%s')

In [40]:
%store Agreement_Reached

Stored 'Agreement_Reached' (ndarray)
