In [1]:
import pandas as pd
import numpy as np
import re #regular expressions
from sklearn.model_selection import train_test_split
# model scores:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
# hyperparameter search
from sklearn.model_selection import RandomizedSearchCV
# sampler
from imblearn.over_sampling import RandomOverSampler
#model:
from lightgbm import LGBMClassifier #run !pip install lightgbm if package not installed already




In [2]:
#read in the CSV
df = pd.read_csv('incident_event_log.csv')

In [3]:
# Replace every '?' placeholder with NaN so missing values can be handled with pandas tools.
# DataFrame.replace returns a new frame, so `df` keeps the raw data and no explicit
# copy is needed beforehand.
df_nan = df.replace('?', np.nan)

In [4]:
# Convert the 'opened_at' and 'resolved_at' columns to datetime values.
# dayfirst=True tells pandas to read ambiguous dates as day/month/year.
df_nan['opened_at']= pd.to_datetime(df_nan['opened_at'],dayfirst= True)
df_nan['resolved_at'] = pd.to_datetime(df_nan['resolved_at'], dayfirst = True)

In [5]:
# Build the binary target columns from the time each incident took to resolve.
# A row is labelled 1 when the incident is unresolved (resolved_at is missing) or when the
# whole-day part of (resolved_at - opened_at) is greater than the threshold; otherwise 0.
# One vectorised computation replaces six copy-pasted row-wise `apply` calls and
# produces the same labels.
_unresolved = df_nan['resolved_at'].isna()
# .dt.days is NaN for unresolved rows; NaN > threshold evaluates to False, so those rows
# receive their label from `_unresolved` instead.
_elapsed_days = (df_nan['resolved_at'] - df_nan['opened_at']).dt.days

# target column name -> day threshold (insertion order is the column order in df_nan)
_target_thresholds = {
    'target_colum': 2,      # 2 day target used for modelling
    'one_day_solve': 1,
    'three_day_solve': 3,
    'four_day_solve': 4,
    'five_day_solve': 5,
    'six_day_solve': 6,
}

for _target_name, _max_days in _target_thresholds.items():
    df_nan[_target_name] = (_unresolved | (_elapsed_days > _max_days)).astype('int64')


In [6]:
# Prioritise the 'New' and then the 'Active' incident state so that duplicate incident numbers collapse to one row
def new_active_incident_state(group):
    """Pick the single row that represents one incident.

    Parameters:
    group: DataFrame holding the rows of one group of the grouped data frame
    (the function is applied per group, e.g. per value of the 'number' column).

    Returns:
    The first row whose 'incident_state' is 'New'; if there is none, the first row whose
    'incident_state' is 'Active'; if neither exists, the first row of the group.
    """
    states = group['incident_state']
    for preferred_state in ('New', 'Active'):
        matches = group[states == preferred_state]
        if not matches.empty:
            return matches.iloc[0]
    # Neither 'New' nor 'Active' exists: retain the first occurrence
    return group.iloc[0]

# Collapse the event log to one row per incident: group by 'number' and let
# new_active_incident_state pick the row to keep for each group.
# reset_index(drop=True) then replaces the resulting index with a fresh default index.
df_new_active = df_nan.groupby('number',group_keys=False).apply(new_active_incident_state).reset_index(drop=True)
# View the result
#df_new_active

# NOTE(review): running this cell prints a pandas warning that echoes the groupby line above.
# group_keys=False only controls whether the group labels are added to the index of the
# apply result. The warning is presumably the deprecation of apply() operating on the
# grouping column ('number') - TODO confirm against the installed pandas version before
# changing the call.


  df_new_active = df_nan.groupby('number',group_keys=False).apply(new_active_incident_state).reset_index(drop=True)


In [7]:
display(df_new_active.value_counts('target_colum')) #2 day resolve target

target_colum
0    15095
1     9823
Name: count, dtype: int64

In [8]:
df_new_active.columns

Index(['number', 'incident_state', 'active', 'reassignment_count',
       'reopen_count', 'sys_mod_count', 'made_sla', 'caller_id', 'opened_by',
       'opened_at', 'sys_created_by', 'sys_created_at', 'sys_updated_by',
       'sys_updated_at', 'contact_type', 'location', 'category', 'subcategory',
       'u_symptom', 'cmdb_ci', 'impact', 'urgency', 'priority',
       'assignment_group', 'assigned_to', 'knowledge',
       'u_priority_confirmation', 'notify', 'problem_id', 'rfc', 'vendor',
       'caused_by', 'closed_code', 'resolved_by', 'resolved_at', 'closed_at',
       'target_colum', 'one_day_solve', 'three_day_solve', 'four_day_solve',
       'five_day_solve', 'six_day_solve'],
      dtype='object')

In [9]:
# reflects the same columns used in model 6 from incident_event_log_v2 with the exception of caused_by
day1_data_df = df_new_active[['caller_id', 'opened_at', 'opened_by', 'location',
       'category', 'subcategory', 'u_symptom', 'impact', 'urgency', 'priority',
       'assignment_group', 'assigned_to', 'knowledge',
       'u_priority_confirmation', 'contact_type', 'notify',
       'cmdb_ci', 'vendor', 'problem_id']]

In [10]:
#set X and Ys
X = day1_data_df
y = df_new_active['target_colum']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13) #try: ,test_size=.2)

Encode data to integers and fill missing NaNs:
* Convert dates to datetime before the train/test split to create the target columns
* opened_at (no NaN values): separated into 3 columns (day of week, month, year) with extract_date_features

Filled with grab_number_last/ grab_last_number_multiple_columns
* opened_by (714 NaNs)
* location (51 NaNs)
* category (39 NaNs)
* subcategory
* u_symptom
* assignment_group
* assigned_to
* problem_id (241 entries)
* cmdb_ci (51 entries)
* caller Id
* vendor (cleaned with delete_s first)(15 entries)

Filled with grab_1rst_num
* urgency
* priority
* impact

Filled w/ convert_bool_to_int
* knowledge
* u_priority_confirmation

Filled with get_dummies
* contact_type
* notify



In [11]:
def extract_date_features(df, date_column):
    """Replace a datetime column with 'year', 'month' and 'day_of_week' columns.

    Parameters:
    df: DataFrame that contains date_column
    date_column: name of a datetime column (e.g. 'opened_at')

    Returns:
    A new DataFrame without date_column and with the three derived columns added.
    The DataFrame passed in is left unmodified, which also avoids writing into a
    slice of another frame (such as the output of train_test_split).
    """
    dates = df[date_column]

    with_date_parts = df.assign(
        year=dates.dt.year,
        month=dates.dt.month,
        day_of_week=dates.dt.dayofweek,  # Monday=0, Sunday=6
    )

    # Drop the original date_column now that its parts have been extracted
    return with_date_parts.drop(columns=[date_column])

# Apply this function to both X_train and X_test
X_train = extract_date_features(X_train, 'opened_at')
X_test = extract_date_features(X_test, 'opened_at')


In [12]:
def delete_s(value):
    """
    Strip trailing 's' characters from a single value.

    Parameters: one cell value (the function is meant to be used with Series.apply)

    Returns: str(value) with every trailing 's' removed, or None when the value is missing
    call ex: df['column'] = df['column'].apply(delete_s)
    """
    if pd.isna(value):
        # Missing values come back as None
        return None

    # Remove 's' from the end of the string
    return str(value).rstrip('s')

X_train['vendor'] = X_train['vendor'].apply(delete_s)
X_test['vendor'] = X_test['vendor'].apply(delete_s)

In [13]:
def grab_last_number_multiple_columns(df, columns):
    """Convert label columns such as 'Location 204' to the integer at the end of the label.

    Parameters:
    df: data frame with the columns to be processed (the columns are overwritten in df)
    columns: list of columns to be converted

    Nested function grab_number_last(value):
    splits the text on spaces and converts the last piece to an int; NaN values and
    values whose last piece is not an integer are converted to -1

    Returns: df with the listed columns converted to numbers (-1 for NaN / unparsable values)

    call: df = grab_last_number_multiple_columns(df, list of columns)
    """

    def grab_number_last(value):
        if pd.isna(value):
            return -1  # converts NaN values to -1
        try:
            # split the string and capture the last piece
            return int(str(value).split(" ")[-1])
        except ValueError:
            # Only a failed int() conversion is treated as "no number"; the previous bare
            # `except:` also swallowed unrelated errors such as KeyboardInterrupt.
            return -1

    for column in columns:
        df[column] = df[column].apply(grab_number_last)

    return df


# Create a list of columns to be processed
columns_to_process = ['opened_by', 'location', 'category', 'subcategory', 'u_symptom', 
                      'assignment_group', 'assigned_to', 'problem_id', 'caller_id', 'cmdb_ci','vendor']

# Call the function on the training and testing set with the list of columns to be processed
X_train = grab_last_number_multiple_columns(X_train, columns_to_process)
X_test = grab_last_number_multiple_columns(X_test, columns_to_process)




In [14]:
for value in columns_to_process:
    display(X_train[value].value_counts())

opened_by
17     5436
24     1196
131     945
55      782
108     775
       ... 
515       1
88        1
233       1
230       1
420       1
Name: count, Length: 199, dtype: int64

location
204    4204
161    2997
143    2443
108    1567
93     1461
       ... 
194       1
178       1
193       1
183       1
13        1
Name: count, Length: 208, dtype: int64

category
 42    2712
 26    2453
 53    1999
 46    1661
 32    1115
 37     861
 9      839
 23     807
 20     804
 57     693
 61     599
 40     525
 45     479
 24     474
 35     407
 51     402
 28     377
 34     370
 44     249
 19     172
 43     129
 13      87
 55      74
 17      51
 8       41
 22      36
-1       31
 38      28
 41      27
 56      25
 7       22
 54      22
 50      19
 31      13
 33      12
 63       9
 5        9
 21       6
 59       5
 4        5
 62       5
 30       4
 47       4
 2        4
 29       3
 25       3
 10       3
 52       3
 16       2
 6        2
 27       1
 14       1
 12       1
 49       1
 3        1
 36       1
Name: count, dtype: int64

subcategory
174    4970
223    2790
175    1372
164     916
9       663
       ... 
24        1
165       1
124       1
264       1
95        1
Name: count, Length: 242, dtype: int64

u_symptom
 491    6877
-1      4306
 534    1260
 116     354
 506     283
        ... 
 441       1
 89        1
 37        1
 59        1
 19        1
Name: count, Length: 464, dtype: int64

assignment_group
 70    9591
 20    1024
-1      683
 25     569
 24     514
       ... 
 63       2
 13       1
 32       1
 36       1
 41       1
Name: count, Length: 69, dtype: int64

assigned_to
-1      5341
 17     1244
 13     1104
 33      476
 194     463
        ... 
 144       1
 204       1
 123       1
 27        1
 222       1
Name: count, Length: 199, dtype: int64

problem_id
-1      18512
 2         17
 14        11
 239        4
 64         4
        ...  
 21         1
 150        1
 180        1
 73         1
 218        1
Name: count, Length: 126, dtype: int64

caller_id
1904    332
290     185
4514     98
1441     46
3763     39
       ... 
5515      1
2779      1
3682      1
2767      1
1433      1
Name: count, Length: 4806, dtype: int64

cmdb_ci
-1     18651
 50        2
 28        2
 11        2
 49        2
 24        1
 16        1
 35        1
 47        1
 25        1
 39        1
 26        1
 8         1
 32        1
 4         1
 15        1
 9         1
 41        1
 42        1
 30        1
 3         1
 21        1
 36        1
 14        1
 43        1
 29        1
 37        1
 53        1
 40        1
 6         1
 38        1
 2         1
 45        1
 46        1
Name: count, dtype: int64

vendor
-1    18675
 1        8
 8        5
Name: count, dtype: int64

In [15]:
def grab_1rst_num(df, columns):
    """
    Keep only the number at the beginning of each value (e.g. '2 - Medium' -> 2).

    Parameters:
    Dataframe containing the columns to be processed (the columns are overwritten in df).
    columns (list): list of columns containing strings with number values at the beginning of the string.

    Returns:
    Dataframe with the specified columns altered to only contain the number value.
    Missing values (NaN/None) and other falsy values are passed through unchanged.
    """

    def leading_number(value):
        # pd.isna must be checked explicitly: NaN is truthy, so a plain `if value`
        # guard lets it reach int("nan"), which raises ValueError.
        if pd.isna(value) or not value:
            return value
        return int(str(value).split(" ")[0])

    for column in columns:
        df[column] = df[column].apply(leading_number)

    return df

X_train = grab_1rst_num(X_train, ['urgency', 'impact', 'priority'])
X_test = grab_1rst_num(X_test, ['urgency', 'impact', 'priority'])

In [16]:
grab_num_list = ['urgency', 'impact', 'priority']

for value in grab_num_list:
    display(X_train[value].value_counts())

urgency
2    17887
3      499
1      302
Name: count, dtype: int64

impact
2    17930
3      539
1      219
Name: count, dtype: int64

priority
3    17758
4      551
2      237
1      142
Name: count, dtype: int64

In [17]:
def convert_bool_to_int(df, columns):
    """
    Cast boolean columns to integers: True becomes 1 and False becomes 0.

    Parameters:
    DataFrame: the frame holding the boolean columns (the columns are overwritten in it).
    columns (list): names of the columns to cast.

    Returns:
    The same DataFrame with the listed columns stored as integers.
    """
    as_integers = df[columns].astype(int)
    df[columns] = as_integers
    return df

# call the function:
X_train = convert_bool_to_int(X_train, ['knowledge', 'u_priority_confirmation'])
X_test = convert_bool_to_int(X_test, ['knowledge', 'u_priority_confirmation'])


In [18]:
convert_bool_list = ['knowledge', 'u_priority_confirmation']

for value in convert_bool_list:
    display(X_train[value].value_counts())

knowledge
0    16007
1     2681
Name: count, dtype: int64

u_priority_confirmation
0    18673
1       15
Name: count, dtype: int64

In [19]:
def apply_get_dummies(train, test, columns):
    """One-hot encode `columns` identically in the train and test frames.

    Parameters:
    train, test: DataFrames that both contain `columns`
    columns (list): names of the categorical columns to encode

    Returns:
    (train, test) where `columns` are replaced by integer dummy columns. Both frames get
    the same dummy columns in the same order, and the same baseline category is dropped
    (drop_first=True) in both.

    Encoding each frame separately with drop_first=True is unsafe: if the two frames do
    not contain the same categories, each one drops a different "first" category and the
    later column alignment encodes some test rows as all zeros. Encoding the stacked rows
    once keeps the two frames consistent; when both frames already contain the same
    categories the result is the same as before.
    """
    n_train = len(train)

    # Stack the rows positionally so the split below does not depend on index labels
    stacked = pd.concat([train[columns], test[columns]], ignore_index=True)
    dummies = pd.get_dummies(stacked, drop_first=True, dtype=int)

    # Split back by position and restore each frame's original index for the concat below
    dummies_train = dummies.iloc[:n_train].copy()
    dummies_train.index = train.index
    dummies_test = dummies.iloc[n_train:].copy()
    dummies_test.index = test.index

    train = pd.concat([train.drop(columns, axis=1), dummies_train], axis=1)
    test = pd.concat([test.drop(columns, axis=1), dummies_test], axis=1)

    return train, test

X_train, X_test = apply_get_dummies(X_train, X_test, ['contact_type', 'notify'])


In [20]:
dummie_list = ['contact_type_Email', 'contact_type_IVR', 'contact_type_Phone',
       'contact_type_Self service', 'notify_Send Email']

for value in dummie_list:
    display(X_train.value_counts(value))

contact_type_Email
0    18637
1       51
Name: count, dtype: int64

contact_type_IVR
0    18680
1        8
Name: count, dtype: int64

contact_type_Phone
1    18507
0      181
Name: count, dtype: int64

contact_type_Self service
0    18569
1      119
Name: count, dtype: int64

notify_Send Email
0    18661
1       27
Name: count, dtype: int64

In [21]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18688 entries, 24589 to 338
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   caller_id                  18688 non-null  int64
 1   opened_by                  18688 non-null  int64
 2   location                   18688 non-null  int64
 3   category                   18688 non-null  int64
 4   subcategory                18688 non-null  int64
 5   u_symptom                  18688 non-null  int64
 6   impact                     18688 non-null  int64
 7   urgency                    18688 non-null  int64
 8   priority                   18688 non-null  int64
 9   assignment_group           18688 non-null  int64
 10  assigned_to                18688 non-null  int64
 11  knowledge                  18688 non-null  int32
 12  u_priority_confirmation    18688 non-null  int32
 13  cmdb_ci                    18688 non-null  int64
 14  vendor                   

In [22]:
#adds the target column
full_train = pd.concat([X_train, y_train], axis = 1)

In [23]:
#calculates the correlation of each column with the target column
full_train.corr()['target_colum'].sort_values()

month                       -0.178867
assignment_group            -0.133884
u_symptom                   -0.058287
subcategory                 -0.034210
contact_type_Phone          -0.019732
u_priority_confirmation     -0.011264
cmdb_ci                     -0.007404
location                    -0.005366
year                        -0.003887
urgency                     -0.002468
priority                     0.000071
caller_id                    0.008045
impact                       0.008105
contact_type_Self service    0.009759
vendor                       0.012142
contact_type_Email           0.014473
category                     0.018402
contact_type_IVR             0.025654
notify_Send Email            0.029856
problem_id                   0.029935
opened_by                    0.034799
day_of_week                  0.058124
assigned_to                  0.083164
knowledge                    0.129673
target_colum                 1.000000
Name: target_colum, dtype: float64

In [26]:
X_train.columns

Index(['caller_id', 'year', 'month', 'day_of_week', 'opened_by', 'location',
       'category', 'subcategory', 'u_symptom', 'impact', 'urgency', 'priority',
       'assignment_group', 'assigned_to', 'knowledge',
       'u_priority_confirmation', 'contact_type_Email', 'contact_type_IVR',
       'contact_type_Phone', 'contact_type_Self service', 'notify_Send Email',
       'cmdb_ci', 'vendor', 'problem_id'],
      dtype='object')

In [25]:

# Reorder the columns to reflect the column order used by model 6 in incident_event_log_v2.ipynb
# so that the exact same results are produced. The order is defined once and applied to
# both splits so the train and test frames cannot drift apart.
model_6_column_order = ['caller_id', 'year', 'month', 'day_of_week', 'opened_by', 'location',
       'category', 'subcategory', 'u_symptom', 'impact', 'urgency', 'priority',
       'assignment_group', 'assigned_to', 'knowledge',
       'u_priority_confirmation', 'contact_type_Email', 'contact_type_IVR',
       'contact_type_Phone', 'contact_type_Self service', 'notify_Send Email',
       'cmdb_ci', 'vendor', 'problem_id']

X_train = X_train[model_6_column_order]
X_test = X_test[model_6_column_order]

In [27]:
# Initialize the oversampler
oversampler = RandomOverSampler(random_state=13)

# Apply sampling on the training data
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)



In [28]:
train_list = [y_train, y_test, y_train_resampled]

for value in train_list:
    display(value.value_counts())


target_colum
0    11321
1     7367
Name: count, dtype: int64

target_colum
0    3774
1    2456
Name: count, dtype: int64

target_colum
0    11321
1    11321
Name: count, dtype: int64

In [40]:
def store_results(model_name, model, X_train_resampled, y_train_resampled, X_test, y_test, y_pred, training_df_name, existing_results=None):
    """
    Record one model's parameters and scores as a dictionary in a results list.

    Parameters: model_name, model, X_train_resampled, y_train_resampled, X_test, y_test, y_pred, training_df_name
    existing_results: the list returned by an earlier call. Pass it (existing_results=model_results)
    to add to that list; leave it as None to start a new list.

    The parameters and cross-validation score are read from best_params_ / best_score_ when the
    model has both attributes (e.g. a fitted RandomizedSearchCV or GridSearchCV); otherwise
    model.get_params() is stored and the cross-validation score is None.
    'Training Score' and 'Testing Score' are whatever model.score returns for the given data.

    returns: the results list (a list of dictionaries) with the new entry appended
    """
    # Start a fresh list unless the caller handed in the results collected so far
    model_results = [] if existing_results is None else existing_results

    # Search objects expose the winning parameters and their cross-validation score
    is_search_object = hasattr(model, 'best_params_') and hasattr(model, 'best_score_')
    if is_search_object:
        best_params, best_score = model.best_params_, model.best_score_
    else:
        # A plain estimator: fall back to the parameters it was configured with
        best_params, best_score = model.get_params(), None

    # Append the entry for the current model to the list
    model_results.append({
        'Model': model_name,
        'Training DF': training_df_name,
        'Best Parameters': best_params,
        'Best Cross-Validation Score': best_score,
        'Balanced Accuracy Score': balanced_accuracy_score(y_test, y_pred),
        'Training Score': model.score(X_train_resampled, y_train_resampled),
        'Testing Score': model.score(X_test, y_test),
        'Classification Report': classification_report(y_test, y_pred, output_dict=True)
    })

    # Return the updated list
    return model_results


In [41]:
def display_results(model_results):
    """Print every stored model result in a readable layout.

    Parameters:
    model_results: list of result dictionaries (as returned by store_results)

    For each entry the 'Classification Report' value is printed as a DataFrame table, the
    'Best Parameters' value is printed one parameter per line, and every other key is
    printed as "key: value". A separator line follows each model.
    """
    separator = "\n" + "="*40 + "\n"  # Separator between models

    for position, entry in enumerate(model_results, start=1):
        print(f"Model {position}:")
        for field, content in entry.items():
            if field == 'Best Parameters':
                print(f"{field}:")
                for param_name in content:
                    print(f"    {param_name}: {content[param_name]}")
                continue
            if field == 'Classification Report':
                # Convert classification report to pandas DataFrame for tabular display
                report_table = pd.DataFrame(content).transpose()
                print(f"{field}:\n", report_table)
                continue
            print(f"{field}: {content}")
        print(separator)

# Usage example:
# Call `store_results` when you want to add a new model's results
# model_results = store_results('LGBMClassifier', random_search_lgbm, X_train_resampled, y_train_resampled, X_test, y_test, y_pred_lgbm, training_df_name)

# Call `display_results` when you want to display the existing results without adding a new model
# display_results(model_results)

# Model 6: Best Model from `incident_event_log_v2.ipynb`

**Model**: LGBMClassifier  
**Training DataFrame**: `full_df`  

### Best Parameters:
| Parameter          | Value                        |
|--------------------|------------------------------|
| `reg_lambda`       | 7.0                          |
| `num_leaves`       | 80                           |
| `subsample`        | 0.8500000000000001           |
| `n_estimators`     | 450                          |
| `min_child_weight` | 8                            |
| `metric`           | binary_logloss               |
| `max_depth`        | 3                            |
| `max_bin`          | 75                           |
| `learning_rate`    | 0.1                          |
| `colsample_bytree` | 0.30000000000000004          |
| `class_weight`     | None                         |
| `boosting_type`    | gbdt                         |

### Performance:
| Metric                     | Score                 |
|-----------------------------|-----------------------|
| **Best Cross-Validation Score** | 0.7317378840708587    |
| **Balanced Accuracy Score**     | 0.7287811858610862    |
| **Training Score**              | 0.7493595972087271    |
| **Testing Score**               | 0.7287811858610862    |

### Classification Report:
| Class | Precision | Recall   | F1-Score | Support  |
|-------|-----------|----------|----------|----------|
| **0** | 0.810210  | 0.714891 | 0.759572 | 3774     |
| **1** | 0.628966  | 0.742671 | 0.681105 | 2456     |
| **Accuracy**        | 0.725843  |          |          |          |
| **Macro Avg**       | 0.719588  | 0.728781 | 0.720339 | 6230     |
| **Weighted Avg**    | 0.738760  | 0.725843 | 0.728639 | 6230     |

=======================================


In [30]:
# Hyperparameters of Model 6 (best model from incident_event_log_v2.ipynb), set by hand.
# num_leaves is 80 to match the Model 6 parameter table; it was previously typed as 10.
# NOTE(review): with max_depth=3 a tree presumably cannot use that many leaves, so this
# change is not expected to alter the fitted model - confirm against the LightGBM docs.
model_6_param = {
'subsample': 0.8500000000000001,
 'reg_lambda': 7.0,
 'num_leaves': 80,
 'n_estimators': 450,
 'min_child_weight': 8,
 'metric': 'binary_logloss',
 'max_depth': 3,
 'max_bin': 75,
 'learning_rate': 0.1,
 'colsample_bytree': 0.30000000000000004,
 'class_weight': None,
 'boosting_type': 'gbdt'
 }

In [31]:
# Fit Model 6 with its hand-set hyperparameters on the oversampled training data
lgbm_clf_6 = LGBMClassifier(random_state=13, **model_6_param)

lgbm_clf_6.fit(X_train_resampled, y_train_resampled)

# The seed is a constructor setting (given above), not a prediction option,
# so random_state is no longer passed to predict().
y_pred_lgbm_6 = lgbm_clf_6.predict(X_test)

[LightGBM] [Info] Number of positive: 11321, number of negative: 11321
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001625 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 668
[LightGBM] [Info] Number of data points in the train set: 22642, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [32]:
display(f'balanced accuracy score: {balanced_accuracy_score(y_test, y_pred_lgbm_6)}')
print(classification_report( y_test,y_pred_lgbm_6))

display(f'Training Score: {lgbm_clf_6.score(X_train_resampled, y_train_resampled)}')
display(f'Testing Score: {lgbm_clf_6.score(X_test, y_test)}')

'balanced accuracy score: 0.7287811858610862'

              precision    recall  f1-score   support

           0       0.81      0.71      0.76      3774
           1       0.63      0.74      0.68      2456

    accuracy                           0.73      6230
   macro avg       0.72      0.73      0.72      6230
weighted avg       0.74      0.73      0.73      6230



'Training Score: 0.7493595972087271'

'Testing Score: 0.7258426966292135'

In [33]:


# Define parameter ranges
# NOTE(review): np.arange with a non-integer step is subject to floating-point rounding, which is
# why values such as 0.30000000000000004 and 0.8500000000000001 appear in the reported best
# parameters; the exact number of generated values should be checked (np.linspace is the usual
# alternative) before relying on these ranges.
colsample_bytree_list = np.arange(0.2, 0.4, 0.1)
reg_lambda = [0.1, 0.5, 1.0, 3.0, 5.0, 7.0, 10.0]
# integer ranges: max_depth 2..3, max_bin 15..90 in steps of 15, n_estimators 200..450 in steps of 50
max_depth_list = np.arange(2, 4, 1, dtype=int)
max_bin_list = np.arange(15, 100, 15)
n_estimators_list = np.arange(200, 500, 50)
learning_rate_list = np.arange(0.08, 0.16, 0.02)
# min_child_weight 6..9
min_child_weight_list = np.arange(6, 10, 1, dtype=int)
# NOTE(review): every num_leaves candidate is much larger than what a tree of depth 2-3 can
# presumably hold, so this dimension may not influence the search - TODO confirm with the LightGBM docs.
num_leaves_list = [20, 31, 50, 80, 120]

# Create the parameter grid
# (used as `param_distributions` for the randomized search, which samples combinations from it)
lgbm_clf_param_grid = {
    'learning_rate': learning_rate_list,
    'max_depth': max_depth_list,
    'min_child_weight': min_child_weight_list,
    'colsample_bytree': colsample_bytree_list,
    'subsample': np.arange(0.8, .9, 0.05),  
    'reg_lambda': reg_lambda,  
    'num_leaves': num_leaves_list,
    'max_bin': max_bin_list,
    'n_estimators': n_estimators_list,  
    'boosting_type': ['gbdt', 'dart',],  
    'class_weight': [None, 'balanced'],  
    'metric': ['binary_logloss', 'auc']  
}



In [34]:

# Create the model
lgbm_clf = LGBMClassifier(random_state=13)


# Perform Random search: 100 sampled parameter combinations, scored by balanced accuracy.
# cv=None uses the default number of folds (the cell output reports 5 folds).
# NOTE(review): the search is fitted on the already-oversampled training data, so copies of the
# same row can presumably end up in both the training and validation part of a fold, which would
# make the cross-validation score optimistic - TODO consider oversampling inside each fold
# (e.g. with an imblearn Pipeline) and confirm.
random_search_lgbm = RandomizedSearchCV(estimator=lgbm_clf, param_distributions=lgbm_clf_param_grid, 
                                      n_iter=100, cv=None, n_jobs=-1, verbose=3, random_state=13, scoring='balanced_accuracy')
random_search_lgbm.fit(X_train_resampled, y_train_resampled)

# Get the best parameters
best_params_lgbm = random_search_lgbm.best_params_
# Predict on the untouched (not oversampled) test split
y_pred_lgbm = random_search_lgbm.predict(X_test)

# Label stored with the results to record which feature frame the model was trained on
training_df_name='day1_data_df'



Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Number of positive: 11321, number of negative: 11321
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000982 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 668
[LightGBM] [Info] Number of data points in the train set: 22642, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [35]:
display(f'balanced accuracy score: {balanced_accuracy_score(y_test, y_pred_lgbm)}')
print(classification_report( y_test,y_pred_lgbm))

display(f'Training Score: {random_search_lgbm.score(X_train_resampled, y_train_resampled)}')
display(f'Testing Score: {random_search_lgbm.score(X_test, y_test)}')

'balanced accuracy score: 0.7287811858610862'

              precision    recall  f1-score   support

           0       0.81      0.71      0.76      3774
           1       0.63      0.74      0.68      2456

    accuracy                           0.73      6230
   macro avg       0.72      0.73      0.72      6230
weighted avg       0.74      0.73      0.73      6230



'Training Score: 0.7493595972087271'

'Testing Score: 0.7287811858610862'

In [129]:
best_params_lgbm

{'subsample': 0.8500000000000001,
 'reg_lambda': 5.0,
 'num_leaves': 80,
 'n_estimators': 450,
 'min_child_weight': 6,
 'metric': 'binary_logloss',
 'max_depth': 3,
 'max_bin': 90,
 'learning_rate': 0.14,
 'colsample_bytree': 0.30000000000000004,
 'class_weight': 'balanced',
 'boosting_type': 'gbdt'}

In [36]:
best_lgbm = random_search_lgbm.best_estimator_
# Get the feature importance array
importances = best_lgbm.feature_importances_
# List the top 10 most important features
feature_importances = pd.Series(importances, index=X_train_resampled.columns).sort_values(ascending=False)
print("\nFeature Importances:")
print(feature_importances)


Feature Importances:
assigned_to                  674
subcategory                  337
location                     319
assignment_group             249
day_of_week                  243
category                     190
opened_by                    188
u_symptom                    134
caller_id                    127
month                         77
priority                      67
problem_id                    58
impact                        37
year                          32
knowledge                     23
contact_type_Self service     23
urgency                       14
contact_type_Phone            10
cmdb_ci                        3
contact_type_Email             2
u_priority_confirmation        0
contact_type_IVR               0
notify_Send Email              0
vendor                         0
dtype: int32


In [44]:
"""This code restarts the dictionary, only run if you want to clear the model_results list of dictionaries"""
model_results = store_results(
    'LGBMClassifier',
    random_search_lgbm,
    X_train_resampled,
    y_train_resampled,
    X_test,
    y_test,
    y_pred_lgbm,
    training_df_name)


In [46]:
"""This code appends to the existing model_results list (existing_results is passed in), so it does not clear the results stored earlier"""
model_results = store_results(
    'LGBMClassifier w/out RandomizedSearch',
    lgbm_clf_6,
    X_train_resampled,
    y_train_resampled,
    X_test,
    y_test,
    y_pred_lgbm_6,
    training_df_name,
    existing_results=model_results  #pass in the results from the previous model)
)

In [47]:
display_results(model_results)

Model 1:
Model: LGBMClassifier
Training DF: day1_data_df
Best Parameters:
    subsample: 0.8500000000000001
    reg_lambda: 7.0
    num_leaves: 80
    n_estimators: 450
    min_child_weight: 8
    metric: binary_logloss
    max_depth: 3
    max_bin: 75
    learning_rate: 0.1
    colsample_bytree: 0.30000000000000004
    class_weight: None
    boosting_type: gbdt
Best Cross-Validation Score: 0.7322670223636689
Balanced Accuracy Score: 0.7287811858610862
Training Score: 0.7493595972087271
Testing Score: 0.7287811858610862
Classification Report:
               precision    recall  f1-score      support
0              0.810210  0.714891  0.759572  3774.000000
1              0.628966  0.742671  0.681105  2456.000000
accuracy       0.725843  0.725843  0.725843     0.725843
macro avg      0.719588  0.728781  0.720339  6230.000000
weighted avg   0.738760  0.725843  0.728639  6230.000000


Model 2:
Model: LGBMClassifier w/out RandomizedSearch
Training DF: day1_data_df
Best Parameters:
    boost

## Conclusion

My final results showcase two models: one utilizing RandomizedSearch for hyperparameter tuning and the other relying on manually set parameters. Despite the difference in approach, both models produced nearly identical performance, with the key metric being a **balanced accuracy score of 73%**. The slight **2% overfit** observed between training and testing scores demonstrates that the model generalizes well, though there is still some room for optimization.

### Next Steps:
1. **Refining the Target Variable**:  
   Currently, the model predicts whether an incident ticket is resolved in 2 days or less. However, there are alternative target columns that could provide further insights, such as:
   - Binary column for incidents resolved in 1 day or less
   - Binary column for incidents resolved in 3 days or less
   - Binary column for incidents resolved in 4 days or less

   Testing these alternate targets could yield more nuanced models that address a broader range of scenarios within the dataset.

2. **Exploring Neural Networks**:  
   Neural networks, with their ability to capture complex relationships within the data, could potentially provide a significant boost in prediction accuracy. I plan to explore this avenue to further improve model performance.

Overall, this project has provided valuable hands-on experience with machine learning techniques, and I am excited to continue refining and expanding these models to further enhance their effectiveness.
