In [1]:
# importing required python libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the raw activity log, then one-hot encode its non-numeric columns.
# drop_first=True omits the first level of every encoded column.
dataset1 = pd.read_csv("User_and_Admin_Activity_Data_1.csv", index_col=None)
df2 = pd.get_dummies(dataset1, drop_first=True)

# Separate the 'IsThreat' target from the predictor columns.
dep_Y = df2['IsThreat']
indep_X = df2.drop(columns='IsThreat')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    indep_X,
    dep_Y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion only.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [3]:
# Impurity-based feature importances of the fitted random forest
importance = rf.feature_importances_

# How many of the highest-ranked features to report below
TOP_N = 10

# Pair each importance score with its column name. The names are taken from
# X_train because that is the frame the forest was fitted on.
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importance
})

# Rank the features from most to least important
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Displaying the sorted features by importance
print(feature_importance_df)

# Slice the TOP_N best-ranked rows once; the frame is already sorted, so no
# second sort is needed and the original row labels are preserved.
top_rows = feature_importance_df.head(TOP_N)
top_features = top_rows['Feature']
top_importance = top_rows['Importance']

TOP_feature_importance_df = top_rows.rename(columns={
    'Feature': 'TOP_Feature',
    'Importance': 'TOP_Importance'
})
print(f"\nTop {TOP_N} important features:")
print(TOP_feature_importance_df)

                              Feature  Importance
179            Operation_UserLoggedIn    0.095598
151            Operation_FileAccessed    0.064605
201                 Workload_OneDrive    0.058346
0                          RecordType    0.058087
265           ClientIP_203.28.245.168    0.057587
..                                ...         ...
249           ClientIP_191.37.156.206    0.000000
19   CreationDate_2025-07-18 06:57:00    0.000000
22   CreationDate_2025-07-18 07:00:00    0.000000
252           ClientIP_193.17.162.143    0.000000
100  CreationDate_2025-07-19 00:42:00    0.000000

[375 rows x 2 columns]

Top 10 important features:
                         TOP_Feature  TOP_Importance
179           Operation_UserLoggedIn        0.095598
151           Operation_FileAccessed        0.064605
201                Workload_OneDrive        0.058346
0                         RecordType        0.058087
265          ClientIP_203.28.245.168        0.057587
359  ApplicationDisplayName_Ex

In [5]:
# Permutation importance of each feature, measured on the held-out test split
from sklearn.inspection import permutation_importance

result = permutation_importance(
    rf,
    X_test,
    y_test,
    n_repeats=10,
    random_state=0,
    n_jobs=-1,
)

# Mean importance of every feature over the repeats, largest first
perm_imp_df = pd.DataFrame({
    'Feature': X_test.columns,
    'Permutation Importance': result.importances_mean,
})
perm_imp_df = perm_imp_df.sort_values('Permutation Importance', ascending=False)
print(perm_imp_df)

                                               Feature  Permutation Importance
156                            Operation_FilePreviewed                0.048148
265                            ClientIP_203.28.245.168                0.025926
151                             Operation_FileAccessed                0.008148
187    UserId_ShameemMohamed@spark1077.onmicrosoft.com                0.007407
202                  Workload_SecurityComplianceCenter                0.007407
..                                                 ...                     ...
131                   CreationDate_2025-07-19 02:06:00                0.000000
130                   CreationDate_2025-07-19 01:41:00                0.000000
129                   CreationDate_2025-07-19 01:23:00                0.000000
128                   CreationDate_2025-07-19 01:22:00                0.000000
374  LogonError_UserStrongAuthClientAuthNRequiredIn...                0.000000

[375 rows x 2 columns]


# Keeping only the columns selected by feature importance

In [6]:
# Columns retained for the reduced dataset; the last entry is the target.
columns_to_keep = [
    'Operation_FileAccessed',
    'Operation_FileModified',
    'Operation_FileUploaded',
    'Operation_FileDownloaded',
    'Operation_MoveToDeletedItems',
    'IsRiskyHour',
    'Operation_UserLoginFailed',
    'GeoLocation_IND',
    'ClientIP_203.28.245.168',
    'ClientIP_192.168.1.100',
    'ResultStatus_unknown',
    'IsThreat',
]

# Subset the encoded frame to exactly those columns, in the listed order.
df_filtered = df2[columns_to_keep]

In [9]:
# Bare expression on the cell's last line so the notebook renders df_filtered
df_filtered

Unnamed: 0,Operation_FileAccessed,Operation_FileModified,Operation_FileUploaded,Operation_FileDownloaded,Operation_MoveToDeletedItems,IsRiskyHour,Operation_UserLoginFailed,GeoLocation_IND,ClientIP_203.28.245.168,ClientIP_192.168.1.100,ResultStatus_unknown,IsThreat
0,False,False,False,False,True,1,False,True,True,False,False,True
1,False,False,False,False,False,0,False,True,False,False,True,True
2,False,False,False,False,False,1,False,True,True,False,False,False
3,False,False,False,False,False,0,False,True,False,False,True,True
4,False,False,False,False,False,1,False,True,True,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
666,False,False,False,False,False,1,False,True,True,False,False,False
667,True,False,False,False,False,1,False,True,False,False,False,True
668,False,False,False,False,False,1,True,False,False,True,False,True
669,True,False,False,False,False,1,False,True,False,False,False,True


In [11]:
# Save the reduced frame to CSV; index=False leaves the row index out of the file.
output_csv = 'Feature_Importance_Columns.csv'
df_filtered.to_csv(output_csv, index=False)