In [33]:
!pip install imbalanced-learn



In [34]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [35]:
# Parse the synthetic cluster-point-process event log and keep both the tree
# and its root element available to later cells.
xml_source = 'research-summer-24/data-generation/cluster_point/SYNTH_OUTPUT2_cluster_point_process.xml'
tree = ET.parse(xml_source)
root = tree.getroot()

In [36]:
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Step 1: Read the XML event log and turn it into a DataFrame.
# Experimenter's note: the training set needs well over 300 elements (low 1000s).
tree = ET.parse("research-summer-24/data-generation/cluster_point/SYNTH_OUTPUT2_cluster_point_process.xml")
root = tree.getroot()

# One record per <event> element found anywhere below the root. 'time' is
# read as an integer, 'x'/'y' as floats, and 'type' (the prediction target)
# is kept as the raw attribute string.
events = [
    {
        'time': int(node.get('time')),
        'x': float(node.get('x')),
        'y': float(node.get('y')),
        'type': node.get('type'),
    }
    for node in root.findall(".//event")
]

# One row per event, in the order the elements were found.
df = pd.DataFrame(events)

# Step 2: Gap between each event and the one before it. The first event has
# no predecessor, so its gap is filled with 0.
df['time_diff'] = df['time'].diff().fillna(0)

# Step 3: Turn the event sequence into supervised samples. Each sample holds
# the n-1 calls preceding a target call as features and the target call's
# type as the label.
n = 5  # Number of calls to consider (4 previous, 1 target)
history = n - 1
per_call_fields = ('time_diff', 'x', 'y', 'type')

processed_data = []
for target_pos in range(history, len(df)):
    sample = {}

    # Slots are numbered 1..n-1, from the oldest call in the window to the
    # call immediately before the target.
    for slot, source_pos in enumerate(range(target_pos - history, target_pos), start=1):
        earlier_call = df.iloc[source_pos]
        for field in per_call_fields:
            sample[f'{field}_{slot}'] = earlier_call[field]

    # Label: the type of the call that follows the window.
    sample['type'] = df.iloc[target_pos]['type']
    processed_data.append(sample)

# Step 4: Assemble the samples into a DataFrame and one-hot encode the type
# of every previous call (the label column 'type' is left untouched).
df_transformed = pd.DataFrame(processed_data)
type_feature_names = [f'type_{slot}' for slot in range(1, n)]
df_transformed = pd.get_dummies(df_transformed, columns=type_feature_names)

# Step 5: Separate features and target variable, convert target to categorical
X = df_transformed.drop(columns=['type'])
y = df_transformed['type'].astype('category').cat.codes  # Encodes target as integer class codes

# Split data into training and test sets (seeded, so the split is repeatable).
# NOTE(review): the samples are overlapping sliding windows over one event
# sequence, so a shuffled split places neighbouring windows on both sides of
# the split. Consider a chronological split if that matters for the study.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Number of calls in training data: {len(X_train)}")
print(f"Number of calls in test data: {len(X_test)}")

# Apply undersampling to balance classes. The sampler draws rows at random,
# so it is seeded as well; otherwise the balanced training set (and every
# metric computed from it) changes from run to run.
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = undersample.fit_resample(X_train, y_train)

# Hyper-parameter search, currently disabled in favour of the fixed model
# trained further down. Kept as comments for reference.
#
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2],
#     'max_features': ['sqrt', 'log2']
# }
#
# # Perform grid search
# grid_search = GridSearchCV(RandomForestClassifier(random_state=42, class_weight='balanced'),
#                            param_grid, cv=3, scoring='accuracy', n_jobs=-1)
# grid_search.fit(X_train_resampled, y_train_resampled)
#
# # Use the best model
# rf_model = grid_search.best_estimator_
# print(f"Best Parameters: {grid_search.best_params_}")

# Check the class distribution after undersampling
print(f"Class distribution after undersampling: {Counter(y_train_resampled)}")

# Step 6: Train the RandomForestClassifier on the balanced training set
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Evaluate the model on the (unbalanced) test set
y_pred = rf_model.predict(X_test)
score = accuracy_score(y_test, y_pred)
# Experimenter's notes: compare against the known frequency of the different
# classes; record the balancing in the experimenter's notebook; the size of
# the training set after balancing has to be much larger than the number of
# parameters.
print(f"Test set accuracy: {score:.4f}")

# Map integer class codes back to type names. The mapping is derived from
# df_transformed['type'], the same column the codes in y were computed from,
# so the two cannot drift apart if a type occurs only in the first n-1 raw
# events (which never become targets).
type_mapping = dict(enumerate(df_transformed['type'].astype('category').cat.categories))
print(type_mapping)

class_codes = list(type_mapping)
class_names = [str(name) for name in type_mapping.values()]

# Passing labels= pins the rows/columns to the full set of classes, so the
# report does not fail (and the matrix keeps its shape) when a class happens
# to be absent from the test split or from the predictions.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_codes, target_names=class_names))

# Print confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_codes)
print(conf_matrix)

# Prediction function for the next call type
def predict_next_call(df, model, n=5, feature_columns=None):
    """Predict the type of the call that would follow the last row of ``df``.

    The feature row mirrors the training layout: for each of the last
    ``n - 1`` calls (numbered 1..n-1 from oldest to newest) it holds
    ``time_diff_k``, ``x_k``, ``y_k`` and a one-hot encoding of ``type_k``.

    Args:
        df: Event frame with 'time_diff', 'x', 'y' and 'type' columns; only
            its last ``n - 1`` rows are read.
        model: Fitted estimator exposing ``predict``.
        n: Window size used during training (``n - 1`` previous calls).
        feature_columns: Ordered column names the model was trained on. When
            omitted, ``model.feature_names_in_`` is used if the model has it,
            otherwise the notebook-level training frame ``X``.

    Returns:
        The first element of ``model.predict(...)``; for the model trained in
        this notebook that is an integer class code.

    Raises:
        IndexError: if ``df`` has fewer than ``n - 1`` rows.
    """
    window = n - 1
    if len(df) < window:
        raise IndexError(
            f"need at least {window} previous calls to build features, got {len(df)}"
        )

    # Resolve the training column layout without silently depending on a
    # notebook global: explicit argument first, then what the model recorded.
    if feature_columns is None:
        feature_columns = getattr(model, 'feature_names_in_', None)
    if feature_columns is None:
        feature_columns = X.columns  # fallback: notebook-level training frame

    # Extract the last n-1 calls to construct features for prediction
    features = {}
    recent_calls = df.iloc[len(df) - window:]
    for slot, (_, call) in enumerate(recent_calls.iterrows(), start=1):
        features[f'time_diff_{slot}'] = call['time_diff']
        features[f'x_{slot}'] = call['x']
        features[f'y_{slot}'] = call['y']
        features[f'type_{slot}'] = call['type']

    # Convert to DataFrame and one-hot encode. A single row only yields the
    # dummy columns for the types it contains, so reindex to the training
    # layout: missing columns are filled with 0, unknown ones are dropped.
    next_call_features = pd.DataFrame([features])
    next_call_features = pd.get_dummies(
        next_call_features, columns=[f'type_{slot}' for slot in range(1, n)]
    )
    next_call_features = next_call_features.reindex(columns=list(feature_columns), fill_value=0)

    # Predict the next call type
    prediction = model.predict(next_call_features)
    return prediction[0]

# Example usage: predict the type of the call that would follow the last
# recorded event.
predicted_type = predict_next_call(df, rf_model, n)

# The model returns an integer class code; translate it back to the type name
# so the printed result can be read without consulting the mapping. Falls
# back to the raw code if the mapping has no entry for it.
predicted_label = type_mapping.get(int(predicted_type), predicted_type)
print(f"Predicted type for the next call: {predicted_label} (class code {predicted_type})")

# Class counts in the training split before undersampling, keyed by class code
# (see type_mapping for the names).
class_frequencies = Counter(y_train)
print(class_frequencies)
# TODO (experimenter's note): understand what those numbers mean, interpret the confusion matrix

## TODO: keep the grid search commented out, use 200 estimators, and significantly increase the size of the training set.
## TODO: seed the random number generator used for the train/test split so that the test set is never a subset of the training set.

Number of calls in training data: 1055
Number of calls in test data: 264
Class distribution after undersampling: Counter({0: 159, 1: 159, 2: 159})
Test set accuracy: 0.5303
{0: 'EMS', 1: 'Fire', 2: 'Law'}

Classification Report:
              precision    recall  f1-score   support

         EMS       0.40      0.59      0.48        63
        Fire       0.35      0.60      0.44        42
         Law       0.77      0.49      0.60       159

    accuracy                           0.53       264
   macro avg       0.51      0.56      0.51       264
weighted avg       0.62      0.53      0.55       264


Confusion Matrix:
[[37 12 14]
 [ 8 25  9]
 [47 34 78]]
Predicted type for the next call: 1
Counter({2: 635, 0: 261, 1: 159})
