<a href="https://colab.research.google.com/github/ShaliniAnandaPhD/PIXEL-PIONEERS-TUTORIALS/blob/main/Advanages_of_Monosemanticity_in_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Notion page with a more detailed explanation: [Notion page](https://www.notion.so/shalini-ananda-phd/Unraveling-the-Mysteries-of-Machine-Learning-A-Monosemantic-Approach-fef2afbbd50d4910a62b174a87927ec2)

In [1]:
!pip install pandas numpy tensorflow scikit-learn




In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import DictionaryLearning


In [15]:
import pandas as pd

def load_data(exemption_notices_path, working_forest_path):
    """Load both CSV exports, merge them on OBJECTID and split features from labels.

    The two files are joined with ``pd.merge`` (default inner join) on
    ``OBJECTID``; column names present in both files receive the suffixes
    ``_exempt`` / ``_forest``. The label is the GIS_ACRES column of the merged
    frame, and the features are the remaining int/float columns with missing
    values replaced by 0.

    Args:
        exemption_notices_path: Path of the exemption-notices CSV.
        working_forest_path: Path of the working-forest CSV.

    Returns:
        A ``(features, labels)`` tuple: a numeric DataFrame and the GIS_ACRES
        Series taken from the merged data. Missing values in ``labels`` are
        left untouched.

    Raises:
        KeyError: If ``OBJECTID`` is missing from either file, or no GIS_ACRES
            column exists after the merge.
    """
    # Load the datasets
    exemption_notices = pd.read_csv(exemption_notices_path)
    working_forest = pd.read_csv(working_forest_path)

    # Display the column names to understand the structure
    print("Exemption Notices Columns:")
    print(exemption_notices.columns)
    print("\nWorking Forest Columns:")
    print(working_forest.columns)

    # Assuming 'OBJECTID' is the common key for merging
    common_key = 'OBJECTID'  # Replace this with the actual common key if different

    # Check if the common key exists in both dataframes
    if common_key not in exemption_notices.columns or common_key not in working_forest.columns:
        raise KeyError(f"'{common_key}' not found in one of the dataframes")

    # Merge datasets on the common key
    data = pd.merge(exemption_notices, working_forest, on=common_key, suffixes=('_exempt', '_forest'))

    # Display the merged data columns to check for GIS_ACRES
    print("Merged Data Columns:")
    print(data.columns)

    # Suffixes are only added when BOTH inputs carry GIS_ACRES; if just one of
    # them does, the merged column keeps its plain name, so accept that too.
    # The preference order matches the original behaviour (exempt before forest).
    label_candidates = ('GIS_ACRES_exempt', 'GIS_ACRES_forest', 'GIS_ACRES')
    label_column = next((name for name in label_candidates if name in data.columns), None)
    if label_column is None:
        raise KeyError("'GIS_ACRES' column not found in the merged dataframe")

    labels = data[label_column]
    features = data.drop(columns=[label_column])

    # Perform data cleaning and preprocessing
    features = features.fillna(0)  # Fill missing values with 0
    features = features.select_dtypes(include=[int, float])  # Select only numerical columns

    return features, labels

# Locations of the two CSV exports (expected under /content)
exemption_notices_path = "/content/CAL_FIRE_Exemption_Notices_All_TA83_580667334888250359.csv"
working_forest_path = "/content/CAL_FIRE_Working_Forest_Management_Plans_and_Notices_TA83_8835581698934070752.csv"

# Build the numeric feature table and the GIS_ACRES target from the merged exports
features, labels = load_data(
    exemption_notices_path=exemption_notices_path,
    working_forest_path=working_forest_path,
)

# Preview both results in one call; the blank line before "Labels:" is preserved
print("Features:", features.head(), "\nLabels:", labels.head(), sep="\n")




Exemption Notices Columns:
Index(['OBJECTID', 'REPORTD_AC', 'REGION', 'EX_YEAR', 'EX_NUM', 'COUNTY',
       'LANDOWNER', 'EX_TYPE', 'ACCEPTED', 'EXPIRATION', 'COMPLETED',
       'COMMENTS', 'GIS_ACRES', 'HD_NUM', 'GLOBALID', 'Shape__Area',
       'Shape__Length'],
      dtype='object')

Working Forest Columns:
Index(['OBJECTID', 'GIS_ACRES', 'REGION', 'WFMP_YEAR', 'WFN_YEAR', 'WFMP_NUM',
       'WFN_NUM', 'COUNTY', 'TIMBEROWNR', 'LANDOWNER', 'SILVI_1', 'SILVI_2',
       'SILVI_CAT', 'YARD', 'UNIT', 'PLAN_STAT', 'ACCEPTED', 'COMPLETED',
       'COMMENTS', 'SPATL_MOD', 'MODIFIED', 'HD_NUM', 'GLOBALID',
       'Shape__Area', 'Shape__Length'],
      dtype='object')
Merged Data Columns:
Index(['OBJECTID', 'REPORTD_AC', 'REGION_exempt', 'EX_YEAR', 'EX_NUM',
       'COUNTY_exempt', 'LANDOWNER_exempt', 'EX_TYPE', 'ACCEPTED_exempt',
       'EXPIRATION', 'COMPLETED_exempt', 'COMMENTS_exempt', 'GIS_ACRES_exempt',
       'HD_NUM_exempt', 'GLOBALID_exempt', 'Shape__Area_exempt',
       'Shape__Leng

In [16]:
from sklearn.decomposition import DictionaryLearning
import pandas as pd

# Function to map neuron activation patterns
def map_activation_patterns(features, n_components=10, random_state=42):
    """Encode the feature rows with a learned dictionary.

    Rows without any truthy (non-zero) value are removed before fitting.
    The removed rows are not reported back, so a caller that pairs the result
    with labels must apply the same ``features.any(axis=1)`` mask to them.

    Args:
        features: Numeric DataFrame, one sample per row.
        n_components: Number of dictionary atoms to learn (default 10).
        random_state: Seed forwarded to ``DictionaryLearning`` (default 42).

    Returns:
        The array returned by ``DictionaryLearning.fit_transform`` for the
        non-empty rows.

    Raises:
        ValueError: If no non-empty row is left to fit on.
    """
    # Ensure there are no empty rows
    non_empty = features[features.any(axis=1)]
    if len(non_empty) == 0:
        raise ValueError("No non-empty rows available for dictionary learning")

    # Apply dictionary learning to extract features
    dict_learner = DictionaryLearning(n_components=n_components, random_state=random_state)
    return dict_learner.fit_transform(non_empty)

# Run dictionary learning over the current feature table
mapped_features = map_activation_patterns(features=features)

# Show the first rows of the learned codes under their heading
print("Mapped Features:", pd.DataFrame(mapped_features).head(), sep="\n")



Mapped Features:
              0              1    2    3    4           5             6  \
0  0.000000e+00       0.000000  0.0  0.0  0.0     0.00000 -10060.416654   
1  0.000000e+00       0.000000  0.0  0.0  0.0 -3637.22492      0.000000   
2  7.673137e+06 -121704.095856  0.0  0.0  0.0     0.00000      0.000000   
3  1.909834e+06  -50133.525746  0.0  0.0  0.0     0.00000      0.000000   
4  0.000000e+00    4187.928254  0.0  0.0  0.0     0.00000      0.000000   

              7    8             9  
0      0.000000  0.0  44720.518974  
1  17449.408054  0.0      0.000000  
2      0.000000  0.0      0.000000  
3      0.000000  0.0      0.000000  
4  85535.818651  0.0      0.000000  


In [17]:
import tensorflow as tf
import numpy as np

def train_predictive_model(features, labels):
    """Fit a small dense regression network and print its fit metrics.

    Args:
        features: 2-D numeric data (DataFrame or array), one sample per row.
        labels: 1-D numeric targets (Series, list or array), one per row.

    Returns:
        The trained ``tf.keras`` model.

    Raises:
        ValueError: If ``features`` is not 2-D or its row count differs from
            the number of labels (for example because rows were filtered out
            of the features but not out of the labels).
    """
    # Convert to numpy arrays so pandas and plain-array inputs behave the same
    X_train = np.asarray(features, dtype=float)
    y_train = np.asarray(labels, dtype=float).reshape(-1)

    # Validate before any model is built so the error names the real problem
    if X_train.ndim != 2:
        raise ValueError(f"features must be 2-D, got shape {X_train.shape}")
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError(
            f"features has {X_train.shape[0]} rows but labels has {y_train.shape[0]} entries"
        )

    # Build and compile the model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='linear')
    ])
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

    # Predict on every row passed in. These metrics therefore cover the rows
    # used for fitting as well as those held out by validation_split; they are
    # not an estimate on unseen data.
    y_pred = model.predict(X_train).flatten()

    # Calculate metrics
    mse = np.mean((y_train - y_pred) ** 2)
    mae = np.mean(np.abs(y_train - y_pred))

    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")

    return model

# Wrap the learned codes in a DataFrame before handing them to the trainer
mapped_features_df = pd.DataFrame(data=mapped_features)

# Fit the regression network on the codes against `labels`
model = train_predictive_model(features=mapped_features_df, labels=labels)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean Squared Error: 1150168445.9265
Mean Absolute Error: 18946.3766


In [18]:
def generate_custom_response(model, data, threshold=50):
    """Label each model prediction as "High Risk" or "Low Risk".

    Args:
        model: Object exposing ``predict(data)`` that returns an array with a
            ``flatten()`` method.
        data: Input forwarded unchanged to ``model.predict``.
        threshold: A prediction strictly greater than this value is labelled
            "High Risk"; everything else is "Low Risk".

    Returns:
        A list of label strings, one per flattened prediction.
    """
    # Index 0 -> at or below the threshold, index 1 -> strictly above it
    label_for = ("Low Risk", "High Risk")

    recommendations = []
    for score in model.predict(data).flatten():
        recommendations.append(label_for[bool(score > threshold)])
    return recommendations

# Score the learned codes with the trained model and label every prediction
recommendations = generate_custom_response(model=model, data=pd.DataFrame(mapped_features))

# Print the heading and the full list of labels
print("Recommendations:", recommendations, sep="\n")


Recommendations:
['High Risk', 'Low Risk', 'Low Risk', 'Low Risk', 'Low Risk', 'High Risk', 'High Risk', 'High Risk', 'Low Risk', 'High Risk', 'High Risk', 'High Risk', 'Low Risk', 'High Risk', 'High Risk', 'Low Risk', 'High Risk', 'High Risk', 'High Risk', 'High Risk']


In [19]:
def decision_support_system(model, data, threshold=50):
    """Turn model predictions into parallel lists of decisions and alerts.

    Args:
        model: Object exposing ``predict(data)`` that returns an array with a
            ``flatten()`` method.
        data: Input forwarded unchanged to ``model.predict``.
        threshold: A prediction strictly greater than this value selects the
            high-risk decision/alert pair; otherwise the low-risk pair is used.

    Returns:
        ``(decisions, alerts)``: two lists of strings, one entry per flattened
        prediction, in prediction order.
    """
    above = ("Deploy Resources", "High Risk: Evacuate Area")
    not_above = ("Monitor Area", "Low Risk: No Immediate Action Required")

    # Pick one (decision, alert) pair per prediction, then split the pairs
    outcomes = [above if score > threshold else not_above
                for score in model.predict(data).flatten()]
    decisions = [decision for decision, _ in outcomes]
    alerts = [alert for _, alert in outcomes]

    return decisions, alerts

# Derive a decision and an alert for every prediction on the learned codes
decisions, alerts = decision_support_system(model=model, data=pd.DataFrame(mapped_features))

# Print both lists under their headings; a blank line separates the two sections
print("Decisions:", decisions, "\nAlerts:", alerts, sep="\n")


Decisions:
['Deploy Resources', 'Monitor Area', 'Monitor Area', 'Monitor Area', 'Monitor Area', 'Deploy Resources', 'Deploy Resources', 'Deploy Resources', 'Monitor Area', 'Deploy Resources', 'Deploy Resources', 'Deploy Resources', 'Monitor Area', 'Deploy Resources', 'Deploy Resources', 'Monitor Area', 'Deploy Resources', 'Deploy Resources', 'Deploy Resources', 'Deploy Resources']

Alerts:
['High Risk: Evacuate Area', 'Low Risk: No Immediate Action Required', 'Low Risk: No Immediate Action Required', 'Low Risk: No Immediate Action Required', 'Low Risk: No Immediate Action Required', 'High Risk: Evacuate Area', 'High Risk: Evacuate Area', 'High Risk: Evacuate Area', 'Low Risk: No Immediate Action Required', 'High Risk: Evacuate Area', 'High Risk: Evacuate Area', 'High Risk: Evacuate Area', 'Low Risk: No Immediate Action Required', 'High Risk: Evacuate Area', 'High Risk: Evacuate Area', 'Low Risk: No Immediate Action Required', 'High Risk: Evacuate Area', 'High Risk: Evacuate Area', 'Hig

NON MONOSEMANTIC

In [29]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout

def load_and_preprocess_data(exemption_notices_path, working_forest_path):
    """Merge the two CSV exports and return scaled features with encoded labels.

    Args:
        exemption_notices_path: Path of the exemption-notices CSV.
        working_forest_path: Path of the working-forest CSV.

    Returns:
        ``(features_scaled, labels_encoded)``: the MinMaxScaler output for the
        columns REPORTD_AC, EX_YEAR and GIS_ACRES_x (missing values replaced
        by 0), and the LabelEncoder output for PLAN_STAT.
    """
    notices_df = pd.read_csv(exemption_notices_path)
    forest_df = pd.read_csv(working_forest_path)

    # Join on OBJECTID; column names shared by both files get the default _x/_y suffixes
    merged_data = notices_df.merge(forest_df, on='OBJECTID')

    # Print the merged column names so the selection below can be checked
    print("Merged Data Columns:", merged_data.columns)

    # Numeric inputs, with gaps filled by 0
    feature_columns = ['REPORTD_AC', 'EX_YEAR', 'GIS_ACRES_x']
    features = merged_data.loc[:, feature_columns].fillna(0)

    # String plan status -> integer class ids
    labels_encoded = LabelEncoder().fit_transform(merged_data['PLAN_STAT'])

    # Rescale the inputs with MinMaxScaler
    features_scaled = MinMaxScaler().fit_transform(features)

    # Report how many samples fall into each encoded class
    print("Label distribution:", pd.Series(labels_encoded).value_counts())

    return features_scaled, labels_encoded

# Locations of the two CSV exports (expected under /content)
exemption_notices_path = "/content/CAL_FIRE_Exemption_Notices_All_TA83_580667334888250359.csv"
working_forest_path = "/content/CAL_FIRE_Working_Forest_Management_Plans_and_Notices_TA83_8835581698934070752.csv"

# Rebuild `features` and `labels` for the classification experiment
features, labels = load_and_preprocess_data(
    exemption_notices_path=exemption_notices_path,
    working_forest_path=working_forest_path,
)


Merged Data Columns: Index(['OBJECTID', 'REPORTD_AC', 'REGION_x', 'EX_YEAR', 'EX_NUM', 'COUNTY_x',
       'LANDOWNER_x', 'EX_TYPE', 'ACCEPTED_x', 'EXPIRATION', 'COMPLETED_x',
       'COMMENTS_x', 'GIS_ACRES_x', 'HD_NUM_x', 'GLOBALID_x', 'Shape__Area_x',
       'Shape__Length_x', 'GIS_ACRES_y', 'REGION_y', 'WFMP_YEAR', 'WFN_YEAR',
       'WFMP_NUM', 'WFN_NUM', 'COUNTY_y', 'TIMBEROWNR', 'LANDOWNER_y',
       'SILVI_1', 'SILVI_2', 'SILVI_CAT', 'YARD', 'UNIT', 'PLAN_STAT',
       'ACCEPTED_y', 'COMPLETED_y', 'COMMENTS_y', 'SPATL_MOD', 'MODIFIED',
       'HD_NUM_y', 'GLOBALID_y', 'Shape__Area_y', 'Shape__Length_y'],
      dtype='object')
Label distribution: 2    9
0    8
1    3
Name: count, dtype: int64


In [34]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import Dense, Dropout
import numpy as np

def load_and_preprocess_data(exemption_notices_path, working_forest_path, k_neighbors=2, random_state=42):
    """Merge the CSV exports, encode/scale the features and oversample with SMOTE.

    Args:
        exemption_notices_path: Path of the exemption-notices CSV.
        working_forest_path: Path of the working-forest CSV.
        k_neighbors: ``k_neighbors`` forwarded to SMOTE (default 2).
            NOTE(review): SMOTE presumably needs every class to have more
            samples than ``k_neighbors`` - confirm against the
            imbalanced-learn documentation before raising it.
        random_state: Seed forwarded to SMOTE (default 42).

    Returns:
        ``(features_resampled, labels_resampled)`` as returned by
        ``SMOTE.fit_resample``. Oversampling runs over all merged rows, so the
        output can contain synthetic samples.
    """
    # Load datasets
    exemption_notices = pd.read_csv(exemption_notices_path)
    working_forest_data = pd.read_csv(working_forest_path)

    # Merge datasets on 'OBJECTID'
    merged_data = pd.merge(exemption_notices, working_forest_data, on='OBJECTID')

    # Inspect the columns to choose relevant features
    print("Merged Data Columns:", merged_data.columns)

    # Select relevant columns and handle missing values.
    # fillna(0) also applies to the categorical columns below, so a missing
    # category becomes the value 0 and is one-hot encoded like any other level.
    features = merged_data[['REPORTD_AC', 'EX_YEAR', 'GIS_ACRES_x', 'WFMP_YEAR', 'WFN_YEAR', 'YARD', 'SILVI_1', 'SILVI_2']].fillna(0)
    labels = merged_data['PLAN_STAT']

    # One-hot encode categorical features
    features = pd.get_dummies(features, columns=['YARD', 'SILVI_1', 'SILVI_2'], drop_first=True)

    # Encode string labels to numerical values
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(labels)

    # Scale the features
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    # Check for data imbalance
    print("Label distribution before SMOTE:", pd.Series(labels_encoded).value_counts())

    # Apply SMOTE to balance the classes; neighbourhood size and seed come from the caller
    smote = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
    features_resampled, labels_resampled = smote.fit_resample(features_scaled, labels_encoded)

    print("Label distribution after SMOTE:", pd.Series(labels_resampled).value_counts())

    return features_resampled, labels_resampled

# Locations of the two CSV exports (expected under /content)
exemption_notices_path = "/content/CAL_FIRE_Exemption_Notices_All_TA83_580667334888250359.csv"
working_forest_path = "/content/CAL_FIRE_Working_Forest_Management_Plans_and_Notices_TA83_8835581698934070752.csv"

# Rebuild `features` and `labels`, this time including the SMOTE resampling step
features, labels = load_and_preprocess_data(
    exemption_notices_path=exemption_notices_path,
    working_forest_path=working_forest_path,
)


Merged Data Columns: Index(['OBJECTID', 'REPORTD_AC', 'REGION_x', 'EX_YEAR', 'EX_NUM', 'COUNTY_x',
       'LANDOWNER_x', 'EX_TYPE', 'ACCEPTED_x', 'EXPIRATION', 'COMPLETED_x',
       'COMMENTS_x', 'GIS_ACRES_x', 'HD_NUM_x', 'GLOBALID_x', 'Shape__Area_x',
       'Shape__Length_x', 'GIS_ACRES_y', 'REGION_y', 'WFMP_YEAR', 'WFN_YEAR',
       'WFMP_NUM', 'WFN_NUM', 'COUNTY_y', 'TIMBEROWNR', 'LANDOWNER_y',
       'SILVI_1', 'SILVI_2', 'SILVI_CAT', 'YARD', 'UNIT', 'PLAN_STAT',
       'ACCEPTED_y', 'COMPLETED_y', 'COMMENTS_y', 'SPATL_MOD', 'MODIFIED',
       'HD_NUM_y', 'GLOBALID_y', 'Shape__Area_y', 'Shape__Length_y'],
      dtype='object')
Label distribution before SMOTE: 2    9
0    8
1    3
Name: count, dtype: int64
Label distribution after SMOTE: 2    9
0    9
1    9
Name: count, dtype: int64


In [36]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import Dense, Dropout
import numpy as np

def load_and_preprocess_data(exemption_notices_path, working_forest_path, k_neighbors=1, random_state=42):
    """Merge the CSV exports, encode/scale the features and oversample with SMOTE.

    Args:
        exemption_notices_path: Path of the exemption-notices CSV.
        working_forest_path: Path of the working-forest CSV.
        k_neighbors: ``k_neighbors`` forwarded to SMOTE (default 1).
            NOTE(review): SMOTE presumably needs every class to have more
            samples than ``k_neighbors`` - confirm against the
            imbalanced-learn documentation before raising it.
        random_state: Seed forwarded to SMOTE (default 42).

    Returns:
        ``(features_resampled, labels_resampled)`` as returned by
        ``SMOTE.fit_resample``. Oversampling runs over all merged rows, so the
        output can contain synthetic samples.
    """
    # Load datasets
    exemption_notices = pd.read_csv(exemption_notices_path)
    working_forest_data = pd.read_csv(working_forest_path)

    # Merge datasets on 'OBJECTID'
    merged_data = pd.merge(exemption_notices, working_forest_data, on='OBJECTID')

    # Inspect the columns to choose relevant features
    print("Merged Data Columns:", merged_data.columns)

    # Select relevant columns and handle missing values.
    # fillna(0) also applies to the categorical columns below, so a missing
    # category becomes the value 0 and is one-hot encoded like any other level.
    features = merged_data[['REPORTD_AC', 'EX_YEAR', 'GIS_ACRES_x', 'WFMP_YEAR', 'WFN_YEAR', 'YARD', 'SILVI_1', 'SILVI_2']].fillna(0)
    labels = merged_data['PLAN_STAT']

    # One-hot encode categorical features
    features = pd.get_dummies(features, columns=['YARD', 'SILVI_1', 'SILVI_2'], drop_first=True)

    # Encode string labels to numerical values
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(labels)

    # Scale the features
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    # Check for data imbalance
    print("Label distribution before SMOTE:", pd.Series(labels_encoded).value_counts())

    # Apply SMOTE to balance the classes; neighbourhood size and seed come from the caller
    smote = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
    features_resampled, labels_resampled = smote.fit_resample(features_scaled, labels_encoded)

    print("Label distribution after SMOTE:", pd.Series(labels_resampled).value_counts())

    return features_resampled, labels_resampled

# Locations of the two CSV exports (expected under /content)
exemption_notices_path = "/content/CAL_FIRE_Exemption_Notices_All_TA83_580667334888250359.csv"
working_forest_path = "/content/CAL_FIRE_Working_Forest_Management_Plans_and_Notices_TA83_8835581698934070752.csv"

# Rebuild `features` and `labels` with the most recent preprocessing definition
features, labels = load_and_preprocess_data(
    exemption_notices_path=exemption_notices_path,
    working_forest_path=working_forest_path,
)


Merged Data Columns: Index(['OBJECTID', 'REPORTD_AC', 'REGION_x', 'EX_YEAR', 'EX_NUM', 'COUNTY_x',
       'LANDOWNER_x', 'EX_TYPE', 'ACCEPTED_x', 'EXPIRATION', 'COMPLETED_x',
       'COMMENTS_x', 'GIS_ACRES_x', 'HD_NUM_x', 'GLOBALID_x', 'Shape__Area_x',
       'Shape__Length_x', 'GIS_ACRES_y', 'REGION_y', 'WFMP_YEAR', 'WFN_YEAR',
       'WFMP_NUM', 'WFN_NUM', 'COUNTY_y', 'TIMBEROWNR', 'LANDOWNER_y',
       'SILVI_1', 'SILVI_2', 'SILVI_CAT', 'YARD', 'UNIT', 'PLAN_STAT',
       'ACCEPTED_y', 'COMPLETED_y', 'COMMENTS_y', 'SPATL_MOD', 'MODIFIED',
       'HD_NUM_y', 'GLOBALID_y', 'Shape__Area_y', 'Shape__Length_y'],
      dtype='object')
Label distribution before SMOTE: 2    9
0    8
1    3
Name: count, dtype: int64
Label distribution after SMOTE: 2    9
0    9
1    9
Name: count, dtype: int64


In [37]:
from keras.optimizers import Adam

# Build a small fully connected network over the preprocessed feature matrix.
# NOTE(review): `labels` comes from LabelEncoder over PLAN_STAT, and the recorded
# label distribution for the preprocessing cell shows three classes (0, 1, 2).
# A single sigmoid unit trained with binary_crossentropy assumes targets in {0, 1};
# confirm whether the target should be binarised or the head replaced by a
# softmax output with sparse_categorical_crossentropy. The loss/accuracy printed
# below should be read with that in mind.
non_monosemantic_model = Sequential([
    Dense(64, input_dim=features.shape[1], activation='relu'),  # input width follows the feature matrix
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # single sigmoid unit, i.e. a binary-classification head
])

# Compile with Adam at an explicit learning rate of 0.001
optimizer = Adam(learning_rate=0.001)
non_monosemantic_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train for 50 epochs; validation_split=0.2 holds out part of the rows passed in.
# `features`/`labels` are the SMOTE-resampled arrays, so the held-out part may
# include synthetic samples.
history_non_monosemantic = non_monosemantic_model.fit(features, labels, epochs=50, batch_size=16, validation_split=0.2)

# Evaluate on the same arrays that were passed to fit(), not on unseen data
loss, accuracy = non_monosemantic_model.evaluate(features, labels)
print(f"Loss: {loss}, Accuracy: {accuracy}")

# Generate custom responses
def generate_custom_response_non_monosemantic(model, data, threshold=0.5):
    """Label each prediction row as "High Risk" or "Low Risk".

    Args:
        model: Object exposing ``predict(data)``; the result is iterated row
            by row.
        data: Input forwarded unchanged to ``model.predict``.
        threshold: A row comparing strictly greater than this value is
            labelled "High Risk"; everything else is "Low Risk".

    Returns:
        A list of label strings, one per prediction row.
    """
    responses = []
    for pred in model.predict(data):
        if pred > threshold:
            responses.append("High Risk")
        else:
            responses.append("Low Risk")
    return responses

# Label every row of the resampled feature matrix
responses_non_monosemantic = generate_custom_response_non_monosemantic(
    model=non_monosemantic_model, data=features
)

# Pair a fresh set of predictions with the labels and print one line per data point
for i, (pred, resp) in enumerate(
    zip(non_monosemantic_model.predict(features), responses_non_monosemantic)
):
    print(f"Data Point {i+1}: Predicted Risk = {pred[0]:.2f}, Response = {resp}")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Loss: 0.036752503365278244, Accuracy: 0.5925925970077515
Data Point 1: Predicted Risk = 0.67, Response = High Risk
Data Point 2: Predicted Risk = 0.67, Response = High Risk
Data Point 3: Predicted Risk = 0.44, Response = Low Risk
Data Point 4: Predicted Risk = 0.45, Response = Low Risk
Data Point 5: Predicted Risk = 0.45, Response = Low Risk
Data Point 6: Predicted Risk = 0.82, Response = High Risk
Data Po