In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# Config
# The data root defaults to the original local checkout, but can be redirected by
# setting the MINERAL_DATA_DIR environment variable so the notebook is runnable on
# machines that do not share this directory layout.
DATA_DIR = os.environ.get("MINERAL_DATA_DIR", "/Users/ramiab/Desktop/Mineral-Predictions-Local")
OUTPUT_DIR = os.path.join(DATA_DIR, "Training", "data", "output")
CNN_PATH = os.path.join(OUTPUT_DIR, "CNN_output.csv")
GB_PATH = os.path.join(OUTPUT_DIR, "GB_output.csv")
LABEL_NAMES = ['AU', 'AG', 'CU', 'CO', 'NI']

# Weights for probability calculation
CNN_WEIGHT = 0.7
GB_WEIGHT = 0.3

print("Loading predictions...")
cnn_df = pd.read_csv(CNN_PATH)
gb_df = pd.read_csv(GB_PATH)

# Fail early, with a readable message, if either file lacks a column the fusion
# step reads; otherwise the problem only surfaces later as a bare KeyError.
_model_columns = [f'{label}_{kind}' for label in LABEL_NAMES for kind in ('pred', 'prob')]
_required_columns = {
    CNN_PATH: (cnn_df, ['UNIQUE_ID', 'Easting', 'Northing'] + _model_columns),
    GB_PATH: (gb_df, _model_columns),
}
for _path, (_frame, _columns) in _required_columns.items():
    _missing = [name for name in _columns if name not in _frame.columns]
    if _missing:
        raise KeyError(f"{_path} is missing required columns: {_missing}")

print("Creating fusion predictions...")

# The fusion combines the two model outputs row by row, so both tables must
# describe the same samples in the same order. Verify that instead of trusting
# the implicit row index, which would silently pair unrelated rows.
if len(cnn_df) != len(gb_df):
    raise ValueError(
        f"CNN and GB outputs have different row counts "
        f"({len(cnn_df):,} vs {len(gb_df):,}); cannot fuse them row by row."
    )

gb_aligned = gb_df
if 'UNIQUE_ID' in gb_df.columns:
    same_order = (cnn_df['UNIQUE_ID'].to_numpy() == gb_df['UNIQUE_ID'].to_numpy()).all()
    if not same_order:
        unmatched = ~cnn_df['UNIQUE_ID'].isin(gb_df['UNIQUE_ID'])
        if unmatched.any():
            raise ValueError(
                f"{int(unmatched.sum()):,} CNN rows have no matching UNIQUE_ID in the GB output."
            )
        # Same samples, different order: reorder the GB rows to follow the CNN rows.
        # (reindex raises if GB contains duplicate IDs, which would be ambiguous.)
        gb_aligned = gb_df.set_index('UNIQUE_ID').reindex(cnn_df['UNIQUE_ID']).reset_index()

# Initialize with just the columns we want to keep
fused_df = cnn_df[['UNIQUE_ID', 'Easting', 'Northing']].copy()

print("\nProcessing minerals...")
for mineral in tqdm(LABEL_NAMES, desc="Processing minerals"):
    # Positional arrays: alignment was established above, so index labels are irrelevant.
    cnn_hit = (cnn_df[f'{mineral}_pred'] == 1).to_numpy()
    gb_hit = (gb_aligned[f'{mineral}_pred'] == 1).to_numpy()

    # Fused prediction = number of models voting positive
    # (0: neither, 1: exactly one model, 2: both models)
    fused_df[f'{mineral}_pred'] = cnn_hit.astype(int) + gb_hit.astype(int)

    # Weighted probability
    fused_df[f'{mineral}_prob'] = (CNN_WEIGHT * cnn_df[f'{mineral}_prob'].to_numpy() +
                                  GB_WEIGHT * gb_aligned[f'{mineral}_prob'].to_numpy())

    # Calculate statistics
    high_conf = (fused_df[f'{mineral}_pred'] == 2).sum()
    single_pred = (fused_df[f'{mineral}_pred'] == 1).sum()
    total_pos = high_conf + single_pred

    print(f"\n{mineral} Statistics:")
    print(f"High Confidence (Both Models): {high_conf:,}")
    print(f"Single Model Prediction: {single_pred:,}")
    print(f"Total Positive Predictions: {total_pos:,}")
    print(f"Percentage Positive: {(total_pos/len(fused_df))*100:.2f}%")
    print(f"Percentage High Confidence: {(high_conf/len(fused_df))*100:.2f}%")

# Persist the fused table alongside the individual model outputs
output_path = os.path.join(OUTPUT_DIR, "output_fused.csv")
print(f"\nSaving fused predictions to {output_path}")
fused_df.to_csv(output_path, index=False)

# Closing report: completion banner followed by the columns that were written
print("\nFusion complete!", "\nFinal column order:", list(fused_df.columns), sep="\n")

Loading predictions...


python(14249) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Creating fusion predictions...

Processing minerals...


Processing minerals: 100%|████████████████████████| 5/5 [00:00<00:00, 64.60it/s]


AU Statistics:
High Confidence (Both Models): 6
Single Model Prediction: 13,261
Total Positive Predictions: 13,267
Percentage Positive: 1.20%
Percentage High Confidence: 0.00%

AG Statistics:
High Confidence (Both Models): 547
Single Model Prediction: 78,680
Total Positive Predictions: 79,227
Percentage Positive: 7.16%
Percentage High Confidence: 0.05%

CU Statistics:
High Confidence (Both Models): 31
Single Model Prediction: 19,783
Total Positive Predictions: 19,814
Percentage Positive: 1.79%
Percentage High Confidence: 0.00%

CO Statistics:
High Confidence (Both Models): 133
Single Model Prediction: 37,090
Total Positive Predictions: 37,223
Percentage Positive: 3.36%
Percentage High Confidence: 0.01%

NI Statistics:
High Confidence (Both Models): 1,273
Single Model Prediction: 55,996
Total Positive Predictions: 57,269
Percentage Positive: 5.17%
Percentage High Confidence: 0.11%

Saving fused predictions to /Users/ramiab/Desktop/Mineral-Predictions-Local/Training/data/output/output_f





Fusion complete!

Final column order:
['UNIQUE_ID', 'Easting', 'Northing', 'AU_pred', 'AU_prob', 'AG_pred', 'AG_prob', 'CU_pred', 'CU_prob', 'CO_pred', 'CO_prob', 'NI_pred', 'NI_prob']


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

# Re-read the fused table from disk so the report reflects what was saved
fused_path = os.path.join(OUTPUT_DIR, "output_fused.csv")
fused_df = pd.read_csv(fused_path)

# Per-mineral breakdown of the fused class codes (0 / 1 / 2)
print("=== MINERAL PREDICTION DISTRIBUTION ===")
for mineral in LABEL_NAMES:
    pred_codes = fused_df[f'{mineral}_pred']
    zeros, ones, twos = (pred_codes.eq(code).sum() for code in (0, 1, 2))
    total = len(fused_df)

    print(f"\n{mineral} Distribution:")
    print(f"No Prediction (0): {zeros:,} ({zeros/total*100:.2f}%)")
    print(f"Single Model (1): {ones:,} ({ones/total*100:.2f}%)")
    print(f"High Confidence (2): {twos:,} ({twos/total*100:.2f}%)")

# One map panel per mineral, laid out on a 2x3 grid
fig = plt.figure(figsize=(20, 15))

# (class code, colour, opacity, marker size), drawn in this order so the
# faint "no prediction" background ends up underneath the positive classes.
layer_styles = [
    (0, 'gray', 0.1, 1),    # neither model: small gray dots
    (1, 'red', 0.6, 20),    # single model: medium red dots
    (2, 'green', 0.8, 50),  # both models: large green dots
]

for idx, mineral in enumerate(LABEL_NAMES, 1):
    ax = fig.add_subplot(2, 3, idx)
    pred_codes = fused_df[f'{mineral}_pred']

    for code, colour, opacity, size in layer_styles:
        layer = fused_df.loc[pred_codes == code]
        ax.scatter(layer['Easting'], layer['Northing'],
                   c=colour, alpha=opacity, s=size)

    mask_2 = pred_codes == 2
    ax.set_title(f'{mineral} Predictions\nHigh Confidence: {(mask_2.sum()/len(fused_df))*100:.2f}%')
    ax.set_xlabel('Easting')
    ax.set_ylabel('Northing')

fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, 'mineral_predictions_map.png'), dpi=300, bbox_inches='tight')
plt.close(fig)

# Heatmap of the pairwise Pearson correlation between the fused probabilities
fig, ax = plt.subplots(figsize=(12, 10))

n_labels = len(LABEL_NAMES)
prob_series = [fused_df[f'{name}_prob'] for name in LABEL_NAMES]
correlation_matrix = np.zeros((n_labels, n_labels))
for i, j in np.ndindex(n_labels, n_labels):
    # pearsonr returns (coefficient, p-value); only the coefficient is kept
    correlation_matrix[i, j] = pearsonr(prob_series[i], prob_series[j])[0]

heatmap_options = dict(
    annot=True,
    xticklabels=LABEL_NAMES,
    yticklabels=LABEL_NAMES,
    cmap='RdYlBu_r',
    vmin=-1,
    vmax=1,
)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Mineral Prediction Correlations')
fig.savefig(os.path.join(OUTPUT_DIR, 'mineral_correlations.png'), dpi=300, bbox_inches='tight')
plt.close(fig)

# Calculate clustering statistics
# Imported once here rather than on every pass through the loop.
from sklearn.neighbors import NearestNeighbors

print("\n=== SPATIAL CLUSTERING ANALYSIS ===")
for mineral in LABEL_NAMES:
    # Coordinates of every point flagged by at least one model
    positive_mask = fused_df[f'{mineral}_pred'] > 0
    positive_points = fused_df.loc[positive_mask, ['Easting', 'Northing']].to_numpy()

    # A nearest-neighbour distance needs at least two points: with n_neighbors=2,
    # kneighbors raises when fewer than two samples were fitted.
    if len(positive_points) < 2:
        print(f"\n{mineral} Clustering:")
        print("Not enough positive predictions to measure nearest-neighbour distance")
        continue

    # Each point is queried against the set it belongs to, so column 0 is the
    # point itself (distance 0) and column 1 is its nearest *other* point.
    # NOTE(review): exact duplicate coordinates would also yield 0 in column 1.
    nbrs = NearestNeighbors(n_neighbors=2).fit(positive_points)
    distances, _ = nbrs.kneighbors(positive_points)
    avg_dist = distances[:, 1].mean()

    print(f"\n{mineral} Clustering:")
    # The "meters" unit assumes Easting/Northing are in metres - TODO confirm.
    print(f"Average distance between predictions: {avg_dist:.2f} meters")
    print("Number of clusters: TODO - implement DBSCAN")

# Generate summary statistics table
# Built in a single constructor call: filling an empty frame cell by cell with
# .loc creates NaN-initialised float columns, which turned the integer counts
# into floats (e.g. 13267.0). Here the count columns keep an integer dtype.
_pred_columns = [fused_df[f'{mineral}_pred'] for mineral in LABEL_NAMES]
_prob_columns = [fused_df[f'{mineral}_prob'] for mineral in LABEL_NAMES]

summary_stats = pd.DataFrame(
    {
        'Total Positives': [int((preds > 0).sum()) for preds in _pred_columns],
        'High Confidence': [int((preds == 2).sum()) for preds in _pred_columns],
        'Avg Probability': [probs.mean() for probs in _prob_columns],
        'Max Probability': [probs.max() for probs in _prob_columns],
    },
    index=LABEL_NAMES,
)

print("\n=== SUMMARY STATISTICS ===")
print(summary_stats.round(4))

# Write the per-mineral class distribution to a plain-text report
stats_path = os.path.join(OUTPUT_DIR, 'prediction_statistics.txt')
with open(stats_path, 'w') as f:
    f.write("=== MINERAL PREDICTION STATISTICS ===\n\n")
    for mineral in LABEL_NAMES:
        pred_codes = fused_df[f'{mineral}_pred']
        # Row counts for class codes 0, 1 and 2 respectively
        zeros, ones, twos = (pred_codes.eq(code).sum() for code in (0, 1, 2))
        total = len(fused_df)

        f.write(f"\n{mineral} Distribution:\n")
        f.write(f"No Prediction (0): {zeros:,} ({zeros/total*100:.2f}%)\n")
        f.write(f"Single Model (1): {ones:,} ({ones/total*100:.2f}%)\n")
        f.write(f"High Confidence (2): {twos:,} ({twos/total*100:.2f}%)\n")

=== MINERAL PREDICTION DISTRIBUTION ===

AU Distribution:
No Prediction (0): 1,094,022 (98.80%)
Single Model (1): 13,261 (1.20%)
High Confidence (2): 6 (0.00%)

AG Distribution:
No Prediction (0): 1,028,062 (92.84%)
Single Model (1): 78,680 (7.11%)
High Confidence (2): 547 (0.05%)

CU Distribution:
No Prediction (0): 1,087,475 (98.21%)
Single Model (1): 19,783 (1.79%)
High Confidence (2): 31 (0.00%)

CO Distribution:
No Prediction (0): 1,070,066 (96.64%)
Single Model (1): 37,090 (3.35%)
High Confidence (2): 133 (0.01%)

NI Distribution:
No Prediction (0): 1,050,020 (94.83%)
Single Model (1): 55,996 (5.06%)
High Confidence (2): 1,273 (0.11%)

=== SPATIAL CLUSTERING ANALYSIS ===

AU Clustering:
Average distance between predictions: 3392.20 meters
Number of clusters: TODO - implement DBSCAN

AG Clustering:
Average distance between predictions: 1587.66 meters
Number of clusters: TODO - implement DBSCAN

CU Clustering:
Average distance between predictions: 2530.48 meters
Number of clusters:

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import BallTree
import os

# Load datasets
# The data root defaults to the original local checkout, but can be redirected by
# setting the MINERAL_DATA_DIR environment variable so the cell runs on other machines.
DATA_DIR = os.environ.get("MINERAL_DATA_DIR", "/Users/ramiab/Desktop/Mineral-Predictions-Local")
_training_data_dir = os.path.join(DATA_DIR, "Training", "data")

# Fused model predictions and the rock-sample table they are compared against
fused_df = pd.read_csv(os.path.join(_training_data_dir, "output", "output_fused.csv"))
rocks_df = pd.read_csv(os.path.join(_training_data_dir, "preprocessed", "rock_features.csv"))
LABEL_NAMES = ['AU', 'AG', 'CU', 'CO', 'NI']

# Bounding-box area of the prediction grid in km². The coordinates are treated as
# metres, as the km conversions below already do (TODO confirm the CRS units).
# The value is the same for every mineral, so it is computed once.
area_total = (fused_df['Easting'].max() - fused_df['Easting'].min()) * \
            (fused_df['Northing'].max() - fused_df['Northing'].min()) / 1e6

for mineral in LABEL_NAMES:
    # Taller figure: the top row holds the two maps, the bottom holds the stats text
    fig = plt.figure(figsize=(20, 16))

    mask_zero = fused_df[f'{mineral}_pred'] == 0          # grid points with no prediction
    mask_positive = fused_df[f'{mineral}_pred'] > 0       # flagged by at least one model
    mask_anomalous = rocks_df[f'{mineral}_target'] == 1   # rock samples labelled anomalous

    # Left plot - Fused predictions over the gray background grid
    ax_pred = fig.add_subplot(2, 2, 1)
    ax_pred.scatter(fused_df.loc[mask_zero, 'Easting'],
                    fused_df.loc[mask_zero, 'Northing'],
                    c='gray', alpha=0.1, s=1)
    ax_pred.scatter(fused_df.loc[mask_positive, 'Easting'],
                    fused_df.loc[mask_positive, 'Northing'],
                    c='red', alpha=0.6, s=20,
                    label=f'Predicted Anomalous ({mask_positive.sum():,} of {len(fused_df):,} points)')
    ax_pred.set_title(f'{mineral} Model Predictions\nAnomalous Predictions: {(mask_positive.sum()/len(fused_df)*100):.2f}%')
    ax_pred.legend()
    ax_pred.axis('off')

    # Right plot - Actual rock samples over the same background
    ax_actual = fig.add_subplot(2, 2, 2)
    ax_actual.scatter(fused_df.loc[mask_zero, 'Easting'],
                      fused_df.loc[mask_zero, 'Northing'],
                      c='gray', alpha=0.1, s=1)
    ax_actual.scatter(rocks_df.loc[mask_anomalous, 'Easting'],
                      rocks_df.loc[mask_anomalous, 'Northing'],
                      c='yellow', marker='*', alpha=1, s=100,
                      label=f'Actual Anomalous ({mask_anomalous.sum():,} of {len(rocks_df):,} samples)')
    ax_actual.set_title(f'{mineral} Actual Rock Samples\nAnomalous Samples: {(mask_anomalous.sum()/len(rocks_df)*100):.2f}%')
    ax_actual.legend()
    ax_actual.axis('off')

    # Coordinate arrays used for the proximity and density statistics
    pred_positive = fused_df.loc[mask_positive, ['Easting', 'Northing']].to_numpy()
    actual_anomalous = rocks_df.loc[mask_anomalous, ['Easting', 'Northing']].to_numpy()

    stats_text = []
    stats_text.append(f"=== {mineral} Proximity Analysis ===")

    if len(pred_positive) > 0 and len(actual_anomalous) > 0:
        # For every anomalous rock sample, distance to the closest positive prediction
        tree = BallTree(pred_positive)
        distances, _ = tree.query(actual_anomalous, k=1)

        # Share of anomalous samples that have a prediction within each radius.
        # The count and the percentage are both over samples, so the label says so
        # (it previously claimed to count predictions).
        distances_km = [1, 2, 5, 10]
        for dist in distances_km:
            matches = (distances < dist*1000).sum()
            stats_text.append(f"Anomalous samples within {dist}km of a prediction: "
                            f"{matches:,} ({matches/len(actual_anomalous)*100:.2f}%)")

        avg_distance = np.mean(distances)/1000
        median_distance = np.median(distances)/1000
        stats_text.append(f"\nAverage distance to nearest prediction: {avg_distance:.2f}km")
        stats_text.append(f"Median distance to nearest prediction: {median_distance:.2f}km")

    # Density statistics, relative to the bounding box of the prediction grid
    stats_text.append("\n=== Density Analysis ===")
    stats_text.append(f"Total area covered: {area_total:.2f} km²")
    stats_text.append(f"Prediction density: {len(pred_positive)/area_total:.2f} predictions/km²")
    stats_text.append(f"Sample density: {len(actual_anomalous)/area_total:.2f} samples/km²")

    # Add statistics text to bottom of figure
    fig.text(0.1, 0.3, '\n'.join(stats_text), fontsize=12, va='top')

    fig.tight_layout()
    fig.subplots_adjust(bottom=0.4)  # Make room for stats

    # Save plot into a per-mineral folder
    mineral_dir = os.path.join(DATA_DIR, "Training", "data", "output", f"{mineral}_analysis")
    os.makedirs(mineral_dir, exist_ok=True)

    fig.savefig(os.path.join(mineral_dir, f'{mineral}_comparison.png'),
                dpi=300, bbox_inches='tight')
    # Close this specific figure so figures do not accumulate across iterations
    plt.close(fig)

print("Analysis complete! Check individual mineral folders for results.")

Analysis complete! Check individual mineral folders for results.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import rasterio
from rasterio.transform import from_origin
import os

# Load fused predictions
# The data root defaults to the original local checkout, but can be redirected by
# setting the MINERAL_DATA_DIR environment variable so the cell runs on other machines.
DATA_DIR = os.environ.get("MINERAL_DATA_DIR", "/Users/ramiab/Desktop/Mineral-Predictions-Local")
output_dir = os.path.join(DATA_DIR, "Training", "data", "output")
fused_df = pd.read_csv(os.path.join(output_dir, "output_fused.csv"))

# Create custom colormap for better visualization (blue = low, red = high)
colors = ['#313695', '#4575b4', '#74add1', '#abd9e9', '#fee090', '#fdae61', '#f46d43', '#d73027']
custom_cmap = LinearSegmentedColormap.from_list("custom", colors)

# Calculate bounds for georeferencing
x_min, x_max = fused_df['Easting'].min(), fused_df['Easting'].max()
y_min, y_max = fused_df['Northing'].min(), fused_df['Northing'].max()

# Target raster size at the chosen resolution (1km, assuming metre coordinates)
resolution = 1000
width = int((x_max - x_min) / resolution)
height = int((y_max - y_min) / resolution)
if width < 1 or height < 1:
    raise ValueError(
        f"Data extent ({x_max - x_min} x {y_max - y_min}) is smaller than one "
        f"{resolution}-unit pixel; cannot build a raster."
    )

# Render so that the image covers exactly [x_min, x_max] x [y_min, y_max]:
#  - the figure is sized to ~1 pixel per grid cell,
#  - the axes fill the whole canvas (no margins, no tight-bbox cropping),
#  - the axis limits are pinned to the data bounds.
# Without this the PNG includes autoscale padding and an arbitrary aspect ratio,
# and the georeferenced raster ends up shifted and stretched.
render_dpi = 100
fig = plt.figure(figsize=(width / render_dpi, height / render_dpi), dpi=render_dpi)
fig.set_facecolor('none')
ax = fig.add_axes([0, 0, 1, 1])
ax.set_facecolor('none')

# Create scatter plot
ax.scatter(fused_df['Easting'],
           fused_df['Northing'],
           c=fused_df['AU_prob'],
           cmap=custom_cmap,
           s=100,  # Increased point size for better visibility
           alpha=0.6,
           vmin=0,
           vmax=fused_df['AU_prob'].quantile(0.99))

# Pin the limits after plotting so autoscaling cannot widen them; markers on the
# boundary are clipped at the data bounds.
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
ax.axis('off')

# Save temporary PNG (full canvas, transparent background)
temp_file = os.path.join(output_dir, 'temp_au_heatmap.png')
fig.savefig(temp_file, dpi=render_dpi, transparent=True)
plt.close(fig)

# Create GeoTIFF
output_tiff = os.path.join(output_dir, 'au_heatmap.tif')
try:
    # Read the temporary PNG back as a (bands, rows, cols) array
    with rasterio.open(temp_file) as src:
        data = src.read()

    # Size the raster from the pixels actually rendered (float rounding can make
    # the canvas differ from width/height by a pixel) and derive the pixel size
    # from the bounds, so the image corners map exactly onto the data bounds.
    png_height, png_width = data.shape[1], data.shape[2]
    transform = from_origin(x_min, y_max,
                            (x_max - x_min) / png_width,
                            (y_max - y_min) / png_height)

    with rasterio.open(
        output_tiff,
        'w',
        driver='GTiff',
        height=png_height,
        width=png_width,
        count=4,  # RGBA
        dtype=rasterio.uint8,
        crs='EPSG:32198',
        transform=transform,
    ) as dst:
        dst.write(data)
finally:
    # Clean up temporary file even if reading or writing failed
    os.remove(temp_file)

print(f"Heatmap saved as: {output_tiff}")

  dataset = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)


Heatmap saved as: /Users/ramiab/Desktop/Mineral-Predictions-Local/Training/data/output/au_heatmap.tif


In [4]:
import pandas as pd

# Load the sample-level prediction table
df = pd.read_csv('../data/output/samples.csv')

minerals = ['AU', 'AG', 'CU', 'CO', 'NI']

# Count and share of rows with a non-zero prediction code, per mineral
print("Positive samples for each mineral:")
for mineral in minerals:
    positives = df[f'{mineral}_pred'].gt(0).sum()
    total = len(df)
    ratio = positives/total * 100
    print(f"{mineral}: {positives:,} ({ratio:.2f}%)")

Positive samples for each mineral:
AU: 13,267 (1.20%)
AG: 79,227 (7.16%)
CU: 19,814 (1.79%)
CO: 37,223 (3.36%)
NI: 57,269 (5.17%)
