In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the glacier inventory dataset through kagglehub and keep the value it returns.
# NOTE(review): later cells read from a hard-coded '/kaggle/input/...' path rather
# than from this variable — confirm both point at the same data in this environment.
organizations_nsidcorg_glacier_inventory_path = kagglehub.dataset_download(
    'organizations/nsidcorg/glacier-inventory'
)

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
file_path = '/kaggle/input/glacier-inventory/database.csv'  # Update this with the correct file path
# low_memory=False has pandas infer each column's dtype from the whole file instead
# of chunk by chunk; this matches how the same CSV is loaded later in the notebook.
df = pd.read_csv(file_path, low_memory=False)

# Display first few rows of the dataset
print("Dataset Head:\n", df.head())

# Check for missing values column-wise
print("\nMissing Values per Column:\n", df.isnull().sum())

# Keep only the columns whose fraction of missing values is below the threshold
# (i.e. drop columns that are at least 50% empty).
threshold = 0.5
df = df.loc[:, df.isnull().mean() < threshold]

# Numeric columns: replace the remaining nulls with the column mean.
df = df.fillna(df.mean(numeric_only=True))

# Remaining (non-numeric) columns: propagate the previous valid value forward.
# `fillna(method='ffill')` is deprecated in recent pandas, so call `.ffill()` directly.
# Forward-fill cannot fill rows that come before a column's first valid value,
# so a backward-fill pass covers those leading gaps.
df = df.ffill().bfill()

# Ensure the dataset is not empty after handling missing values
if df.empty:
    raise ValueError("The dataset is empty after cleaning. Please check the data.")

# Exploratory Data Analysis (EDA)
# Scatter every glacier by its coordinates, coloured by its 'Primary Class' value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Longitude', y='Latitude', hue='Primary Class', ax=ax)
ax.set_title('Glacier Locations by Primary Class')
plt.show()

# Feature Engineering: one-hot encode the categorical columns.
# drop_first=True omits the first level of each encoded column.
df_encoded = pd.get_dummies(df[['Political Unit', 'Continent', 'Primary Class']], drop_first=True)

# Model inputs: raw coordinates plus the encoded columns (joined on df's index).
features = df[['Latitude', 'Longitude']].join(df_encoded)

# Placeholder for target (Replace this with actual High Risk vs Low Risk classification).
# The labels are random, so any metric computed from them is meaningless; a seeded
# generator is used so that the notebook at least produces the same output on
# every "Restart & Run All" (the split and the model already use random_state=42).
target = np.random.default_rng(42).choice([0, 1], size=len(df))  # Replace this with the actual target

# Verify the shape of features and target
print("Feature Shape: ", features.shape)
print("Target Shape: ", target.shape)

# Train-Test Split
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# Model Building: a 100-tree random forest fitted on the training portion.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Model Evaluation: confusion matrix followed by the per-class report.
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred),
)
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred),
)

# Feature Importance
# Label the fitted forest's importances with the feature names and chart the ten largest.
fig, ax = plt.subplots(figsize=(12, 6))
feature_importances = pd.Series(model.feature_importances_, index=features.columns)
feature_importances.nlargest(10).plot(kind='barh', ax=ax)
ax.set_title('Top 10 Important Features')
plt.show()


In [None]:
# Import necessary libraries
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point

# Read the inventory and keep only rows that have both coordinates,
# since a point cannot be built from a missing latitude or longitude.
file_path = '/kaggle/input/glacier-inventory/database.csv'  # Update this with the correct file path
df = pd.read_csv(file_path).dropna(subset=['Latitude', 'Longitude'])

# One shapely Point per row, in (x=longitude, y=latitude) order.
geometry = [
    Point(lon, lat)
    for lon, lat in zip(df['Longitude'], df['Latitude'])
]

# Attach the points to the table as its geometry column.
geo_df = gpd.GeoDataFrame(df, geometry=geometry)

# Load a world basemap.
# Older GeoPandas releases bundle 'naturalearth_lowres'; GeoPandas 1.0 removed
# `geopandas.datasets` and raises AttributeError when it is used. In that case
# fall back to the Natural Earth 110m countries archive (requires network access).
NATURAL_EARTH_URL = 'https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip'
try:
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
except AttributeError:
    world = gpd.read_file(NATURAL_EARTH_URL)

# Create ONE figure/axes pair and draw everything onto it. Calling plt.figure()
# and then world.plot(figsize=...) would open a second figure and leave the
# first one empty in the notebook output.
fig, ax = plt.subplots(figsize=(15, 10))
world.plot(ax=ax, color='lightblue', edgecolor='black')

# Plot the glaciers (latitude/longitude) on top of the world map
geo_df.plot(ax=ax, marker='o', color='red', markersize=5)

# Set plot title and labels
ax.set_title('Glacier Locations on World Map', fontsize=15)
ax.set_xlabel('Longitude', fontsize=12)
ax.set_ylabel('Latitude', fontsize=12)

# Show the plot
plt.show()


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score
from sklearn.impute import SimpleImputer  # For handling missing values
import geopandas as gpd
from shapely.geometry import Point

# Read the inventory, then discard rows lacking either coordinate — both are
# needed for the model inputs and for the map at the end of this cell.
file_path = '/kaggle/input/glacier-inventory/database.csv'  # Update this with the correct file path
df = pd.read_csv(file_path, low_memory=False)
df = df.dropna(subset=['Latitude', 'Longitude'])

# Feature Engineering: one-hot encode the categorical columns
# (drop_first=True omits the first level of each encoded column).
df_encoded = pd.get_dummies(
    df[['Political Unit', 'Continent', 'Primary Class']],
    drop_first=True,
)

# Model inputs: raw coordinates plus the encoded columns.
features = df[['Latitude', 'Longitude']].join(df_encoded)

# Data Cleaning: replace any missing feature value with that column's mean.
# The frame is rebuilt without an index argument, so `features` gets pandas'
# default index here while `df` keeps its own; rows still match by position.
imputer = SimpleImputer(strategy='mean')
features = pd.DataFrame(
    imputer.fit_transform(features),
    columns=features.columns,
)

# Placeholder: Creating a target variable for classification
# In real scenarios, use actual labels (1 = High Risk, 0 = Low Risk).
# The labels are random, so the metrics below carry no meaning; a seeded generator
# is used so that repeated runs of the notebook at least produce identical output
# (the split and the random forest already use random_state=42).
target = np.random.default_rng(42).choice([0, 1], size=len(df))  # Replace with actual High/Low Risk labels

# Train-Test Split: hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Model Building: the classifiers to compare, keyed by display name.
# SVC needs probability=True so that predict_proba is available for the ROC curve.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Support Vector Classifier": SVC(probability=True),
}

# Tick labels shared by both axes of every confusion-matrix heatmap.
risk_labels = ["Low Risk", "High Risk"]

for model_name, model in models.items():
    # Train on the training split and predict the held-out rows.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Confusion matrix, drawn as an annotated heatmap.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=risk_labels,
        yticklabels=risk_labels,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix for {model_name}')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

    # Text metrics for the same predictions.
    print(f"\nClassification Report for {model_name}:\n", classification_report(y_test, y_pred))
    print(f"\nAccuracy Score for {model_name}:\n", accuracy_score(y_test, y_pred))

    # ROC / AUC from the predicted probability of class 1.
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    auc = roc_auc_score(y_test, y_pred_prob)

    # ROC curve with the chance diagonal for reference.
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(fpr, tpr, label=f'ROC curve (area = {auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve for {model_name}')
    ax.legend(loc='lower right')
    plt.show()

# Feature importance for the fitted random forest:
# label the importances with the feature names and chart the ten largest.
fig, ax = plt.subplots(figsize=(12, 6))
feature_importances = pd.Series(
    models["Random Forest"].feature_importances_,
    index=features.columns,
)
feature_importances.nlargest(10).plot(kind='barh', ax=ax)
ax.set_title('Top 10 Important Features (Random Forest)')
plt.show()

# Identify glaciers the random forest labels as class 1 ("high risk").
# The prediction is run over every row of `features` (training rows included) and
# the matching rows of `df` are picked by position. The target this model was
# trained on is a random placeholder, so the selection is illustrative only.
high_risk_mask = models["Random Forest"].predict(features) == 1
high_risk_glaciers = df.iloc[np.flatnonzero(high_risk_mask)]

# Displaying the high-risk glaciers
print("High Risk Glaciers:\n", high_risk_glaciers[['Glacier ID', 'Glacier Name', 'Latitude', 'Longitude']])

# Visualization: one shapely Point per selected glacier, in (x=longitude, y=latitude) order.
geometry = [
    Point(lon, lat)
    for lon, lat in zip(high_risk_glaciers['Longitude'], high_risk_glaciers['Latitude'])
]

# Attach the points to the selected rows as their geometry column.
geo_df = gpd.GeoDataFrame(high_risk_glaciers, geometry=geometry)

# Load a world basemap.
# Older GeoPandas releases bundle 'naturalearth_lowres'; GeoPandas 1.0 removed
# `geopandas.datasets` and raises AttributeError when it is used. In that case
# fall back to the Natural Earth 110m countries archive (requires network access).
NATURAL_EARTH_URL = 'https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip'
try:
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
except AttributeError:
    world = gpd.read_file(NATURAL_EARTH_URL)

# Create ONE figure/axes pair and draw everything onto it. Calling plt.figure()
# and then world.plot(figsize=...) would open a second figure and leave the
# first one empty in the notebook output.
fig, ax = plt.subplots(figsize=(15, 10))
world.plot(ax=ax, color='lightblue', edgecolor='black')

# Plot the high-risk glaciers (latitude/longitude) on top of the world map
geo_df.plot(ax=ax, marker='o', color='red', markersize=5)

# Set plot title and labels
ax.set_title('High-Risk Glacier Locations on World Map', fontsize=15)
ax.set_xlabel('Longitude', fontsize=12)
ax.set_ylabel('Latitude', fontsize=12)

# Show the plot
plt.show()
