Load the Dataset

In [None]:
import pandas as pd

# Read the raw catalogue export into a DataFrame.
# NOTE(review): the absolute /content path looks Colab-specific — confirm before running elsewhere.
csv_path = "/content/usgs_main.csv"
df = pd.read_csv(csv_path)

# Report the raw dimensions, then preview the first rows as the cell output.
print("Original shape:", df.shape)
df.head()


In [None]:
# A column survives only when at least half of its rows are populated;
# anything sparser is discarded.
threshold = 0.5 * len(df)
df = df.dropna(axis='columns', thresh=threshold)

print("After dropping high-NaN columns:", df.shape)


In [None]:
# Columns not used in the rest of the notebook. errors='ignore' makes the
# drop tolerant of names that are already absent (e.g. removed by the
# sparse-column filter), so no explicit existence check is needed.
drop_cols = ['id', 'updated', 'place', 'net', 'locationSource', 'magSource']
df = df.drop(columns=drop_cols, errors='ignore')

print("After dropping unused columns:", df.shape)


In [None]:
# A record is kept only if all four key fields are present.
essential_cols = ['mag', 'depth', 'latitude', 'longitude']
df = df.dropna(subset=essential_cols, how='any')

print("After dropping rows with missing key values:", df.shape)


In [None]:
# Impute what is still missing in numeric columns with each column's own mean.
# Non-numeric columns have no entry in the means Series and are left as they are.
numeric_means = df.mean(numeric_only=True)
df = df.fillna(value=numeric_means)


In [None]:
# Parse the raw 'time' strings; values that cannot be parsed become NaT
# (errors='coerce'), which in turn yields NaN in the derived features below.
timestamps = pd.to_datetime(df['time'], errors='coerce')

# Derive calendar/clock components as separate columns.
for part in ('year', 'month', 'day', 'hour'):
    df[part] = getattr(timestamps.dt, part)

# The raw 'time' column is no longer needed once the components exist.
df = df.drop(columns=['time'])

print("Time features added:", df[['year', 'month', 'day', 'hour']].head())


In [None]:
# Every remaining object-dtype column is treated as categorical.
categorical_cols = df.select_dtypes(include=['object']).columns

# Expand each into indicator columns; drop_first=True omits the first level
# of every category.
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

print("After encoding categoricals:", df.shape)


In [None]:
# Closing summary of the cleaning stage, followed by a preview of the result.
print(" Final cleaned dataset is ready!")
final_shape = df.shape
print("Final shape:", final_shape)
df.head()


Check Shape, Data Types & Missing Values

In [None]:
# Dimensions and per-column dtypes of the cleaned frame.
print("Dataset Shape:", df.shape)
column_types = df.dtypes
print("\nColumn Data Types:\n", column_types)

# Per-column count of missing values.
# NOTE(review): timestamps that failed to parse upstream would show up here as
# NaN in year/month/day/hour — confirm these counts are all zero.
missing_per_column = df.isnull().sum()
print("\nMissing Values:\n", missing_per_column)


Basic Statistical Summary

In [None]:
# Summary statistics for the numeric columns, shown as the cell output.
summary_stats = df.describe()
summary_stats


Exploratory Data Analysis (EDA)

Target Variable Distribution – mag (Magnitude)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of the target variable: 30-bin histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['mag'], bins=30, kde=True, color='green', ax=ax)
ax.set_title("Magnitude Distribution")
ax.set_xlabel("Magnitude")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()


Correlation Heatmap

In [None]:
# Pairwise correlations of all columns, drawn as an unannotated heatmap.
fig, ax = plt.subplots(figsize=(14, 10))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

Depth vs Magnitude Scatter Plot

In [None]:
# Depth against magnitude; points are semi-transparent (alpha=0.4) so dense
# regions remain readable.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x='depth', y='mag', data=df, alpha=0.4, ax=ax)
ax.set_title("Depth vs Magnitude")
ax.set_xlabel("Depth (km)")
ax.set_ylabel("Magnitude")
ax.grid(True)
plt.show()


Monthly Distribution of Earthquakes

In [None]:
# Number of records per calendar month.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='month', data=df, palette='viridis', ax=ax)
ax.set_title("Earthquake Count by Month")
ax.set_xlabel("Month")
ax.set_ylabel("Count")
plt.show()


Print Top 5 Strongest Earthquakes

In [None]:
# Rank by magnitude (largest first) and show the default head() of five rows.
report_cols = ['mag', 'latitude', 'longitude', 'depth', 'year', 'month']
strongest_first = df[report_cols].sort_values(by='mag', ascending=False)
strongest_first.head()


Epicenter Scatter Plot (Longitude vs Latitude)

In [None]:
# Map-style scatter: position is (longitude, latitude), color encodes
# magnitude and marker size encodes depth.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='longitude',
    y='latitude',
    hue='mag',
    size='depth',
    palette='viridis',
    alpha=0.7,
    legend='brief',
    ax=ax,
)
ax.set_title("Earthquake Epicenters (Color = Magnitude, Size = Depth)")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax.grid(True)
plt.show()


Parallel Preprocessing

In [None]:
# Install and import necessary libraries
!pip install pandarallel --quiet

from pandarallel import pandarallel
from joblib import Parallel, delayed
import pandas as pd

#  Initialize pandarallel
pandarallel.initialize(progress_bar=True)

# -------------------------------
# START PARALLEL PREPROCESSING
# -------------------------------

# 1. Classify magnitude into severity levels (Low/Medium/High)
def classify_severity(mag):
    """Map a magnitude value to a severity label.

    Returns "Low" for values below 4.0, "Medium" for values below 6.0 and
    "High" for everything else (including values that compare false against
    both bounds, such as NaN).
    """
    # Ordered (exclusive upper bound, label) pairs; the first bound that the
    # magnitude falls below decides the label.
    for upper_bound, label in ((4.0, "Low"), (6.0, "Medium")):
        if mag < upper_bound:
            return label
    return "High"

# Label every row's magnitude via pandarallel's parallel apply.
severity_labels = df['mag'].parallel_apply(classify_severity)
df['severity'] = severity_labels

# 2. Fill missing numeric columns with column mean
def fill_with_mean(col):
    """Replace missing values in a numeric column with that column's mean.

    Parameters
    ----------
    col : pandas.Series
        A single DataFrame column.

    Returns
    -------
    pandas.Series
        ``col`` itself when it is non-numeric or has no missing values;
        otherwise a new Series with missing entries replaced by ``col.mean()``.
    """
    # This function is applied to every column of the frame. Text columns have
    # no meaningful mean and calling .mean() on them raises, so they are
    # passed through untouched.
    if not pd.api.types.is_numeric_dtype(col):
        return col
    if col.isnull().any():
        return col.fillna(col.mean())
    return col

def _normalise_text(value):
    """Strip surrounding whitespace and lowercase strings; pass other values through."""
    if not isinstance(value, str):
        return value
    return value.strip().lower()


# Run the mean-imputation helper over every column in parallel.
df = df.parallel_apply(fill_with_mean)

# 3. Clean string columns: strip and lowercase
text_columns = df.select_dtypes(include='object').columns
for col in text_columns:
    df[col] = df[col].parallel_apply(_normalise_text)

# 4.  One-hot encode categorical columns in parallel using joblib
def encode_column(col, frame=None):
    """One-hot encode a single column and return the indicator columns.

    Parameters
    ----------
    col : str
        Name of the column to encode; also used as the prefix of the
        generated indicator columns.
    frame : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, which keeps existing ``encode_column(col)`` calls
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        Indicator columns named ``<col>_<level>``, with the first level
        dropped (``drop_first=True``).
    """
    source = df if frame is None else frame
    return pd.get_dummies(source[col], prefix=col, drop_first=True)

# Names of the columns that are still object-typed at this point.
cat_cols = list(df.select_dtypes(include='object').columns)

# One joblib task per categorical column, spread over all available workers.
encoding_jobs = [delayed(encode_column)(name) for name in cat_cols]
encoded_parts = Parallel(n_jobs=-1)(encoding_jobs)

# Replace the original categorical columns with their indicator expansions.
non_categorical = df.drop(columns=cat_cols)
df = pd.concat([non_categorical, *encoded_parts], axis=1)

# -----------------------------
#  Done with Parallel Processing
# -----------------------------

# Sanity report after the parallel stage: dimensions and remaining NaN counts.
print(" Parallel preprocessing complete!")
processed_shape = df.shape
print("Final Shape:", processed_shape)
remaining_missing = df.isnull().sum()
print("\nMissing Values:\n", remaining_missing)


In [None]:
# Show output in tabular format
import pandas as pd

# Rich (tabular) preview of the first five processed rows.
print("Preview of Cleaned and Parallel Processed Data:")
preview_rows = df.head()
display(preview_rows)  # display() is provided by the Jupyter/Colab runtime


Model Training

In [None]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y).
# 'mag' is the regression target. The one-hot 'severity_*' columns were
# computed directly from 'mag' (Low/Medium/High bins), so keeping them in X
# would leak the target into the features and inflate every test score.
leaky_cols = [col for col in df.columns if str(col).startswith('severity')]
X = df.drop(columns=['mag'] + leaky_cols)
y = df['mag']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

print(" Train-Test Split Done:")
print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so no test information reaches the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature Scaling Completed")


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import time

# Time the construction and fit of a 100-tree forest with n_jobs=-1
# (fitting is spread over all available cores).
start = time.time()
rf_parallel = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42).fit(
    X_train_scaled, y_train
)
parallel_time = time.time() - start

# Score on the held-out split; prediction is outside the timed region.
y_pred_parallel = rf_parallel.predict(X_test_scaled)
mse_parallel = mean_squared_error(y_test, y_pred_parallel)
r2_parallel = r2_score(y_test, y_pred_parallel)

for report_line in (
    "Parallel Model Results",
    f"Training Time: {parallel_time:.2f} seconds",
    f"Test MSE: {mse_parallel:.4f}",
    f"Test R² (Accuracy): {r2_parallel:.4f}",
):
    print(report_line)


In [None]:
# Same forest configuration as the parallel run, but restricted to a single
# worker (n_jobs=1) so the two training times can be compared.
start = time.time()
rf_seq = RandomForestRegressor(n_estimators=100, n_jobs=1, random_state=42).fit(
    X_train_scaled, y_train
)
seq_time = time.time() - start

# Score on the held-out split; prediction is outside the timed region.
y_pred_seq = rf_seq.predict(X_test_scaled)
mse_seq = mean_squared_error(y_test, y_pred_seq)
r2_seq = r2_score(y_test, y_pred_seq)

for report_line in (
    "Sequential Model Results",
    f"Training Time: {seq_time:.2f} seconds",
    f"Test MSE: {mse_seq:.4f}",
    f"Test R² (Accuracy): {r2_seq:.4f}",
):
    print(report_line)


In [None]:
import matplotlib.pyplot as plt

# Both forests use the same n_estimators and random_state, so their R² is
# expected to coincide; n_jobs only changes how long fitting takes. The
# training time is therefore plotted next to the score so the actual
# parallel-vs-sequential difference is visible.
run_labels = ['Parallel', 'Sequential']
fig, (ax_r2, ax_time) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: test-set R² of each run.
ax_r2.bar(run_labels, [r2_parallel, r2_seq], color=['green', 'orange'])
ax_r2.set_title("R² Accuracy Comparison (Test Set)")
ax_r2.set_ylabel("R² Score")
ax_r2.grid(True)

# Right panel: wall-clock training time measured in the two training cells.
ax_time.bar(run_labels, [parallel_time, seq_time], color=['green', 'orange'])
ax_time.set_title("Training Time Comparison")
ax_time.set_ylabel("Time (seconds)")
ax_time.grid(True)

fig.tight_layout()
plt.show()


In [None]:
import joblib

# Persist the forest trained with n_jobs=-1.
joblib.dump(rf_parallel, "random_forest_parallel_model.pkl")
print("Model saved as 'random_forest_parallel_model.pkl'")

# The forest was fitted on StandardScaler-transformed features, so the fitted
# scaler has to be saved as well; without it the saved model cannot be
# applied to raw feature values later.
joblib.dump(scaler, "random_forest_feature_scaler.pkl")
print("Scaler saved as 'random_forest_feature_scaler.pkl'")


XGBoost Training: CPU vs GPU

In [None]:

!pip install xgboost --quiet

#  Import Libraries
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time
import matplotlib.pyplot as plt
import pandas as pd

# -----------------------------------------
#  Step 1: Prepare Features & Target
# 'mag' is the regression target. The one-hot 'severity_*' columns were
# computed directly from 'mag' (Low/Medium/High bins), so keeping them in X
# would leak the target into the features and inflate every test score.
leaky_cols = [col for col in df.columns if str(col).startswith('severity')]
X = df.drop(columns=['mag'] + leaky_cols)
y = df['mag']

# Split into training and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------------------
#  Step 2: Normalize Features
# The scaler is fitted on the training split only and then applied to both.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------------------
#  Step 3: Convert to XGBoost DMatrix
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

# -----------------------------------------
#  Step 4: Set Shared Parameters
# Booster settings shared by the CPU and GPU runs: tree depth, learning rate
# (eta), squared-error regression objective and RMSE as evaluation metric.
params_common = dict(
    max_depth=6,
    eta=0.1,
    objective='reg:squarederror',
    eval_metric='rmse',
)

# -----------------------------------------
#  Step 5: Train on CPU
# Shared parameters plus the histogram-based tree builder.
params_cpu = {**params_common, 'tree_method': 'hist'}

# Only the 100-round training call is timed.
start = time.time()
model_cpu = xgb.train(params_cpu, dtrain, num_boost_round=100)
cpu_time = time.time() - start

# Evaluate on the held-out DMatrix.
y_pred_cpu = model_cpu.predict(dtest)
mse_cpu = mean_squared_error(y_test, y_pred_cpu)
r2_cpu = r2_score(y_test, y_pred_cpu)

for report_line in (
    " CPU XGBoost Results",
    f"Training Time: {cpu_time:.2f} seconds",
    f"MSE: {mse_cpu:.4f}",
    f"R² Score: {r2_cpu:.4f}",
):
    print(report_line)

# -----------------------------------------
# Step 6: Train on GPU
# The GPU run must use exactly the same booster settings and number of
# boosting rounds as the CPU run (100); otherwise differences in time and
# accuracy reflect the configuration rather than the device.
try:
    params_gpu = params_common.copy()
    params_gpu['tree_method'] = 'hist'  # Correct method in XGBoost >= 2.0
    params_gpu['device'] = 'cuda'       # Selects the GPU in XGBoost >= 2.0

    # Only the training call is timed, mirroring the CPU measurement.
    start = time.time()
    model_gpu = xgb.train(params_gpu, dtrain, num_boost_round=100)
    gpu_time = time.time() - start

    # Predict & evaluate
    y_pred_gpu = model_gpu.predict(dtest)
    mse_gpu = mean_squared_error(y_test, y_pred_gpu)
    r2_gpu = r2_score(y_test, y_pred_gpu)

    print("\n GPU XGBoost Results")
    print(f"Training Time: {gpu_time:.2f} seconds")
    print(f"MSE: {mse_gpu:.4f}")
    print(f"R² Score: {r2_gpu:.4f}")

except Exception as e:
    # Best-effort: without a usable GPU the notebook continues with the CPU
    # results only. Every GPU result name is reset so later cells can test
    # for None instead of hitting an undefined (or stale) variable.
    print("\n GPU training failed or not supported.")
    print(str(e))
    gpu_time = None
    r2_gpu = None
    mse_gpu = None
    y_pred_gpu = None


In [None]:
# Step 7: Accuracy & Time Comparison Graphs

# The GPU bar is included only when the GPU run produced a score.
gpu_available = r2_gpu is not None
labels = ['CPU', 'GPU'] if gpu_available else ['CPU']
r2_scores = [r2_cpu, r2_gpu] if gpu_available else [r2_cpu]
times = [cpu_time, gpu_time] if gpu_available else [cpu_time]

# One bar chart per metric: (bar heights, bar colors, title, y-axis label).
chart_specs = [
    (r2_scores, ['blue', 'purple'], "R² Score Comparison (XGBoost)", "R² Score"),
    (times, ['orange', 'green'], "Training Time Comparison (XGBoost)", "Time (seconds)"),
]
for heights, bar_colors, chart_title, y_label in chart_specs:
    plt.figure(figsize=(6, 4))
    plt.bar(labels, heights, color=bar_colors)
    plt.title(chart_title)
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()

# -----------------------------------------

In [None]:
import numpy as np

def _plot_actual_vs_predicted(predicted, point_color, chart_title):
    """Scatter predictions against y_test with a dashed y = x reference line."""
    plt.figure(figsize=(6, 5))
    plt.scatter(y_test, predicted, color=point_color, alpha=0.6, label='Predicted')
    # The diagonal spans the observed range of the true values.
    lowest, highest = y_test.min(), y_test.max()
    plt.plot([lowest, highest], [lowest, highest], 'k--', lw=2, label='Perfect Fit')
    plt.xlabel("Actual Magnitude")
    plt.ylabel("Predicted Magnitude")
    plt.title(chart_title)
    plt.legend()
    plt.grid(True)
    plt.show()


# ----------------------------------------------------
#  Plot Actual vs Predicted for CPU
_plot_actual_vs_predicted(y_pred_cpu, 'blue', "CPU Model: Actual vs Predicted")

# ----------------------------------------------------
#  Plot Actual vs Predicted for GPU (only when the GPU run produced results)
if r2_gpu is not None:
    _plot_actual_vs_predicted(y_pred_gpu, 'green', "GPU Model: Actual vs Predicted")


In [None]:
"widgets": {
  "application/vnd.jupyter.widget-view+json": {
    "version_major": 2,
    "version_minor": 0
  },
  "state": {}
}
