In [None]:
# Import necessary libraries
import obspy
import pandas as pd
import matplotlib.pyplot as plt

# Input files: both paths share the same file stem and differ only in extension.
# Make sure each path points to a real file in your data folder.
miniseed_file_path = 'data/lunar/training/data/S12_GradeA/xa.s12.00.mhz.1970-01-19HR00_evid00002.mseed'  # Assuming miniseed extension
csv_file_path = 'data/lunar/training/data/S12_GradeA/xa.s12.00.mhz.1970-01-19HR00_evid00002.csv'  # Check the correct path and file extension

# --- MiniSEED (seismic waveform) ---
# obspy.read is only meant for miniseed-format files; the result is kept in `st`
# because the filtering cell below copies it. A missing file is reported, not raised,
# so `st` stays undefined in that case.
try:
    st = obspy.read(miniseed_file_path)
    st.plot()  # Visual check of the seismic trace
except FileNotFoundError as e:
    print(f"File not found: {e}")

# --- CSV (tabular version) ---
# The table is kept in `df`, which every later analysis cell reads.
try:
    df = pd.read_csv(csv_file_path)
    print(df.head())  # Confirm the data loaded as expected
    df.plot()  # Quick look for trends or anomalies
    plt.show()
except FileNotFoundError as e:
    print(f"File not found: {e}")


In [None]:
# Apply a bandpass filter between 0.5 Hz and 3 Hz (the freqmin/freqmax values passed below).
# The filter is applied to a copy so the stream loaded into `st` is not touched by this cell.
filtered_data = st.copy()
filtered_data.filter('bandpass', freqmin=0.5, freqmax=3)
filtered_data.plot()


In [None]:
# Later cells index `df` by exact column label, so list the labels that were actually loaded.
print(df.columns)  # Display the available columns


In [None]:
# Parse the absolute-time column into pandas datetimes.
# The column label itself spells out the timestamp format, which is passed explicitly.
time_abs_col = 'time_abs(%Y-%m-%dT%H:%M:%S.%f)'
parsed_times = pd.to_datetime(df[time_abs_col], format='%Y-%m-%dT%H:%M:%S.%f')
df[time_abs_col] = parsed_times

# Print the first rows to confirm the conversion worked
print(df.head())


In [None]:
# Line plot of the raw velocity against absolute time (explicit figure/axes interface)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['time_abs(%Y-%m-%dT%H:%M:%S.%f)'], df['velocity(m/s)'], color='blue')
ax.set_title('Lunar Seismic Velocity Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Velocity (m/s)')
ax.grid(True)
plt.show()


In [None]:
# Smooth the velocity with a 10-sample rolling mean and store it as a new column.
# Rows at the start, where the window is not yet full, come out as NaN.
velocity = df['velocity(m/s)']
df['velocity_smoothed'] = velocity.rolling(window=10).mean()

# Line plot of the smoothed velocity against absolute time
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['time_abs(%Y-%m-%dT%H:%M:%S.%f)'], df['velocity_smoothed'], color='red')
ax.set_title('Smoothed Lunar Seismic Velocity Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Smoothed Velocity (m/s)')
ax.grid(True)
plt.show()


Data Preprocessing

In [20]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Remove every row that has a missing value in any column.
# NOTE: this rebinds `df`, so later cells see the reduced frame.
df = df.dropna()

# Target: velocity(m/s). Single feature: time_rel(sec), kept 2-D via the double brackets.
y = df['velocity(m/s)']
X = df[['time_rel(sec)']]

# Random 80/20 train/test split with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature: statistics are learned from the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Model Selection

In [None]:
# Import Random Forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Random forest regressor: 100 trees, fixed seed, trained on the scaled training feature
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predict the held-out rows
y_pred = rf_model.predict(X_test_scaled)

# Score the predictions against the unscaled test target
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Model Visualization

In [None]:
# Overlay actual (blue) and predicted (red, semi-transparent) test-set values
# against the unscaled time feature.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test, y_test, color='blue', label='Actual Values')
ax.scatter(X_test, y_pred, color='red', label='Predicted Values', alpha=0.5)
ax.set_title('Actual vs Predicted Seismic Velocity')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Velocity (m/s)')
ax.legend()
ax.grid(True)
plt.show()


The predicted value is always 0; we need to figure out why.

Check the variance in velocity(m/s)
Let's check if the velocity(m/s) values are small, which might explain why the model is predicting close to 0.

In [None]:
# Summary statistics of the velocity column, to judge how small its values are
velocity = df['velocity(m/s)']
velocity_stats = [
    ("Mean of velocity(m/s):", velocity.mean()),
    ("Variance of velocity(m/s):", velocity.var()),
    ("Minimum value of velocity(m/s):", velocity.min()),
    ("Maximum value of velocity(m/s):", velocity.max()),
]
for label, value in velocity_stats:
    print(label, value)

Scale the Target Variable (velocity(m/s)) and Train the Model Again:

In [None]:
# Scale the features and the target with SEPARATE scalers.
# Refitting one StandardScaler object first on X and then on y leaves it holding only
# the target statistics, so any later `scaler.transform(X)` would silently use the wrong
# mean/variance. Two objects make the pairing explicit.
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

# Scaling features (statistics come from the training rows only)
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

# Scaling the target variable (velocity); StandardScaler needs a 2-D column
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Later cells call `scaler.inverse_transform(...)` on predictions, so keep that
# name bound to the target scaler (this matches what the name held before).
scaler = target_scaler

# Train the Random Forest model on the scaled data (ravel() restores the 1-D target)
rf_model.fit(X_train_scaled, y_train_scaled.ravel())

# Make predictions on the test set (still in scaled target units)
y_pred_scaled = rf_model.predict(X_test_scaled)

# Inverse transform the predicted values to return them to original scale
y_pred = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Evaluate the model again, in original velocity units
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Check predicted values
print("First few predicted values:", y_pred[:10])


Adding Acceleration as a Feature

In [None]:
# Calculate acceleration (change in velocity divided by change in time).
# A zero time step would make the ratio +/-inf, which dropna() does not remove and
# StandardScaler rejects. Masking zero steps to NaN lets the dropna() below discard
# those rows instead.
# NOTE(review): acceleration is derived from the target column itself, so using it as a
# feature may leak target information - confirm this is intended.
time_step = df['time_rel(sec)'].diff()
df['acceleration'] = df['velocity(m/s)'].diff() / time_step.where(time_step != 0)

# Drop any NaN values introduced by the diff function (and by masked zero time steps)
df = df.dropna()

# Update features (X) and target (y) to include the new acceleration feature
X = df[['time_rel(sec)', 'acceleration']]
y = df['velocity(m/s)']

# Split the dataset into training and testing sets again
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features and target with separate, freshly created scalers so this cell does not
# depend on a scaler object left over from an earlier cell.
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

# Later cells call `scaler.inverse_transform(...)` on predictions, so keep that
# name bound to the target scaler.
scaler = target_scaler

# Train the Random Forest model
rf_model.fit(X_train_scaled, y_train_scaled.ravel())

# Make predictions on the test set
y_pred_scaled = rf_model.predict(X_test_scaled)

# Inverse transform the predictions back to the original scale
y_pred = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Evaluate the model again
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
print("First few predicted values:", y_pred[:10])


Visualizing the Data to Make Sense of the Results

In [None]:
import seaborn as sns

# Scatter of velocity against relative time, drawn on an explicit axes object
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df, x='time_rel(sec)', y='velocity(m/s)', ax=ax)
ax.set_title('Velocity vs Time Relative')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Velocity (m/s)')
plt.show()


Support Vector Regression (use this model, since it gave the best result)

In [None]:
from sklearn.svm import SVR

# Fit an RBF-kernel SVR on the scaled features/target produced by the previous modelling cell.
# (You can also experiment with 'linear' or 'poly' kernels.)
svr_model = SVR(kernel='rbf').fit(X_train_scaled, y_train_scaled.ravel())

# Predict the test rows; the output is in scaled target units
y_pred_scaled = svr_model.predict(X_test_scaled)

# Map predictions back to original units.
# This relies on `scaler` having last been fitted on the target, not on the features.
y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Score in original velocity units
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
print("First few predicted values:", y_pred[:10])


Trying to Use All the Data

In [None]:
# Assuming 'time_abs(%Y-%m-%dT%H:%M:%S.%f)' and 'time_rel(sec)' are relevant features
time_abs_col = 'time_abs(%Y-%m-%dT%H:%M:%S.%f)'

# Take an explicit copy: assigning into a slice of `df` triggers SettingWithCopyWarning
# and, with `.loc[:, col] = ...`, pandas may try to keep the column's datetime dtype.
X = df[[time_abs_col, 'time_rel(sec)']].copy()  # Add more features if available
y = df['velocity(m/s)']

# Convert 'time_abs' to datetime and then to integer nanoseconds.
# 'int64' is spelled out because plain `int` can map to a 32-bit type on some platforms.
X[time_abs_col] = pd.to_datetime(X[time_abs_col]).astype('int64')

# Split FIRST, then fit the scaler on the training rows only, so test-set statistics
# do not leak into training (same order as the earlier preprocessing cell).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)  # full scaled matrix, kept in case other cells reference it

# Scale the target as the earlier SVR cell does. SVR's default epsilon is 0.1, so a target
# whose values are far smaller than that would sit entirely inside the epsilon tube.
# NOTE(review): assumes velocity magnitudes are tiny, as the earlier notes suggest - confirm.
target_scaler = StandardScaler()
y_train_std = target_scaler.fit_transform(y_train.values.reshape(-1, 1)).ravel()

# Train the model (you can choose any model, here is SVR)
model = SVR(kernel='rbf')
model.fit(X_train, y_train_std)

# Predict, map back to original velocity units (1-D, as model.predict returns), and evaluate
y_pred = target_scaler.inverse_transform(model.predict(X_test).reshape(-1, 1)).ravel()
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


In [None]:
import obspy
import pandas as pd
import matplotlib.pyplot as plt

# Input files: both paths share the same file stem and differ only in extension.
# Make sure each path points to a real file in your data folder.
miniseed_file_path = 'data/lunar/training/data/S12_GradeA/xa.s12.00.mhz.1970-01-19HR00_evid00002.mseed'  # Assuming miniseed extension
csv_file_path = 'data/lunar/training/data/S12_GradeA/xa.s12.00.mhz.1970-01-19HR00_evid00002.csv'  # Check the correct path and file extension

# --- MiniSEED (seismic waveform); obspy.read is only for miniseed-format files ---
try:
    st = obspy.read(miniseed_file_path)
    st.plot()  # Visual check of the seismic trace
except FileNotFoundError as e:
    print(f"File not found: {e}")

# --- CSV: re-read from disk, so columns added to `df` by earlier cells are discarded ---
try:
    df = pd.read_csv(csv_file_path)
    print(df.head())  # Confirm the data loaded as expected
    df.plot()  # Quick look for trends or anomalies
    plt.show()
except FileNotFoundError as e:
    print(f"File not found: {e}")

# Step 1: Velocity magnitude above which a sample counts as seismic activity
threshold = 1e-9  # Adjust this based on domain knowledge or exploratory data analysis

# Step 2: Binary label column - 1 where |velocity| exceeds the threshold, otherwise 0
exceeds_threshold = df['velocity(m/s)'].abs().gt(threshold)
df['seismic_activity'] = exceeds_threshold.astype(int)

# Step 3: Check how many samples fall into each class
print(df['seismic_activity'].value_counts())


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Step 1: Convert the datetime column to numeric format (timestamps in seconds).
# 'int64' is spelled out because plain `int` can map to a 32-bit type on some platforms.
df['time_abs_numeric'] = pd.to_datetime(df['time_abs(%Y-%m-%dT%H:%M:%S.%f)']).astype('int64') / 10**9  # Convert to seconds

# Step 2: Define features (X) and target (y).
# 'seismic_activity' is computed directly from 'velocity(m/s)' by thresholding, so
# velocity (and any column derived from it) must not be a feature: the classifier
# would just re-learn the threshold and the reported accuracy would be meaningless.
y = df['seismic_activity']
velocity_derived = [
    col for col in ('velocity(m/s)', 'velocity_smoothed', 'acceleration')
    if col in df.columns  # the derived columns only exist if earlier cells added them
]
X = df.drop(
    columns=['seismic_activity', 'time_abs(%Y-%m-%dT%H:%M:%S.%f)'] + velocity_derived
)  # Drop label, original datetime column, and leaky velocity columns

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train the logistic regression model.
# Standardizing inside a pipeline puts the remaining time features on a comparable scale
# and fits the scaler on the training rows only. `clf` still exposes .fit/.predict.
clf = make_pipeline(StandardScaler(), LogisticRegression())
clf.fit(X_train, y_train)

# Step 5: Predict on the test set
y_pred = clf.predict(X_test)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Step 7: Output the results
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Re-run the fitted classifier on the held-out rows
y_pred = clf.predict(X_test)

# Compute all three evaluation artefacts up front
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Headline metric first
print(f"Accuracy: {accuracy}")

# Detailed breakdown: confusion matrix, then the per-class report
print("Confusion Matrix:")
print(conf_matrix)

print("Classification Report:")
print(class_report)