# Task 1

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Function to get the target column
def get_target_column(file_path):
    """
    Get the name of the target column from the CSV file.

    Only the header row is parsed (``nrows=0``), so no data rows are loaded.
    The target is taken to be the last column of the file.

    Parameters:
    file_path (str): Path to the CSV file.

    Returns:
    str: Name of the target column (the last column in the header).
    """
    # Read the header from file_path instead of relying on a global `df`,
    # which does not exist yet when this function is first called.
    header = pd.read_csv(file_path, nrows=0)
    return header.columns[-1]
# Function to load the dataset
def load_data(file_path):
    """
    Read a CSV file into a DataFrame.

    Parameters:
    file_path (str): Path to the CSV file.

    Returns:
    pd.DataFrame: Contents of the file, parsed with pandas' default settings.
    """
    dataset = pd.read_csv(file_path)
    return dataset

# Function to display basic information about the dataset
def display_basic_info(df):
    """
    Print a structural overview of the dataset.

    Parameters:
    df (pd.DataFrame): The dataset to inspect.
    """
    print("Dataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so the second print emits the literal text "None" after the report.
    info_result = df.info()
    print(info_result)

# Function to check for missing values
def check_missing_values(df):
    """
    Count the missing entries in every column.

    Parameters:
    df (pd.DataFrame): The dataset to check.

    Returns:
    pd.Series: Number of missing values per column, indexed by column name.
    """
    missing_mask = df.isna()
    return missing_mask.sum()

# Function to display summary statistics of the dataset
def display_summary_statistics(df):
    """
    Print the descriptive statistics table for the dataset.

    Parameters:
    df (pd.DataFrame): The dataset to summarize.
    """
    print("\nSummary Statistics:")
    summary_table = df.describe()
    print(summary_table)

# Function to visualize missing values
def plot_missing_values(df):
    """
    Show where values are missing as a heatmap (one cell per DataFrame entry).

    Parameters:
    df (pd.DataFrame): The dataset to visualize.
    """
    plt.figure(figsize=(10, 5))
    null_mask = df.isnull()
    # Row labels and the colour bar are turned off in the heatmap call.
    sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)
    plt.title("Missing Values Heatmap")
    plt.show()

# Function to plot the distribution of the target variable (e.g., DON concentration)
def plot_target_distribution(df, target_column):
    """
    Draw a 30-bin histogram with a KDE overlay for the target variable.

    The title and axis labels are fixed to "DON Concentration" regardless of
    which column is passed.

    Parameters:
    df (pd.DataFrame): The dataset containing the target column.
    target_column (str): The name of the target column.
    """
    plt.figure(figsize=(8, 5))
    target_values = df[target_column]
    sns.histplot(target_values, bins=30, kde=True)
    plt.title("Distribution of DON Concentration")
    plt.xlabel("DON Concentration")
    plt.ylabel("Frequency")
    plt.show()

# Function to visualize outliers in spectral features
def plot_boxplot(df):
    """
    Draw one box per column to expose outliers in the spectral features.

    Every column except the last is plotted; the last column is assumed to be
    the target.

    Parameters:
    df (pd.DataFrame): The dataset to visualize.
    """
    plt.figure(figsize=(12, 6))
    all_but_target = df.iloc[:, :-1]
    sns.boxplot(data=all_but_target)
    plt.xticks(rotation=90)
    plt.title("Boxplot of Spectral Features")
    plt.show()

# Main function to execute the data exploration
def data_exploration(file_path, target_column):
    """
    Run the full exploratory pass over one CSV file.

    The file is loaded once, then summarized as text (structure, missing-value
    counts, descriptive statistics) and as plots (missing-value heatmap, target
    histogram, feature boxplot), in that order.

    Parameters:
    file_path (str): Path to the CSV file.
    target_column (str): The name of the target column.
    """
    dataset = load_data(file_path)

    # --- Text reports ---
    display_basic_info(dataset)

    null_counts = check_missing_values(dataset)
    print("\nMissing Values:")
    print(null_counts)

    display_summary_statistics(dataset)

    # --- Plots ---
    plot_missing_values(dataset)
    plot_target_distribution(dataset, target_column)
    plot_boxplot(dataset)

# Run the Task 1 exploration. `file_path` stays defined at module level and is
# reused by the preprocessing cell further down.
file_path = "/Users/srinivaskalyan/Downloads/ImageAI/MLE-Assignment.csv"  # Update this with your actual file path
target_column = get_target_column(file_path)  # Target column name as reported by get_target_column
data_exploration(file_path, target_column)


# Task 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Function to load data
def load_data(file_path):
    """
    Read a CSV file into a DataFrame.

    Parameters:
    file_path (str): Path to the CSV file.

    Returns:
    pd.DataFrame: Contents of the file, parsed with pandas' default settings.
    """
    dataset = pd.read_csv(file_path)
    return dataset

# Function to handle missing values
def handle_missing_values(df):
    """
    Drop every row that contains at least one missing value.

    Parameters:
    df (pd.DataFrame): The dataset to clean. It is not modified.

    Returns:
    pd.DataFrame: A new DataFrame holding only the complete rows.
    """
    # dropna() without inplace=True already returns a fresh DataFrame, so the
    # caller's frame is left untouched.
    complete_rows = df.dropna()
    return complete_rows

# Function to detect and remove outliers using IQR
def remove_outliers(df):
    """
    Drop rows that hold an IQR outlier in any feature column.

    Feature columns are every column except the first and the last. A value is
    an outlier when it lies more than 1.5 * IQR below the first quartile or
    above the third quartile of its own column.

    Parameters:
    df (pd.DataFrame): The dataset to filter. It is not modified.

    Returns:
    pd.DataFrame: The rows of a copy of `df` with no outlying feature value.
    """
    working = df.copy()
    features = working.iloc[:, 1:-1]

    q1 = features.quantile(0.25)
    q3 = features.quantile(0.75)
    whisker = 1.5 * (q3 - q1)

    too_low = features < (q1 - whisker)
    too_high = features > (q3 + whisker)

    # Keep a row only if none of its feature values is flagged.
    keep_rows = ~(too_low | too_high).any(axis=1)
    return working[keep_rows]

# Function to cap outliers (Winsorization)
def cap_outliers(df):
    """
    Clamp feature values to the IQR fences of their column.

    Feature columns are every column except the first and the last. Values
    below Q1 - 1.5 * IQR are raised to that bound, and values above
    Q3 + 1.5 * IQR are lowered to that bound. No rows are removed.

    Parameters:
    df (pd.DataFrame): The dataset to cap. It is not modified.

    Returns:
    pd.DataFrame: A copy of `df` with the feature columns capped.
    """
    capped = df.copy()
    feature_cols = slice(1, -1)

    # The fences are computed from the data before any value is altered.
    q1 = capped.iloc[:, feature_cols].quantile(0.25)
    q3 = capped.iloc[:, feature_cols].quantile(0.75)
    iqr = q3 - q1
    floor = q1 - 1.5 * iqr
    ceiling = q3 + 1.5 * iqr

    # Pass 1: raise values that fall below the lower fence.
    current = capped.iloc[:, feature_cols]
    capped.iloc[:, feature_cols] = np.where(current < floor, floor, current)

    # Pass 2: re-read the updated features, then lower values above the upper fence.
    current = capped.iloc[:, feature_cols]
    capped.iloc[:, feature_cols] = np.where(current > ceiling, ceiling, current)
    return capped

# # Function to normalize spectral data
# def normalize_data(df):
#     df.iloc[:, 1:-1] = (df.iloc[:, 1:-1] - df.iloc[:, 1:-1].min()) / (df.iloc[:, 1:-1].max() - df.iloc[:, 1:-1].min())
#     return df

# Function to plot boxplots before and after outlier removal
def plot_outliers(df_before, df_after, df_capped):
    """
    Show feature boxplots for three versions of the dataset side by side.

    Each panel plots every column except the first and the last.

    Parameters:
    df_before (pd.DataFrame): Dataset before any outlier treatment.
    df_after (pd.DataFrame): Dataset after outlier rows were removed.
    df_capped (pd.DataFrame): Dataset after outliers were capped.
    """
    _, axes = plt.subplots(1, 3, figsize=(18, 6))
    panels = (
        (df_before, "Before Outlier Removal"),
        (df_after, "After Outlier Removal"),
        (df_capped, "After Capping Outliers"),
    )
    for ax, (frame, title) in zip(axes, panels):
        sns.boxplot(data=frame.iloc[:, 1:-1], ax=ax).set_title(title)
        # plt.xticks() only acts on the current (last) axes, which left the
        # first two panels unrotated; rotate the labels on each axes instead.
        ax.tick_params(axis="x", labelrotation=90)
    plt.show()

# Function to plot mean spectral reflectance
def plot_mean_reflectance(df):
    """
    Plot the per-column mean of the feature columns with a +/- 1 std band.

    Feature columns are every column except the first and the last; their
    column labels are used as the x-axis values.

    Parameters:
    df (pd.DataFrame): The dataset to visualize.
    """
    spectral = df.iloc[:, 1:-1]
    center = spectral.mean()
    spread = spectral.std()
    x_labels = df.columns[1:-1]

    plt.figure(figsize=(12, 6))
    plt.plot(x_labels, center, label="Mean Reflectance", color='b')
    # Shaded band: one standard deviation on either side of the mean.
    band_low = center - spread
    band_high = center + spread
    plt.fill_between(x_labels, band_low, band_high, color='b', alpha=0.2)
    plt.xticks(rotation=90)
    plt.xlabel("Wavelength")
    plt.ylabel("Normalized Reflectance")
    plt.title("Average Spectral Reflectance with Standard Deviation")
    plt.legend()
    plt.show()

# Function to plot feature correlation heatmap
def plot_correlation_heatmap(df):
    """
    Draw a heatmap of the pairwise correlations between feature columns.

    Feature columns are every column except the first and the last.

    Parameters:
    df (pd.DataFrame): The dataset to visualize.
    """
    plt.figure(figsize=(12, 8))
    correlations = df.iloc[:, 1:-1].corr()
    sns.heatmap(correlations, cmap="coolwarm", annot=False, fmt=".2f", linewidths=0.5)
    plt.title("Feature Correlation Heatmap")
    plt.show()

# Main function to execute preprocessing
def preprocess_data(file_path):
    """
    Load the CSV, drop incomplete rows, and apply both outlier treatments.

    The cleaned frame is processed two ways: once with outlier rows removed
    and once with outliers capped. Both are plotted, but only the capped
    version is returned.

    Parameters:
    file_path (str): Path to the CSV file.

    Returns:
    pd.DataFrame: The cleaned dataset with outliers capped.
    """
    df = handle_missing_values(load_data(file_path))

    # Both treatments start from the same cleaned frame.
    df_no_outliers = remove_outliers(df)
    df_capped = cap_outliers(df)

    # Report how many samples the row-removal variant discards.
    print(f"Original dataset size: {df.shape[0]} samples")
    print(f"Cleaned dataset size (outliers removed): {df_no_outliers.shape[0]} samples")
    print(f"Samples removed: {df.shape[0] - df_no_outliers.shape[0]}")

    plot_outliers(df, df_no_outliers, df_capped)

    # The remaining plots use the capped data only.
    plot_mean_reflectance(df_capped)
    plot_correlation_heatmap(df_capped)

    return df_capped

# Run the Task 2 preprocessing on the same CSV. preprocess_data returns the
# outlier-capped DataFrame, which the model-training cell slices into X and y.
df_preprocessed = preprocess_data(file_path)

# Training the Neural Networks

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Function to normalize data using Min-Max Scaling
def normalize_data(X):
    """
    Min-Max scale the feature matrix and persist the fitted scaler.

    Parameters:
    X (array-like): Feature matrix to scale.

    Returns:
    np.ndarray: The scaled feature matrix.

    Side effects:
    Writes the fitted scaler to 'minmax_scaler.pkl' in the working directory.
    """
    minmax = MinMaxScaler()
    minmax.fit(X)
    scaled = minmax.transform(X)
    # Persist the fitted scaler so the same scaling can be replayed later.
    joblib.dump(minmax, 'minmax_scaler.pkl')
    return scaled

# Function to apply PCA
def apply_pca(X):
    """
    Project the feature matrix onto its principal components and persist the
    fitted PCA model.

    As many components as possible are kept. PCA cannot produce more
    components than min(n_samples, n_features), so the count is capped there
    rather than always requesting n_features (which raises a ValueError when
    there are fewer samples than features).

    Parameters:
    X (np.ndarray): 2-D feature matrix of shape (n_samples, n_features).

    Returns:
    np.ndarray: The PCA-transformed feature matrix.

    Side effects:
    Writes the fitted PCA model to 'pca_model_best.pkl' in the working directory.
    """
    n_components = min(X.shape[0], X.shape[1])
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    joblib.dump(pca, 'pca_model_best.pkl')  # Save PCA model
    return X_pca

# Function to create the neural network model
def build_model(input_dim):
    """
    Build and compile the feed-forward regression network.

    Architecture: Dense(128) -> Dropout(0.2) -> Dense(64) -> Dropout(0.2)
    -> Dense(32) -> Dense(1), with ReLU on the hidden layers and a linear
    single-unit output.

    Parameters:
    input_dim (int): Number of input features.

    Returns:
    Sequential: Model compiled with the Adam optimizer, MSE loss and an MAE metric.
    """
    stack = [
        Dense(128, activation='relu', input_shape=(input_dim,)),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1),  # single linear unit: regression output
    ]
    network = Sequential(stack)
    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return network

# Function to train and evaluate the model
def train_and_evaluate_model(X_train, X_test, y_train, y_test,
                             model_path='/content/don_concentration_predictor.h5'):
    """
    Train the neural network, report test metrics, plot the learning curves
    and save the trained model.

    Parameters:
    X_train (np.ndarray): Training features.
    X_test (np.ndarray): Test features (also used as validation data during fit).
    y_train (np.ndarray): Training targets.
    y_test (np.ndarray): Test targets.
    model_path (str): Where to save the trained model. Defaults to the original
        Colab location; pass a different path when running outside Colab,
        where '/content' normally does not exist.
    """
    model = build_model(X_train.shape[1])

    # NOTE(review): the test split doubles as the validation set, so the
    # validation curves are not independent of the reported test metrics.
    history = model.fit(X_train, y_train, epochs=100, batch_size=16,
                        validation_data=(X_test, y_test), verbose=1)

    # Predict on test data
    y_pred = model.predict(X_test)

    # Compute evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print("Neural Network Performance:")
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.4f}")

    # Plot training history: loss on the left, MAE on the right
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('MSE Loss')
    plt.legend()
    plt.title('Training vs Validation Loss')

    plt.subplot(1, 2, 2)
    plt.plot(history.history['mae'], label='Train MAE')
    plt.plot(history.history['val_mae'], label='Validation MAE')
    plt.xlabel('Epochs')
    plt.ylabel('MAE')
    plt.legend()
    plt.title('Training vs Validation MAE')

    plt.show()

    model.save(model_path)
    print("Model saved successfully!")

# Build the modelling arrays from the preprocessed frame: every column except
# the first and the last is a feature, and the last column is the target.
X = df_preprocessed.iloc[:, 1:-1].values  # Spectral features
y = df_preprocessed.iloc[:, -1].values    # Target (DON concentration)

# Normalize Data
# NOTE(review): the scaler and the PCA below are fitted on the full dataset
# before the train/test split, so test rows influence both transforms.
# Consider fitting them on the training split only.
X_scaled = normalize_data(X)
# Apply PCA
X_pca = apply_pca(X_scaled)

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Train and evaluate the model
train_and_evaluate_model(X_train, X_test, y_train, y_test)




# Training Random Forest and XGBoost

In [None]:
def train_random_forest(X_train, X_test, y_train, y_test):
    """
    Fit a 100-tree random forest regressor and predict on the test features.

    Parameters:
    X_train (np.ndarray): Training features.
    X_test (np.ndarray): Test features to predict on.
    y_train (np.ndarray): Training targets.
    y_test (np.ndarray): Test targets. Unused here; kept so the signature
        matches the other training helpers.

    Returns:
    tuple: (fitted RandomForestRegressor, predictions for X_test).
    """
    # RandomForestRegressor is not imported by any visible notebook cell;
    # import it here so the function does not fail with a NameError.
    from sklearn.ensemble import RandomForestRegressor

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    return rf, y_pred

# Function to train an XGBoost model
def train_xgboost(X_train, X_test, y_train, y_test):
    """
    Fit an XGBoost regressor and predict on the test features.

    Parameters:
    X_train (np.ndarray): Training features.
    X_test (np.ndarray): Test features to predict on.
    y_train (np.ndarray): Training targets.
    y_test (np.ndarray): Test targets. Unused here.

    Returns:
    tuple: (fitted XGBRegressor, predictions for X_test).
    """
    # NOTE(review): XGBRegressor is not imported in the visible cells -
    # confirm that `from xgboost import XGBRegressor` runs before this is called.
    booster = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    booster.fit(X_train, y_train)
    test_predictions = booster.predict(X_test)
    return booster, test_predictions

# Train and evaluate Random Forest on the same split used for the neural network.
# NOTE(review): evaluate_model is not defined in the visible cells - confirm it
# is defined elsewhere before this cell is run.
rf_model, y_pred_rf = train_random_forest(X_train, X_test, y_train, y_test)
evaluate_model(y_test, y_pred_rf, "Random Forest")

# Train and evaluate XGBoost
xgb_model, y_pred_xgb = train_xgboost(X_train, X_test, y_train, y_test)
evaluate_model(y_test, y_pred_xgb, "XGBoost")

# Save models (written to the current working directory)
joblib.dump(rf_model, 'random_forest_model.pkl')
joblib.dump(xgb_model, 'xgboost_model.pkl')

In [None]:
import joblib
import numpy as np
from tensorflow import keras

# Load the XGBoost model
# Step 1: Load the persisted inference artefacts. They are read from the
# working directory, which is where the training cells write them (the PCA
# model is saved as 'pca_model_best.pkl', not under '/content/').
# NOTE: joblib.load unpickles its input - only load files you produced yourself.
xgb_model = joblib.load('xgboost_model.pkl')
scaler = joblib.load('minmax_scaler.pkl')
pca = joblib.load('pca_model_best.pkl')

# Step 2: Example new data (500 features)
new_spectrum = np.random.rand(500)

# Step 3: Trim the input to the number of features the scaler was fitted on
# (read from the scaler rather than hard-coding 448).
n_expected = scaler.n_features_in_
if new_spectrum.shape[0] > n_expected:
    new_spectrum = new_spectrum[:n_expected]  # Truncate the input if it's larger

# Step 4: Replay the training-time preprocessing in the same order:
# Min-Max scaling first, then PCA. Skipping the scaler would feed the PCA
# data on a different scale from the one it was fitted on.
scaled_spectrum = scaler.transform(new_spectrum.reshape(1, -1))
transformed_spectrum = pca.transform(scaled_spectrum)

# Step 5: Make prediction using the loaded XGBoost model
prediction_xgb = xgb_model.predict(transformed_spectrum)
prediction_xgb_value = float(prediction_xgb[0])
print(f"XGBoost Prediction: {prediction_xgb_value}")
