# Importing modules and setting up paths

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Adding path to scripts so that modules from ../scripts can be imported.
# Guarded with a membership check so that re-running this cell does not keep
# appending duplicate entries to sys.path.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

## Importing functions from the project

In [None]:
from preprocessing_feature_engineering import load_and_preprocess_data, split_and_scale
from model_selection import (
    train_models_with_gridsearch,
    evaluate_models,
    plot_confusion_matrix,
    plot_learning_curve,
    save_best_model,
    update_readme
)
from predict import predict_and_save

## Load and initial analysis of data (train.csv)

In [None]:
# Location of the training CSV (reused by later cells)
path_train = '../data/train.csv'

# Read the training data; the summary in the `else` branch runs only when the
# read succeeded, otherwise a short message is printed instead.
try:
    df = pd.read_csv(path_train)
except FileNotFoundError:
    print(f"Error: File {path_train} not found")
else:
    print('Data shape:', df.shape)

    print('\nFirst 5 rows:')
    display(df.head())

    # Relative frequency of each Cover_Type value
    print('\nCover_Type class distribution:')
    class_shares = df['Cover_Type'].value_counts(normalize=True)
    print(class_shares)

    # Number of null entries per column
    print('\nMissing values:')
    missing_per_column = df.isnull().sum()
    print(missing_per_column)

## Load test.csv and compare with train.csv

In [None]:
# Reading test.csv and comparing it with the training data (`df`, loaded in
# the previous cell).
plots_dir = '../results/plots'
# Create the output folder before anything else so every savefig in this cell
# has somewhere to write, regardless of which step below fails.
os.makedirs(plots_dir, exist_ok=True)

# Reset first: a failed read must not leave `test_df` undefined, or pointing
# at data from an earlier run of this cell.
test_df = None
try:
    test_df = pd.read_csv('../data/test.csv')
    print('\nData comparison:')
    print(f'train.csv shape: {df.shape}')
    print(f'test.csv shape: {test_df.shape}')
    print('Train columns:', df.columns.tolist())
    print('Test columns:', test_df.columns.tolist())

    print('\nMissing values in test.csv:')
    print(test_df.isnull().sum())

    # Compare distributions for the first 3 numerical columns (excluding Cover_Type)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_cols = numeric_cols.drop('Cover_Type', errors='ignore')

    for col in numeric_cols[:3]:
        plt.figure(figsize=(8, 4))
        sns.kdeplot(df[col], label='train', alpha=0.5)
        sns.kdeplot(test_df[col], label='test', alpha=0.5)
        plt.title(f'Distribution of {col}')
        plt.legend()
        plt.savefig(f'{plots_dir}/distribution_{col}.png')
        plt.show()
except FileNotFoundError:
    print("test.csv not found")
except Exception as e:
    # Best-effort cell: report the problem and let the notebook continue.
    print(f"Error during comparison: {e}")

# Plot of Cover_Type class distribution for the test data.
# Only drawn when test.csv was actually loaded and carries a Cover_Type
# column; otherwise the reason for skipping is printed instead of raising.
if test_df is None:
    print('Skipping test class distribution plot: test.csv was not loaded')
elif 'Cover_Type' not in test_df.columns:
    print('Skipping test class distribution plot: no Cover_Type column in test.csv')
else:
    plt.figure(figsize=(8, 6))
    sns.countplot(x='Cover_Type', data=test_df)
    plt.title('Cover_Type class distribution for test')
    plt.xlabel('Cover Type')
    plt.ylabel('Count')
    # Saved under its own file name: the training-data plot is written to
    # class_distribution.png and would otherwise overwrite this figure.
    plt.savefig(f'{plots_dir}/class_distribution_test.png')
    plt.show()

## Additional plots

In [None]:
# Make sure the output folder exists: this cell must be able to save its
# figures even when no earlier cell has created the directory.
os.makedirs('../results/plots', exist_ok=True)

# Plot of Cover_Type class distribution (training data)
plt.figure(figsize=(8, 6))
sns.countplot(x='Cover_Type', data=df)
plt.title('Cover_Type class distribution')
plt.xlabel('Cover Type')
plt.ylabel('Count')
plt.savefig('../results/plots/class_distribution.png')
plt.show()

# Correlation matrix over all columns of the training data.
# NOTE(review): this assumes every column of train.csv is numeric; recent
# pandas versions raise in `df.corr()` when a column cannot be converted.
plt.figure(figsize=(15, 15))
sns.heatmap(df.corr(), cmap='coolwarm', annot=False)
plt.title('Correlation matrix')
plt.savefig('../results/plots/correlation_matrix.png')
plt.show()

## Analysis of new features

In [None]:
# Run the project's preprocessing on the training CSV so the engineered
# columns plotted below are available.
df_processed = load_and_preprocess_data('../data/train.csv')

# One boxplot per engineered feature, grouped by Cover_Type.
# Each entry pairs the column to plot with the file the figure is saved to.
feature_plots = (
    ('Distance_to_hydrology', '../results/plots/distance_to_hydrology.png'),
    ('Shadow_importance', '../results/plots/shadow_importance.png'),
)
for feature, output_path in feature_plots:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='Cover_Type', y=feature, data=df_processed)
    plt.title(f'Distribution of {feature} by class')
    plt.savefig(output_path)
    plt.show()

## Data preprocessing, splitting, and model training

In [None]:
# Preprocess the training CSV and split the result into train/test parts
# (the helper name suggests scaling happens in the same step).
df_processed = load_and_preprocess_data(path_train)
X_train, X_test, y_train, y_test = split_and_scale(df_processed)
for caption, split in (('Training set size:', X_train), ('Test set size:', X_test)):
    print(caption, split.shape)

# Train the candidate models, then evaluate them on the held-out split.
# `results` is used below as a mapping from model name to a dict holding at
# least 'best_params' and 'best_model'.
results, best_model = train_models_with_gridsearch(X_train, y_train)
test_accuracies, test_f1_scores = evaluate_models(results, X_test, y_test)

# Per-model report: tuned parameters followed by the confusion matrix
for name in results:
    result = results[name]
    print(f"\nModel: {name}")
    print(f"Best parameters: {result['best_params']}")
    cm_df = plot_confusion_matrix(result["best_model"], X_test, y_test)
    print(cm_df)
       

## Analysis of the best model and result visualization

In [None]:
# Confusion matrix of the selected model on the held-out split
cm_df = plot_confusion_matrix(best_model, X_test, y_test)
print('Confusion Matrix:\n', cm_df)

# Learning curve of the selected model on the training split.
# The value is passed through as `cv` (presumably the number of
# cross-validation folds -- confirm against plot_learning_curve).
cv_folds = 5
plot_learning_curve(best_model, X_train, y_train, cv=cv_folds)

## Save the model, update README and make predictions on test data

In [None]:
# Persist the selected model.
# NOTE(review): the prediction step below loads '../results/best_model.pkl';
# presumably that is where save_best_model writes -- confirm.
save_best_model(best_model)

# Write the training results into the README
update_readme(results, test_f1_scores)

# Predict on test.csv with the saved model and store the predictions.
# Argument roles are inferred from the values passed (input CSV, model file,
# output CSV).
test_csv_path = '../data/test.csv'
model_path = '../results/best_model.pkl'
predictions_path = '../results/test_predictions.csv'
predict_and_save(test_csv_path, model_path, predictions_path)