# imports

In [None]:
!pip install catboost
!pip install xgboost lightgbm catboost scikit-learn pandas matplotlib seaborn

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
import warnings
# Suppress every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

 # 1: Data Loading and Initial Exploration

In [None]:
# Load the data (all four CSV files live in the same directory)
DATA_DIR = '/content/drive/MyDrive/DataSets/OFP'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
original = pd.read_csv(f'{DATA_DIR}/Fertilizer_Prediction.csv')

# Horizontal rule printed above and below each section title
RULE = "=" * 50

# 1. Basic Information: shapes, column names and total number of missing cells
print(RULE)
print("Basic Dataset Information")
print(RULE)
print("\nTrain shape:", train.shape)
print("Test shape:", test.shape)
print("\nTrain columns:", train.columns.tolist())
print("\nMissing values in train set:", train.isna().sum().sum())
print("Missing values in test set:", test.isna().sum().sum())

# 2. Data Description: summary statistics, one row per numerical feature
print("\n" + RULE)
print("Numerical Features Description")
print(RULE)
num_cols = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
print(train[num_cols].describe().T)

# Cardinality and the ten most frequent values (as fractions) of each categorical column
print("\n" + RULE)
print("Categorical Features Description")
print(RULE)
cat_cols = ['Soil Type', 'Crop Type', 'Fertilizer Name']
for col in cat_cols:
    shares = train[col].value_counts(normalize=True)
    print(f"\n{col} unique values:", train[col].nunique())
    print(shares.head(10))

# 2: Basic EDA

### Split Distribution Histograms (Test and Train as separate plots first, then combined)

In [None]:
# Numerical features to analyze
num_cols = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']

banner = "=" * 60

# Part 1: one figure per feature, train histogram on the left and test on the right
print(banner)
print("Individual Train/Test Distributions")
print(banner)
panels = (('Train', train, 'blue'), ('Test', test, 'orange'))
for col in num_cols:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, (split_name, frame, shade) in zip(axes, panels):
        sns.histplot(frame[col], kde=True, color=shade, ax=ax)
        ax.set_title(f'{split_name} {col} Distribution')
        ax.set_xlabel('')
    fig.tight_layout()
    plt.show()

# Part 2: a 3x2 grid with train and test overlaid on the same axes for each feature
print("\n" + banner)
print("Combined Distribution Comparison")
print(banner)
fig, grid = plt.subplots(3, 2, figsize=(15, 20))
for ax, col in zip(grid.ravel(), num_cols):
    sns.histplot(train[col], color='blue', label='Train', kde=True, alpha=0.6, ax=ax)
    sns.histplot(test[col], color='orange', label='Test', kde=True, alpha=0.6, ax=ax)
    ax.set_title(f'{col} Distribution', fontsize=12)
    ax.legend()

fig.tight_layout()
plt.show()

### Target Data Distribution (Fertilizer Name)

In [None]:
# Visualise how the numerical features relate to the target ('Fertilizer Name')
# and how the target classes themselves are distributed in the train set.
# Relies on `train` and `num_cols` defined in earlier cells.

# Set style and color palette
plt.style.use('seaborn-v0_8')
sns.set_style("whitegrid")
# One "husl" colour per distinct fertilizer name
custom_palette = sns.color_palette("husl", len(train['Fertilizer Name'].unique()))

# 1. Box Plot of Numerical Features by Fertilizer Type
# (one separate figure per numerical column)
for col in num_cols:
    plt.figure(figsize=(10, 5))  # Smaller figure size
    ax = sns.boxplot(x='Fertilizer Name', y=col, data=train,
                    palette=custom_palette,
                    width=0.6,  # Narrower boxes
                    fliersize=3)  # Smaller outlier markers

    plt.title(f'{col} by Fertilizer Type', fontsize=12, pad=10)
    plt.xticks(rotation=45, ha='right', fontsize=9)
    plt.yticks(fontsize=9)
    plt.ylabel(col, fontsize=10)
    plt.xlabel('Fertilizer Name', fontsize=10)

    # Add horizontal grid lines
    ax.yaxis.grid(True, linestyle='--', alpha=0.4)
    ax.set_axisbelow(True)

    plt.tight_layout()
    plt.show()

# 2. Combined Nutrient Box Plot
plt.figure(figsize=(12, 6))
# Reshape to long format: one row per (record, nutrient) with the nutrient's value
melted_df = train.melt(id_vars=['Fertilizer Name'],
                      value_vars=['Nitrogen', 'Potassium', 'Phosphorous'],
                      var_name='Nutrient',
                      value_name='Value')

ax = sns.boxplot(x='Nutrient', y='Value', hue='Fertilizer Name',
                data=melted_df, palette=custom_palette,
                width=0.7, linewidth=1)

plt.title('Nutrient Levels by Fertilizer Type', fontsize=12, pad=10)
plt.xlabel('Nutrient', fontsize=10)
plt.ylabel('Value', fontsize=10)
plt.xticks(fontsize=9)
plt.yticks(fontsize=9)

# legend: re-created from the axes' handles and anchored outside the plot area
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles, labels, bbox_to_anchor=(1.05, 1),
           loc='upper left', fontsize=8, title='Fertilizer',
           title_fontsize=9)

plt.grid(axis='y', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()

# 3. Target Distribution Trio (Improved)
# Three panels in one row: class counts, class shares, and the spread of the counts
plt.figure(figsize=(15, 5))

# Count Plot (classes ordered from most to least frequent)
plt.subplot(1, 3, 1)
sns.countplot(y='Fertilizer Name', data=train,
             order=train['Fertilizer Name'].value_counts().index,
             palette=custom_palette)
plt.title('Fertilizer Distribution (Count)', fontsize=12)
plt.xlabel('Count', fontsize=10)
plt.ylabel('', fontsize=10)
plt.xticks(fontsize=9)
plt.yticks(fontsize=9)

# Pie Chart (percentage text is shown only on slices larger than 5%)
plt.subplot(1, 3, 2)
train['Fertilizer Name'].value_counts().plot.pie(
    autopct=lambda p: f'{p:.1f}%' if p > 5 else '',
    startangle=90,
    colors=custom_palette,
    wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
    textprops={'fontsize': 9}
)
plt.title('Percentage Distribution', fontsize=12)
plt.ylabel('')

# Box Plot of Counts (a single box summarising the per-class record counts)
plt.subplot(1, 3, 3)
sns.boxplot(x=train['Fertilizer Name'].value_counts().values,
           color='skyblue', width=0.4)
plt.title('Fertilizer Count Distribution', fontsize=12)
plt.xlabel('Records per Type', fontsize=10)
plt.xticks(fontsize=9)

plt.tight_layout()
plt.show()

### Numeric Feature Relationships and Correlation (Train Set)

In [None]:
# Pairwise relationships and linear correlation of the train-set features.
# Relies on `train` and `num_cols` defined in earlier cells.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

# 1. Pair plot of the raw train frame (seaborn plots its numeric columns)
print("\n" + "="*60)
print("Pair Plot for Train Set")
print("="*60 + "\n")

sns.pairplot(train)
# Render immediately so the figure appears under its own header instead of
# being flushed together with the next pair plot.
plt.show()

# 2. Pair plot of the numerical features, coloured by fertilizer type
print("\n" + "="*60)
print("Train Set: Pairwise Relationships")
print("="*60 + "\n")

# sns.pairplot is a figure-level function and creates its own figure; calling
# plt.figure() first would only leave an empty canvas in the cell output.
train_pair = sns.pairplot(
    train[num_cols + ['Fertilizer Name']],
    hue='Fertilizer Name',
    palette='viridis',
    plot_kws={'alpha': 0.7, 's': 20, 'edgecolor': 'k', 'linewidth': 0.3},
    diag_kind='kde',
    corner=True,   # lower triangle only
    height=2.5
)
train_pair.fig.suptitle(
    'Train Set: Pairwise Feature Relationships by Fertilizer Type',
    y=1.02,        # place the title just above the grid
    fontsize=14
)
plt.show()

# 3. Correlation Heatmap (Train Set)
print("\n" + "="*60)
print("Train Set Correlation Analysis")
print("="*60 + "\n")

plt.figure(figsize=(10, 8))
corr_matrix = train[num_cols].corr()
# Hide the diagonal and the upper triangle, which mirror the lower triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    mask=mask,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    annot_kws={"size": 10}
)
plt.title('Train Set: Numerical Features Correlation', fontsize=14)
plt.xticks(rotation=45)
plt.show()

### Categorical Data Distribution (Pie and Bar Charts, Train vs. Test)

In [None]:
# Distribution of the categorical predictors in train and test, followed by a
# side-by-side comparison of their relative frequencies.
cat_cols = ['Soil Type', 'Crop Type']

## Categorical Feature Analysis
print("\n" + "="*60)
print("Individual Categorical Feature Distributions")
print("="*60 + "\n")

# Custom color palette
palette = sns.color_palette('pastel')

# For every feature draw, in order: Train pie, Train bar, Test pie, Test bar.
for col in cat_cols:
    for split_name, frame in (('Train', train), ('Test', test)):
        counts = frame[col].value_counts()

        # Pie chart of the category shares
        plt.figure(figsize=(5, 5))
        counts.plot.pie(autopct='%1.1f%%',
                        startangle=90,
                        colors=palette,
                        wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
                        textprops={'fontsize': 8})
        plt.title(f'{split_name} {col}\n', fontsize=10)
        plt.ylabel('')
        plt.show()

        # Horizontal bar chart of the raw counts, most frequent category first
        plt.figure(figsize=(6, 3))
        sns.countplot(y=col, data=frame, order=counts.index, palette=palette)
        plt.title(f'{split_name} {col}\n', fontsize=10)
        plt.xlabel('Count', fontsize=8)
        plt.ylabel('', fontsize=8)
        plt.tick_params(axis='both', which='major', labelsize=8)
        plt.show()

## Comparison Between Train and Test
print("\n" + "="*60)
print("Train-Test Distribution Comparison")
print("="*60 + "\n")

for col in cat_cols:
    # Prepare data: relative frequency of every category in each split;
    # a category missing from one split gets a frequency of 0.
    train_counts = train[col].value_counts(normalize=True).reset_index()
    train_counts.columns = [col, 'Train']
    test_counts = test[col].value_counts(normalize=True).reset_index()
    test_counts.columns = [col, 'Test']
    merged = pd.merge(train_counts, test_counts, on=col, how='outer').fillna(0)

    # Plot on an explicitly created axes: DataFrame.plot opens a new figure
    # when no `ax` is given, which would ignore the requested figsize and
    # leave an empty figure behind.
    fig, ax = plt.subplots(figsize=(8, 4))
    merged.set_index(col).plot(kind='bar', color=['skyblue', 'salmon'], ax=ax)
    ax.set_title(f'{col} Comparison\n', fontsize=10)
    ax.set_xlabel('', fontsize=8)
    # Values come from value_counts(normalize=True), i.e. fractions in [0, 1]
    ax.set_ylabel('Proportion', fontsize=8)
    ax.tick_params(axis='x', labelrotation=45, labelsize=8)
    ax.tick_params(axis='y', labelsize=8)
    ax.legend(title='Dataset', fontsize=8)
    ax.grid(axis='y', linestyle='--', alpha=0.5)
    fig.tight_layout()
    plt.show()

### Correlation Analysis

In [None]:
# Correlation of every feature with the (label-encoded) target.
# Relies on `train`, `cat_cols` and `num_cols` defined in earlier cells.
print("\n" + "="*60)
print("Target-Focused Correlation Analysis")
print("="*60 + "\n")

# 1. Prepare encoded data for correlation analysis
# The row identifier is not a feature, so it is excluded; otherwise it can end
# up among the "top" features and be used as a hue in the plots below.
encoded_data = train.drop(columns=['id'], errors='ignore')
# Encode categorical features for correlation calculation.
# NOTE: LabelEncoder assigns arbitrary integer codes to nominal categories, so
# the resulting Pearson coefficients are only a rough screening signal.
le = LabelEncoder()
for col in cat_cols:
    encoded_data[col] = le.fit_transform(encoded_data[col])
encoded_data['Fertilizer Name'] = le.fit_transform(encoded_data['Fertilizer Name'])

# 2. Calculate correlations with target, sorted by absolute strength
corr_with_target = encoded_data.corr()[['Fertilizer Name']].drop('Fertilizer Name')
corr_with_target.columns = ['Correlation']
corr_with_target['Absolute_Correlation'] = corr_with_target['Correlation'].abs()
corr_with_target = corr_with_target.sort_values('Absolute_Correlation', ascending=False)

# 3. Top 10 Features Correlated with Target
plt.figure(figsize=(10, 6))
# Re-sort the ten strongest by signed value so the bars run from negative to positive
top_features = corr_with_target.head(10).sort_values('Correlation', ascending=True)
colors = ['red' if x < 0 else 'green' for x in top_features['Correlation']]
top_features['Correlation'].plot(kind='barh', color=colors)
plt.title('Top 10 Features Correlated with Fertilizer Type', fontsize=14)
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.axvline(x=0, color='black', linestyle='--', linewidth=0.5)
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Add correlation values on bars
for i, v in enumerate(top_features['Correlation']):
    plt.text(v, i, f"{v:.2f}", color='black', ha='left' if v < 0 else 'right', va='center')

plt.tight_layout()
plt.show()

# 4. Detailed Correlation Table
print("\nDetailed Correlation with Fertilizer Name:")
display(corr_with_target.style.background_gradient(cmap='coolwarm', vmin=-1, vmax=1)
                           .format("{:.2f}")
                           .set_caption("Feature Correlations with Target"))

# 5. Feature-Target Relationship Visualization
# Box plot for numerical features, grouped count plot for categorical ones
top_3_features = corr_with_target.index[:3]
plt.figure(figsize=(15, 5))
for i, feature in enumerate(top_3_features, 1):
    plt.subplot(1, 3, i)
    if feature in num_cols:
        sns.boxplot(x='Fertilizer Name', y=feature, data=train, palette='viridis')
    else:
        sns.countplot(x='Fertilizer Name', hue=feature, data=train, palette='viridis')
    plt.title(f"{feature} by Fertilizer Type", fontsize=12)
    plt.xticks(rotation=45)
    if i != 1:
        plt.ylabel('')
plt.tight_layout()
plt.show()

# 6. Correlation Interpretation
# The table is sorted by |r|, so its first/last rows are the strongest/weakest
# correlations regardless of sign; look the signed extremes up explicitly.
most_positive = corr_with_target['Correlation'].idxmax()
most_negative = corr_with_target['Correlation'].idxmin()
print("\n" + "="*60)
print("Key Insights from Correlation Analysis")
print("="*60)
print(f"\nMost Positively Correlated: {most_positive} (r = {corr_with_target.loc[most_positive, 'Correlation']:.2f})")
print(f"Most Negatively Correlated: {most_negative} (r = {corr_with_target.loc[most_negative, 'Correlation']:.2f})")
print("\nRecommendations:")
print("- Prioritize features with |r| > 0.3 in modeling")
print("- Investigate strongly negative correlations for potential inverse relationships")
print("- Consider feature interactions for top correlated pairs")

# 3: Feature Engineering and Preprocessing

In [None]:
# Target vector, and the predictor frame without the 'id' and target columns
y = train['Fertilizer Name']
X = train.drop(['id', 'Fertilizer Name'], axis=1)

# Encode labels: fit the encoder on the target, then map it to integer codes
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Create quantile bin encoding function
def quantile_bin_encode(df, cols, q=5, labels=['very low', 'low', 'medium', 'high', 'very high']):
    """Return a copy of ``df`` with an ordinal ``<col>_bin`` column per entry of ``cols``.

    Every listed column is cut into ``q`` quantile bins with ``pd.qcut``; each
    bin label is then replaced by its position in ``labels`` (0 for the first
    label) and stored as ``int64``. The input frame itself is left untouched.

    Args:
        df: Source DataFrame.
        cols: Names of the columns to bin.
        q: Number of quantile bins passed to ``pd.qcut``.
        labels: Bin labels, ordered from the lowest bin to the highest.

    Returns:
        A new DataFrame containing the original columns plus the ``*_bin`` columns.
    """
    result = df.copy()

    # Position of each label in `labels`; the same mapping serves every column.
    ordinal_of = dict(zip(labels, range(len(labels))))

    for name in cols:
        quantile_labels = pd.qcut(result[name], q=q, labels=labels)
        result[f"{name}_bin"] = quantile_labels.map(ordinal_of).astype('int64')

    return result

# Apply binning to datasets
# NOTE(review): every frame is binned with its own quantile edges, so the same raw
# value can fall into different bins in train, test and original — confirm this
# is intended, or derive the edges from the train set and reuse them.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
X_binned = quantile_bin_encode(X, numerical_cols)
test_binned = quantile_bin_encode(test, numerical_cols)

# Data Augmentation
# Up-weight the original dataset by repeating its rows. The repetition is
# applied to the *binned* frame, because `original_binned` is what the training
# cell consumes. `original` itself is left unchanged, so re-running this cell
# does not keep growing it.
n = 6  # Number of extra copies appended to the original (n + 1 repetitions in total)
original_binned_once = quantile_bin_encode(original, numerical_cols)
original_binned = pd.concat([original_binned_once] * (n + 1), axis=0, ignore_index=True)

# 4: Model Training and Evaluation

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder

# Define mapk function
def mapk(actual, predicted, k=3):
    """
    Mean average precision at k (MAP@k) for single-label targets.

    For every sample the score is 1 / rank of the true label within the first
    ``k`` predictions (rank starts at 1), or 0 when the label is not among them.

    Args:
        actual (list): True label of each sample
        predicted (list): Per-sample list of predicted labels, best guess first
        k (int): Number of leading predictions to consider

    Returns:
        float: Mean of the per-sample scores
    """
    per_sample_scores = []
    for idx in range(len(actual)):
        truth = actual[idx]
        shortlist = predicted[idx][:k]
        if truth in shortlist:
            per_sample_scores.append(1.0 / (shortlist.index(truth) + 1))
        else:
            per_sample_scores.append(0.0)
    return np.mean(per_sample_scores)

# 5-fold stratified cross-validation of a CatBoost multiclass model.
# Relies on `X_binned`, `y_encoded`, `original_binned`, `le` and `mapk` from earlier cells.

# Store scores and initialize lists
f1_scores = []
map3_scores = []
models = []
all_y_true = []
all_y_pred = []

# Prepare K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The original-dataset rows are appended to every fold's training split, so
# separate their features and encoded labels once, outside the loop.
original_features = original_binned.drop(columns=['Fertilizer Name'])
original_labels = le.transform(original_binned['Fertilizer Name'])

for fold, (train_idx, val_idx) in enumerate(skf.split(X_binned, y_encoded)):
    print(f"\n***** Fold {fold + 1} *****")

    # Prepare fold data: the training split is extended with the original data,
    # the validation split contains competition rows only.
    X_train = pd.concat([X_binned.iloc[train_idx], original_features], ignore_index=True)
    y_train = np.concatenate([y_encoded[train_idx], original_labels])
    X_val = X_binned.iloc[val_idx].copy()
    y_val = y_encoded[val_idx]

    # Convert string features to category so they can be passed as cat_features
    for frame in (X_train, X_val):
        for col in frame.select_dtypes(include='object').columns:
            frame[col] = frame[col].astype('category')

    cat_features = X_train.select_dtypes(include='category').columns.tolist()

    # Initialize CatBoost model
    model = CatBoostClassifier(
        iterations=10000,
        depth=6,
        learning_rate=0.03,
        early_stopping_rounds=100,
        task_type="GPU",  # needs a GPU runtime
        loss_function="MultiClass",
        eval_metric="MultiClass",
        l2_leaf_reg=0.86,
        bootstrap_type="Bayesian",
        bagging_temperature=0.5,
        random_strength=2.65,
        border_count=124,
        verbose=500
    )

    # Train model; the validation fold doubles as the early-stopping set
    model.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val))

    # Make predictions. For MultiClass, predict() returns an (n, 1) column, so
    # flatten it to get one scalar label per row.
    y_pred = np.ravel(model.predict(X_val))
    y_probs = model.predict_proba(X_val)

    # Store predictions and true labels
    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred)

    # Calculate metrics
    f1_macro = f1_score(y_val, y_pred, average='macro')
    f1_scores.append(f1_macro)

    # MAP@3 calculation: indices of the three highest probabilities, best first
    top3_preds = np.argsort(y_probs, axis=1)[:, -3:][:, ::-1]
    map3 = mapk(y_val.tolist(), top3_preds.tolist(), k=3)
    map3_scores.append(map3)
    models.append(model)

    print(f"F1 (macro): {f1_macro:.4f} | MAP@3: {map3:.4f}")

# Final Results
print("\n***** Final CV Results *****")
print(f"Avg F1: {np.mean(f1_scores):.4f}")
print(f"Avg MAP@3: {np.mean(map3_scores):.4f}")

# 5: Prediction and Submission

In [None]:
# Prepare test data: drop the identifier and give the categorical features the
# dtype used during training. Relies on `test_binned`, `cat_features`, `models`,
# `le` and `test` from earlier cells.
X_test = test_binned.drop(columns='id')
for col in cat_features:
    X_test[col] = X_test[col].astype('category')

# Make predictions: accumulate the class probabilities of every fold model
all_preds = np.zeros((X_test.shape[0], len(le.classes_)))
for model in models:
    all_preds += model.predict_proba(X_test)

avg_preds = all_preds / len(models)

# Rank on the fold-averaged probabilities (not on the last model's output):
# take the three most probable classes per row, most probable first.
top3_preds = np.argsort(avg_preds, axis=1)[:, -3:][:, ::-1]
top3_labels = le.inverse_transform(top3_preds.ravel()).reshape(top3_preds.shape)

# Create submission: the three labels of each row joined by single spaces
submission = pd.DataFrame({
    'id': test['id'],
    'Fertilizer Name': [' '.join(row) for row in top3_labels]
})

submission.to_csv('submission.csv', index=False)
submission.head()