In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# > Overview of the Data

In [None]:
# Path of the training CSV under the Kaggle input directory.
train_csv_path = "/kaggle/input/playground-series-s5e6/train.csv"
train_df = pd.read_csv(train_csv_path)

In [None]:
# Quick structural overview: shape first, then dtypes / non-null counts.
print(f"Dataset Shape: {train_df.shape}")

print("\nData Info:")
train_df.info()

# Remaining sections share the same "heading, then rich table" layout,
# so they are driven from a small (heading, frame) table.
overview_sections = (
    ("\nNumerical Features Summary:", train_df.describe()),
    ("\nFirst 10 Rows of the Dataset:", train_df.head(10)),
)
for heading, table in overview_sections:
    print(heading)
    display(table)

# Univariate Analysis

# Numerical Features

In [None]:
# Numerical columns analysed throughout the notebook; each name is used
# verbatim as a key into train_df.
numerical_features = [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
]

# One figure per feature: histogram (with KDE overlay) on the left,
# box plot on the right, followed by skewness and missing-value counts.
for feature in numerical_features:
    values = train_df[feature]
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

    sns.histplot(values, kde=True, bins=30, ax=hist_ax)
    hist_ax.set_title(f"Histogram of {feature}")
    hist_ax.set_xlabel(feature)
    hist_ax.set_ylabel("Frequency")

    sns.boxplot(x=values, ax=box_ax)
    box_ax.set_title(f"Box Plot of {feature}")

    fig.tight_layout()
    plt.show()

    print(f"\nStatistics for {feature}:")
    print(f"Skewness: {train_df[feature].skew():.2f}")
    print(f"Number of Missing Values: {train_df[feature].isnull().sum()}")

# Categorical Features

In [None]:
# For each categorical column: pie chart of category shares, then the
# number of distinct categories and the number of missing entries.
for feature in ("Soil Type", "Crop Type"):
    counts = train_df[feature].value_counts()

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=90)
    ax.set_title(f"Distribution of {feature}")
    ax.axis("equal")  # keep the pie circular
    plt.show()

    print(f"Number of Unique {feature}: {train_df[feature].nunique()}")
    print(f"Missing Values in {feature}: {train_df[feature].isnull().sum()}")


# KDE plot

In [None]:
# One distinct hue per numerical feature.
colors = sns.color_palette('husl', len(numerical_features))

# Four panels per row; integer ceiling of len / 4.
rows = (len(numerical_features) + 3) // 4
fig = plt.figure(figsize=(20, 5 * rows))

# Panels are added one at a time so only as many axes as features are drawn.
for i, (col, color) in enumerate(zip(numerical_features, colors), start=1):
    ax = fig.add_subplot(rows, 4, i)
    sns.kdeplot(data=train_df, x=col, fill=True, color=color, ax=ax)
    ax.set_title(f'KDE Plot of {col}', fontsize=14, color=color)
    ax.set_xlabel(col)
    ax.set_ylabel('Density')

fig.tight_layout()
plt.show()

# Scatter plot

In [None]:
# Pairwise scatter plots of the numerical features.
#
# The grid is restricted to `numerical_features` (the same list every other
# cell uses) instead of "every numeric dtype column". Selecting by dtype would
# also pull in any identifier-like numeric column (e.g. a row id, if the CSV
# has one), adding panels with no analytical meaning. If the frame has no
# other numeric columns, the result is identical to the dtype-based selection.
numeric_df = train_df[numerical_features]

# corner=True draws only the lower triangle; alpha softens over-plotting.
sns.pairplot(numeric_df, corner=True, plot_kws={'alpha': 0.5})
plt.suptitle('Pairwise Scatter Plots', y=1.02)  # y > 1 lifts the title above the grid
plt.show()

# Bivariate Analysis

# Numerical Features vs Label

In [None]:
# Scatter of each numerical feature against the target label.
# Iterate over the full list: the previous `[:-1]` slice silently skipped the
# last entry ("Phosphorous"), even though the correlation matrix below and
# every other cell treat it as a regular numerical feature.
for feature in numerical_features:
    plt.figure(figsize=(8, 6))
    sns.scatterplot(
        x=train_df[feature], y=train_df["Fertilizer Name"], alpha=0.5
    )
    plt.title(f"{feature} vs. Fertilizer Name")
    plt.xlabel(feature)
    plt.ylabel("Fertilizer Name")
    plt.show()

# Pairwise correlation between the numerical features, annotated to two
# decimals on a diverging colour map.
correlation_matrix = train_df[numerical_features].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix of Numerical Features")
plt.show()

# Categorical Features vs Label

In [None]:
# Grouped bar chart: number of rows per Soil Type, split by fertilizer label.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=train_df, x="Soil Type", hue="Fertilizer Name", ax=ax)
ax.set_title("Distribution of Fertilizer Name across Soil Types")
ax.set_xlabel("Soil Type")
ax.set_ylabel("Count")
ax.tick_params(axis="x", labelrotation=45)
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(title="Fertilizer Name", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()


In [None]:
# Contingency table of Soil Type (rows) against Fertilizer Name (columns),
# rendered as an annotated heatmap of raw counts.
# seaborn and pandas are already imported in the notebook's first cell, so
# the redundant mid-notebook re-imports were dropped; this cell relied on that
# top cell for `plt` anyway.
cross_tab = pd.crosstab(train_df["Soil Type"], train_df["Fertilizer Name"])

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cross_tab, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set_title("Soil Type vs. Fertilizer Name (Counts)")
ax.set_ylabel("Soil Type")
ax.set_xlabel("Fertilizer Name")
ax.tick_params(axis="x", labelrotation=45)
ax.tick_params(axis="y", labelrotation=0)  # keep row labels horizontal
fig.tight_layout()
plt.show()


In [None]:
# Grouped bar chart: number of rows per Crop Type, split by fertilizer label.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=train_df, x="Crop Type", hue="Fertilizer Name", ax=ax)
ax.set_title("Distribution of Fertilizer Name across Crop Types")
ax.set_xlabel("Crop Type")
ax.set_ylabel("Count")
ax.tick_params(axis="x", labelrotation=45)
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(title="Fertilizer Name", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()


In [None]:
# Contingency table of Crop Type (rows) against Fertilizer Name (columns),
# rendered as an annotated heatmap of raw counts.
cross_tab = pd.crosstab(train_df["Crop Type"], train_df["Fertilizer Name"])

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cross_tab, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set_title("Crop Type vs. Fertilizer Name (Counts)")
ax.set_ylabel("Crop Type")
ax.set_xlabel("Fertilizer Name")
ax.tick_params(axis="x", labelrotation=45)
ax.tick_params(axis="y", labelrotation=0)  # keep row labels horizontal
fig.tight_layout()
plt.show()


# Trend plot

In [None]:
# One distinct hue per numerical feature.
colors = sns.color_palette('husl', len(numerical_features))

# Four panels per row; integer ceiling of len / 4.
rows = (len(numerical_features) + 3) // 4
fig = plt.figure(figsize=(20, 5 * rows))

# Each panel plots the raw column against the DataFrame index (row order).
for i, (col, color) in enumerate(zip(numerical_features, colors), start=1):
    ax = fig.add_subplot(rows, 4, i)
    sns.lineplot(data=train_df[col], color=color, ax=ax)
    ax.set_title(f'Trend Plot of {col}', fontsize=14, color=color)
    ax.set_xlabel('Index')
    ax.set_ylabel(col)

fig.tight_layout()
plt.show()

# KDE + Trend

In [None]:
# KDE of each numerical feature with its cumulative trend overlaid.
#
# The sorted-values line used to be drawn on the same axes as the KDE, with
# x = row rank and y = feature value. The KDE uses x = feature value and
# y = density, so the shared axes stretched to the row count, the density
# curve collapsed into a sliver near the origin, and both axis labels were
# wrong. The cumulative curve (the same sorted-values information, expressed
# as proportion of rows <= value) is now drawn on a twin y-axis that shares
# the feature-value x-axis, so both curves stay readable.
colors = sns.color_palette('husl', len(numerical_features))
rows = -(-len(numerical_features) // 4)  # ceiling division: 4 panels per row
fig = plt.figure(figsize=(20, 5 * rows))

for i, (col, color) in enumerate(zip(numerical_features, colors), 1):
    ax = fig.add_subplot(rows, 4, i)
    sns.kdeplot(data=train_df, x=col, fill=True, color=color, ax=ax)
    ax.set_title(f'KDE + Trend of {col}', fontsize=14, color=color)
    ax.set_xlabel(col)
    ax.set_ylabel('Density')

    # Secondary y-axis (0..1) for the empirical cumulative distribution.
    cdf_ax = ax.twinx()
    sns.ecdfplot(data=train_df, x=col, color='black', linewidth=1, ax=cdf_ax)
    cdf_ax.set_ylabel('Cumulative proportion')

fig.tight_layout()
plt.show()

# Violin Plot

In [None]:
# One distinct hue per numerical feature.
colors = sns.color_palette('husl', len(numerical_features))

# Four panels per row; integer ceiling of len / 4.
rows = (len(numerical_features) + 3) // 4
fig = plt.figure(figsize=(20, 5 * rows))

# Vertical violin per feature; the x-axis carries no variable, so its label is blanked.
for i, (col, color) in enumerate(zip(numerical_features, colors), start=1):
    ax = fig.add_subplot(rows, 4, i)
    sns.violinplot(data=train_df, y=col, color=color, ax=ax)
    ax.set_title(f'Violin Plot of {col}', fontsize=14, color=color)
    ax.set_xlabel('')
    ax.set_ylabel(col)

fig.tight_layout()
plt.show()


# Thanks for visiting. If you find any useful insights, please consider upvoting!