In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Load the dataset ---
# Single source of truth for the input path, so the read call and the error
# message below can never name different files.
DATA_PATH = 'data/crop_yield.csv'

print("--- Loading the dataset ---")
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: '{DATA_PATH}' not found. Please ensure the file is in the 'data' directory.")
    # Raise SystemExit rather than calling the interactive exit() helper:
    # exit() is not guaranteed to halt execution in every environment (the
    # recorded run of this script carried on and failed with a NameError on
    # `df`). A non-zero status also marks the run as failed.
    raise SystemExit(1)
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist.
    print("Dataset loaded successfully!")
    print(f"Initial dataset shape: {df.shape}")

# Quick look at the raw frame before any cleaning is applied.
print("\n--- Initial Data Inspection (First 5 rows) ---")
preview = df.head()
print(preview)

# info() emits its own report, so it is called directly rather than printed.
print("\n--- Data Information (Data Types and Non-Null Counts) ---")
df.info()

# Summary statistics for the numeric columns.
print("\n--- Descriptive Statistics for Numerical Columns ---")
summary_stats = df.describe()
print(summary_stats)

# --- 2. Perform initial data cleaning and preprocessing ---

# Report which columns contain gaps before any rows are removed.
print("\n--- Checking for missing values ---")
missing_values_before = df.isnull().sum()
has_gaps_before = missing_values_before > 0
print("Missing values before dropping:")
print(missing_values_before[has_gaps_before])

# Cleaning policy: discard every row that has at least one missing value.
df.dropna(inplace=True)
print(f"\nDataset shape after dropping rows with missing values: {df.shape}")

# Columns expected to hold numeric measurements. pandas usually infers these
# dtypes correctly, but each one is converted explicitly; values that cannot
# be parsed become NaN (errors='coerce') instead of raising.
numerical_cols = ['Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide', 'Yield']
columns_to_convert = [name for name in numerical_cols if name in df.columns]
for col in columns_to_convert:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Any NaN present now was introduced by the coercion above; report and drop
# the affected rows.
missing_values_after_coerce = df.isnull().sum()
coerced_gaps = missing_values_after_coerce[missing_values_after_coerce > 0]
if not coerced_gaps.empty:
    print("\n--- Missing values created after coercing to numeric types ---")
    print(coerced_gaps)
    df.dropna(inplace=True)
    print(f"Dataset shape after dropping new NaNs: {df.shape}")


# --- 3. Identify Features and Target Variable ---
# Response variable the model will predict.
target = 'Yield'

# Predictors: a numerical subset only. Categorical columns such as 'Crop',
# 'Season' and 'State' would need one-hot encoding before they could be used
# in a linear regression, so they are left out of this first pass.
features = [
    'Annual_Rainfall',
    'Fertilizer',
    'Pesticide',
    'Area',
]

print(f"\nSelected Features for the model: {features}")
print(f"Selected Target Variable: {target}")

# --- 4. Begin exploratory data analysis (EDA) ---
print("\n--- Performing Exploratory Data Analysis ---")

# Set aesthetic style for plots
sns.set_style("whitegrid")

# Number of panels per row. The row count of each grid is derived from the
# number of panels, so extending `features` or `numerical_cols` cannot
# overflow a fixed grid.
GRID_COLS = 3

# Histograms (with KDE overlay): the target first, then each selected feature.
dist_cols = [target] + features
dist_rows = int(np.ceil(len(dist_cols) / GRID_COLS))
plt.figure(figsize=(15, 10))
for i, col in enumerate(dist_cols, start=1):
    plt.subplot(dist_rows, GRID_COLS, i)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

# Only use the numerical columns that are actually present in the frame,
# mirroring the `if col in df.columns` guard applied when they were coerced.
present_numeric_cols = [c for c in numerical_cols if c in df.columns]

# Box plots to check for outliers in numerical columns
box_rows = int(np.ceil(len(present_numeric_cols) / GRID_COLS))
plt.figure(figsize=(15, 8))
for i, col in enumerate(present_numeric_cols, start=1):
    plt.subplot(box_rows, GRID_COLS, i)
    sns.boxplot(y=df[col])
    plt.title(f'Box Plot of {col}')
plt.tight_layout()
plt.show()


# Correlation Matrix Heatmap
print("\n--- Correlation Matrix Heatmap ---")
# Select only numerical columns for correlation calculation
df_numeric = df[present_numeric_cols]
correlation_matrix = df_numeric.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

# Rank every numerical column by its correlation with the target.
print("\n--- Correlation with Target Variable (Yield) ---")
print(correlation_matrix[target].sort_values(ascending=False))

print("\nEDA complete. Ready for model training.")

--- Loading the dataset ---
Error: 'data/crop_yield 2.csv' not found. Please ensure the file is in the 'data' directory.

--- Initial Data Inspection (First 5 rows) ---


NameError: name 'df' is not defined

: 