# 🌾 Data Exploration and Cleaning — Project Samarth

In [None]:

# Step 6B: Clean and normalize rainfall data
#
# Reads the `df_rain` frame and the `pd` alias created in earlier cells and
# rebinds `df_rain` to a cleaned copy:
#   * column names lower-cased,
#   * `annual` renamed to `annual_rainfall`,
#   * rows lacking a subdivision, a usable year, or an annual total removed,
#   * `year` stored as an integer.
print("Cleaning rainfall data...")
print("=" * 60)

# Convert columns to lowercase so every later lookup can use one spelling.
df_rain.columns = [col.lower() for col in df_rain.columns]

# Rename for consistency. Only `annual` changes; `subdivision` and `year`
# already carry their final names once lower-cased.
df_rain.rename(columns={"annual": "annual_rainfall"}, inplace=True)

# Coerce the year BEFORE dropping missing rows. `errors="coerce"` turns any
# unparseable year into NaN; doing this after the drop (as before) left those
# NaNs in place and made the integer cast below raise.
df_rain["year"] = pd.to_numeric(df_rain["year"], errors="coerce")

# Drop rows with missing subdivision, year, or annual rainfall.
# `.copy()` gives an independent frame so the assignment below is unambiguous.
df_rain = df_rain.dropna(subset=["subdivision", "year", "annual_rainfall"]).copy()

# Safe now: no NaN remains in `year`.
df_rain["year"] = df_rain["year"].astype(int)

print("âœ… Cleaned rainfall data shape:", df_rain.shape)
# `subdivision` has no missing values at this point, so nunique() equals
# the length of unique().
print("Unique subdivisions found:", df_rain["subdivision"].nunique())
print(df_rain["subdivision"].unique()[:10])

# Summary statistics for the annual total and the jf / mam / jjas / ond columns
print("\nRainfall Summary:")
display(df_rain[["annual_rainfall", "jf", "mam", "jjas", "ond"]].describe())


In [None]:

# Step 7: Merge crop and rainfall data
#
# Reads `df_crop` and `df_rain` from earlier cells. Adds two columns to
# `df_crop` (`subdivision`, `year_cleaned`) and builds `df_merged`, an inner
# join of crop rows with the annual rainfall of the mapped subdivision for
# the same year.
print("Merging crop and rainfall data...")
print("=" * 60)

# Crop data is keyed by state, rainfall data by subdivision. Each state is
# mapped to exactly one subdivision name here.
# NOTE(review): the values must match the spelling used in
# df_rain["subdivision"] exactly; mismatches are reported below - confirm
# against the rainfall source.
state_to_subdivision = {
    "Andaman and Nicobar Islands": "Andaman & Nicobar Islands",
    "Arunachal Pradesh": "Arunachal Pradesh",
    "Assam": "Assam & Meghalaya",
    "Meghalaya": "Assam & Meghalaya",
    "Nagaland": "Naga Mani Mizo Tripura",
    "Manipur": "Naga Mani Mizo Tripura",
    "Mizoram": "Naga Mani Mizo Tripura",
    "Tripura": "Naga Mani Mizo Tripura",
    "West Bengal": "Gangetic West Bengal",
    "Sikkim": "Sub Himalayan West Bengal & Sikkim",
    "Odisha": "Orissa",
    "Jharkhand": "Jharkhand",
    "Bihar": "Bihar",
    "Uttar Pradesh": "East Uttar Pradesh",
    "Uttarakhand": "West Uttar Pradesh",
    "Haryana": "Haryana Delhi Chandigarh",
    "Delhi": "Haryana Delhi Chandigarh",
    "Punjab": "Punjab",
    "Himachal Pradesh": "Himachal Pradesh",
    "Jammu and Kashmir": "Jammu & Kashmir",
    "Chhattisgarh": "Chhattisgarh",
    "Madhya Pradesh": "West Madhya Pradesh",
    "Gujarat": "Saurashtra Kutch & Diu",
    "Maharashtra": "Konkan & Goa",
    "Goa": "Konkan & Goa",
    "Andhra Pradesh": "Coastal Andhra Pradesh",
    "Telangana": "Telangana",
    "Karnataka": "South Interior Karnataka",
    "Kerala": "Kerala",
    "Tamil Nadu": "Tamil Nadu",
    "Rajasthan": "East Rajasthan",
}

# States absent from the mapping become NaN here.
df_crop["subdivision"] = df_crop["state"].map(state_to_subdivision)

# The inner merge below silently discards every row whose key has no
# partner, so report the two ways a whole state can vanish.
unmapped_states = (
    df_crop.loc[df_crop["subdivision"].isna(), "state"].dropna().unique()
)
if len(unmapped_states):
    print(
        "Warning: states without a subdivision mapping (excluded from merge):",
        sorted(map(str, unmapped_states)),
    )

absent_subdivisions = sorted(
    set(df_crop["subdivision"].dropna().unique()) - set(df_rain["subdivision"])
)
if absent_subdivisions:
    print(
        "Warning: mapped subdivisions not found in rainfall data (excluded from merge):",
        absent_subdivisions,
    )

# Pull the first four-digit run out of `year`.
# expand=False yields a Series rather than a one-column DataFrame. Values
# with no four-digit run become <NA> in a nullable integer column instead
# of crashing a plain int cast.
year_text = df_crop["year"].astype(str).str.extract(r"(\d{4})", expand=False)
df_crop["year_cleaned"] = pd.to_numeric(year_text, errors="coerce").astype("Int64")

# Rows without a usable key cannot match a rainfall record. Removing them up
# front lets the year key be a plain integer for the join and prevents
# missing keys on one side from being paired with missing keys on the other.
crop_keyed = df_crop.dropna(subset=["subdivision", "year_cleaned"]).astype(
    {"year_cleaned": int}
)

# Both inputs carry a `year` column, so the result holds `year_x` (crop) and
# `year_y` (rainfall) next to `year_cleaned`.
df_merged = pd.merge(
    crop_keyed,
    df_rain[["subdivision", "year", "annual_rainfall"]],
    left_on=["subdivision", "year_cleaned"],
    right_on=["subdivision", "year"],
    how="inner",
)

print(f"âœ… Merged dataset shape: {df_merged.shape}")
print(df_merged[["state", "district", "crop", "year_cleaned", "annual_rainfall", "production", "yield"]].head(10))


In [None]:

# Step 8: Analyze relationship between rainfall and crop yield
#
# Draws a scatter plot of `yield` against `annual_rainfall` from `df_merged`
# (built in an earlier cell) and prints the correlation between the two
# columns to three decimals.
import matplotlib.pyplot as plt
import seaborn as sns

print("Analyzing rainfall vs crop yield correlation...")
print("=" * 60)

# One 8x6 inch figure with a single axes; translucent markers keep dense
# regions readable.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df_merged,
    x="annual_rainfall",
    y="yield",
    alpha=0.4,
    ax=ax,
)
ax.set_title("Relationship between Rainfall and Crop Yield")
ax.set_xlabel("Annual Rainfall (mm)")
ax.set_ylabel("Crop Yield (tons/hectare)")
plt.show()

# Series.corr is called with its default method.
rainfall = df_merged["annual_rainfall"]
crop_yield = df_merged["yield"]
corr = rainfall.corr(crop_yield)
print(f"\nCorrelation between annual rainfall and crop yield: {corr:.3f}")
