<a href="https://colab.research.google.com/github/Purva0210/upskillcampus/blob/main/CropProductionPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import zipfile
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

#STEP 1: File paths
# Outer archive to read and the directory its contents are unpacked into.
main_zip_path = '/content/Project4_Ag_Prediction of Agriculture Crop Production In India.zip'
extract_dir = '/content/extracted_data'
os.makedirs(extract_dir, exist_ok=True)

#STEP 2: Extract main ZIP
print("Extracting main ZIP file...")

# ZipFile opens in read mode by default; the context manager closes the
# archive even if extraction fails part-way.
with zipfile.ZipFile(main_zip_path) as outer_archive:
    outer_archive.extractall(path=extract_dir)

print("Main ZIP extracted successfully")

#STEP 3: Find inner ZIP
# Walk the extracted tree and stop at the FIRST nested archive found.
# Directory and file names are visited in sorted order so the same archive is
# picked on every run, and the '.zip' suffix is matched case-insensitively.
print("Searching for inner ZIP file...")

inner_zip = None
for root, dirs, files in os.walk(extract_dir):
    # os.walk (top-down) honours in-place edits to `dirs`, so sorting here
    # fixes the order in which sub-directories are visited.
    dirs.sort()
    zip_names = sorted(name for name in files if name.lower().endswith('.zip'))
    if zip_names:
        inner_zip = os.path.join(root, zip_names[0])
        print("Found inner ZIP:", inner_zip)
        # Only one loop level remains, so this break really ends the search
        # instead of letting a later directory overwrite the result.
        break

# Fail loudly when the outer archive did not contain a nested ZIP.
if inner_zip is None:
    raise FileNotFoundError("No inner ZIP file found!")

#STEP 4: Extract inner ZIP
# The nested archive is unpacked into its own 'final_data' sub-folder of the
# extraction directory.
final_dir = os.path.join(extract_dir, 'final_data')
os.makedirs(final_dir, exist_ok=True)

# Read mode is the ZipFile default; the archive is closed on leaving the block.
with zipfile.ZipFile(inner_zip) as inner_archive:
    inner_archive.extractall(path=final_dir)

print("Inner ZIP extracted successfully")


#STEP 5: Find CSV files
# Only the top level of final_dir is listed.  The names are sorted because
# os.listdir returns entries in arbitrary order, which would otherwise make
# the menu numbering differ between runs/machines.
csv_files = sorted(f for f in os.listdir(final_dir) if f.lower().endswith('.csv'))

if not csv_files:
    raise FileNotFoundError(f"No CSV files found in '{final_dir}'.")

print("\nAvailable CSV files:")
for i, f in enumerate(csv_files, 1):
    print(f"{i}. {f}")

# User choice: the menu is 1-based, the list index is 0-based.
raw_choice = input("\nEnter the number of the CSV file to use: ").strip()
try:
    choice = int(raw_choice) - 1
except ValueError:
    raise ValueError(
        f"Invalid selection '{raw_choice}': enter a number between 1 and {len(csv_files)}."
    ) from None

# Reject 0 and negatives explicitly: a negative index would otherwise silently
# select a file counted from the END of the list.
if not 0 <= choice < len(csv_files):
    raise IndexError(
        f"Selection {choice + 1} is out of range: enter a number between 1 and {len(csv_files)}."
    )

selected_csv = csv_files[choice]
print(f"\nYou selected: {selected_csv}")

#STEP 6: Load CSV
file_path = os.path.join(final_dir, selected_csv)
df = pd.read_csv(file_path)

# Show the raw data and its original column labels before any normalisation.
# NOTE(review): `display` is not imported in this cell; presumably it is the
# notebook (IPython) built-in — confirm before running as a plain script.
print("\nDataset Preview:")
display(df.head())

print("\nDataset Columns:")
print(df.columns)

#STEP 7: Initial Data Preprocessing
# Normalise column labels (trim whitespace, lower-case) so later look-ups by
# name do not depend on the CSV's exact header formatting.
df.columns = df.columns.str.strip().str.lower()
# Drop columns whose label contains 'unnamed' (e.g. the 'Unnamed: 0' column
# visible in the preview).  The explicit .copy() makes `df` an independent
# frame, so later in-place column assignments on it do not trigger pandas'
# SettingWithCopyWarning.
df = df.loc[:, ~df.columns.str.contains('unnamed')].copy()

#STEP 8: Auto Model Builder (cleaning and X / y preparation based on file type)
# Builds the feature matrix `X` and target `y` for the selected CSV.
# File names are compared case-insensitively everywhere in this step, so the
# branch selection and the target-column lookup can never disagree.
csv_key = selected_csv.lower()

# Target column for each supported 'datafile' CSV (column names are already
# lower-cased/stripped by the preprocessing step).  None marks a file that is
# recognised but has no usable prediction target.
datafile_targets = {
    'datafile.csv': '2009-10',
    'datafile (1).csv': 'yield (quintal/ hectare)',
    'datafile (2).csv': 'production 2009-10',
    'datafile (3).csv': None,
}

if csv_key == 'produce.csv':
    print("\nDetected: Time-series style dataset ('produce.csv')")

    # Year columns are recognised by their '3-' prefix; every other column is
    # treated as categorical.
    numeric_year_cols = [col for col in df.columns if col.startswith('3-')]
    if not numeric_year_cols:
        # Without this guard the [-1] lookups below fail with a bare IndexError.
        raise ValueError(
            f"No year columns (prefix '3-') found in '{selected_csv}'. "
            f"Available columns: {df.columns.tolist()}"
        )
    categorical_cols = [col for col in df.columns if col not in numeric_year_cols]

    # Convert year columns to numbers; unparseable cells become NaN.
    for col in numeric_year_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Row-wise forward fill: a gap takes the value of the nearest earlier year.
    # Leading gaps (no earlier value in the row) stay NaN.
    # NOTE(review): this can also fill the target (last year) from the previous
    # year, making it equal to a feature for those rows — confirm this is intended.
    df[numeric_year_cols] = df[numeric_year_cols].ffill(axis=1)

    # The last year column is the prediction target 'y'.
    target_col = numeric_year_cols[-1]

    # Drop rows still missing the target, or the 'particulars' identifier
    # when that column exists.
    required_cols = [target_col]
    if 'particulars' in categorical_cols:
        required_cols.append('particulars')
    df_cleaned = df.dropna(subset=required_cols, how='any')

    if df_cleaned.empty:
        raise ValueError("DataFrame became empty after cleaning for 'produce.csv'. Consider a different cleaning strategy.")

    y = df_cleaned[target_col]

    # Features 'X': all earlier year columns + one-hot encoded categorical columns.
    X_numeric = df_cleaned[numeric_year_cols[:-1]]
    X_categorical = df_cleaned[categorical_cols]

    # drop_first=True drops one dummy per category to avoid redundant columns.
    X_categorical_encoded = pd.get_dummies(X_categorical, drop_first=True)

    X = pd.concat([X_numeric, X_categorical_encoded], axis=1)


elif csv_key in datafile_targets:
    print(f"\nDetected: Standard numeric/mixed dataset ('{selected_csv}')")

    # Drop every row that has a NaN in any column.
    df_cleaned = df.dropna()

    if df_cleaned.empty:
        raise ValueError(f"DataFrame became empty after dropping NaN values from '{selected_csv}'. Consider a different cleaning strategy.")

    # One-hot encode the text/categorical columns; numeric columns pass through.
    categorical_cols = df_cleaned.select_dtypes(include=['object', 'category']).columns
    df_encoded = pd.get_dummies(df_cleaned, columns=categorical_cols, drop_first=True)

    # Look up the target for this file (case-insensitive via csv_key).
    target_col = datafile_targets[csv_key]
    if target_col is None:
        # Recognised file, but no column has been designated as a target.
        raise ValueError(f"Selected file '{selected_csv}' does not have a recognized target column for prediction. Available columns: {df_encoded.columns.tolist()}")

    if target_col not in df_encoded.columns:
        raise ValueError(f"Target column '{target_col}' not found! Available columns: {df_encoded.columns.tolist()}")

    # NOTE(review): for 'datafile.csv' the remaining year columns (including
    # years after 2009-10) stay in X as features — confirm this is intended.
    X = df_encoded.drop(columns=[target_col])
    y = df_encoded[target_col]

else:
    # Any other CSV: no modelling recipe exists, so show the data and stop.
    print("\nUnsupported dataset type. Machine Learning not possible.")
    print("Showing cleaned dataset instead:")
    print(df.head())
    raise SystemExit


# Final sanity check before splitting: both the features and the target must
# still contain data after preprocessing.
if any(part.empty for part in (X, y)):
    raise ValueError("Features (X) or Target (y) are empty after preprocessing. Cannot train model.")

# Replace any NaNs still present in the feature matrix with 0.
X = X.fillna(0)

#STEP 9: Train/test split
# 20% of the rows are held out for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#STEP 10: Model Training and Evaluation
model = LinearRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)
# For a regressor, `score` is the coefficient of determination (R^2) on the
# held-out rows; the name `accuracy` and the printed label are kept as-is.
accuracy = model.score(X_test, y_test)

print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nSample Predictions (rounded):")
print(predictions[:10].round(4))


Extracting main ZIP file...
Main ZIP extracted successfully
Searching for inner ZIP file...
Found inner ZIP: /content/extracted_data/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip
Inner ZIP extracted successfully

Available CSV files:
1. produce.csv
2. datafile (1).csv
3. datafile (2).csv
4. datafile (3).csv
5. datafile.csv

Enter the number of the CSV file to use: 5

You selected: datafile.csv

Dataset Preview:


Unnamed: 0,Crop,2004-05,2005-06,2006-07,2007-08,2008-09,2009-10,2010-11,2011-12
0,Rice,100.0,101.0,99.0,105.0,112.0,121.0,117.0,110.0
1,Wheat,100.0,101.0,112.0,115.0,117.0,127.0,120.0,108.0
2,Coarse Cereals,100.0,107.0,110.0,115.0,113.0,123.0,122.0,136.0
3,Pulses,100.0,108.0,134.0,124.0,124.0,146.0,137.0,129.0
4,Vegetables,100.0,109.0,103.0,118.0,113.0,124.0,128.0,115.0



Dataset Columns:
Index(['Crop', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09',
       '2009-10', '2010-11', '2011-12'],
      dtype='object')

Detected: Standard numeric/mixed dataset ('datafile.csv')

Model Accuracy: 0.3007

Sample Predictions (rounded):
[108.2461 100.1593 106.5931]
