<a href="https://colab.research.google.com/github/Purva0210/UpSkill/blob/main/UpSkillML1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import zipfile
import os

# Define the path to the main ZIP file
main_zip_path = '/content/Project4_Ag_Prediction of Agriculture Crop Production In India.zip'
extracted_base_dir = '/content/extracted_data' # Use a base directory for all extractions
os.makedirs(extracted_base_dir, exist_ok=True)

# 1. Extract the main ZIP file
print(f"Extracting main ZIP: {main_zip_path} to {extracted_base_dir}")
with zipfile.ZipFile(main_zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_base_dir)

# 2. Locate the inner ZIP file
# Based on the error message, the inner zip is within a directory inside the main extracted folder.
# The structure suggested is: extracted_base_dir/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip
intermediate_dir_name = 'Project4_Ag_Prediction of Agriculture Crop Production In India'
inner_zip_name = 'Project4_Ag_Prediction of Agriculture Crop Production In India.zip'
inner_zip_path = os.path.join(extracted_base_dir, intermediate_dir_name, inner_zip_name)

print(f"Attempting to find inner ZIP at: {inner_zip_path}")
if not os.path.exists(inner_zip_path):
    # This block would execute if the structure is different than expected.
    # For robustness, we could list contents of extracted_base_dir and search for .zip files
    # For now, we'll assume the path is correct as per error message.
    raise FileNotFoundError(f"Error: Inner ZIP file not found at the expected path: {inner_zip_path}. Please check the contents of your ZIP file.")

# 3. Extract the inner ZIP file
inner_extracted_dir = os.path.join(extracted_base_dir, 'final_data')
os.makedirs(inner_extracted_dir, exist_ok=True)
print(f"Extracting inner ZIP: {inner_zip_path} to {inner_extracted_dir}")
with zipfile.ZipFile(inner_zip_path, 'r') as zip_ref:
    zip_ref.extractall(inner_extracted_dir)

# 4. Find the CSV file within the inner extracted directory and load it
csv_files = [f for f in os.listdir(inner_extracted_dir) if f.endswith('.csv')]
df = None # Initialize df
df_file_path = None

if not csv_files:
    raise FileNotFoundError(f"Error: No CSV file found in {inner_extracted_dir}. Contents: {os.listdir(inner_extracted_dir)}")

# Instead of looking for 'production', we load the first CSV and identify a suitable target later
# For simplicity, we'll load the first CSV found, assuming it's the primary dataset.
# In a more complex scenario, we might need to inspect multiple CSVs.

df_file_path = os.path.join(inner_extracted_dir, csv_files[0]) # Load the first CSV
print(f"Loading the first CSV file found: {df_file_path}")
df = pd.read_csv(df_file_path)

if df is None or df.empty:
    raise ValueError("Failed to load a non-empty DataFrame. Check file paths and contents.")

# Ensure column names are standardized to lowercase for easier lookup if needed later
df.columns = df.columns.str.lower()

# Define the target column - based on inspection, '2009-10' appears to be a suitable target for 'production'
target_column = '2009-10'

if target_column not in df.columns:
    raise ValueError(f"The assumed target column '{target_column}' was not found in the dataset. Available columns: {df.columns.tolist()}")

# Clean data (drop rows with any NaN values after identifying the target)
df = df.dropna()

# Separate target and features
y = df[target_column]
X = df.drop(target_column, axis=1)

# Convert text columns in features to dummy variables
# Select only 'object' (string) or 'category' columns for one-hot encoding
X_object_cols = X.select_dtypes(include=['object', 'category']).columns
if not X_object_cols.empty:
    X_encoded = pd.get_dummies(X, columns=X_object_cols, drop_first=True)
else:
    X_encoded = X.copy() # No object columns to encode

# Ensure target 'y' is numerical for LinearRegression
if pd.api.types.is_numeric_dtype(y):
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

    # Train model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Accuracy
    print("Model Accuracy:", model.score(X_test, y_test))
else:
    raise TypeError(f"Target variable '{target_column}' is not numeric (type: {y.dtype}). LinearRegression requires a numeric target.")


Extracting main ZIP: /content/Project4_Ag_Prediction of Agriculture Crop Production In India.zip to /content/extracted_data
Attempting to find inner ZIP at: /content/extracted_data/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip
Extracting inner ZIP: /content/extracted_data/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip to /content/extracted_data/final_data
Loading the first CSV file found: /content/extracted_data/final_data/produce.csv


ValueError: The assumed target column '2009-10' was not found in the dataset. Available columns: ['particulars', 'frequency', 'unit', ' 3-1993', ' 3-1994', ' 3-1995', ' 3-1996', ' 3-1997', ' 3-1998', ' 3-1999', ' 3-2000', ' 3-2001', ' 3-2002', ' 3-2003', ' 3-2004', ' 3-2005', ' 3-2006', ' 3-2007', ' 3-2008', ' 3-2009', ' 3-2010', ' 3-2011', ' 3-2012', ' 3-2013', ' 3-2014']

# Task


## Extract and List CSV Files

### Subtask:
Iterate through the `final_data` directory and print the names of all `.csv` files found, without attempting to load them yet. This will help identify all potential datasets.


**Reasoning**:
The subtask requires printing the names of all CSV files in the `final_data` directory without loading them. The existing code already identifies these files and stores their names in the `csv_files` list. I will remove the DataFrame loading and subsequent processing steps, and add a print statement for the `csv_files` list.



In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import zipfile
import os

# Define the path to the main ZIP file
main_zip_path = '/content/Project4_Ag_Prediction of Agriculture Crop Production In India.zip'
extracted_base_dir = '/content/extracted_data' # Use a base directory for all extractions
os.makedirs(extracted_base_dir, exist_ok=True)

# 1. Extract the main ZIP file
print(f"Extracting main ZIP: {main_zip_path} to {extracted_base_dir}")
with zipfile.ZipFile(main_zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_base_dir)

# 2. Locate the inner ZIP file
# Based on the error message, the inner zip is within a directory inside the main extracted folder.
# The structure suggested is: extracted_base_dir/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip
intermediate_dir_name = 'Project4_Ag_Prediction of Agriculture Crop Production In India'
inner_zip_name = 'Project4_Ag_Prediction of Agriculture Crop Production In India.zip'
inner_zip_path = os.path.join(extracted_base_dir, intermediate_dir_name, inner_zip_name)

print(f"Attempting to find inner ZIP at: {inner_zip_path}")
if not os.path.exists(inner_zip_path):
    # This block would execute if the structure is different than expected.
    # For robustness, we could list contents of extracted_base_dir and search for .zip files
    # For now, we'll assume the path is correct as per error message.
    raise FileNotFoundError(f"Error: Inner ZIP file not found at the expected path: {inner_zip_path}. Please check the contents of your ZIP file.")

# 3. Extract the inner ZIP file
inner_extracted_dir = os.path.join(extracted_base_dir, 'final_data')
os.makedirs(inner_extracted_dir, exist_ok=True)
print(f"Extracting inner ZIP: {inner_zip_path} to {inner_extracted_dir}")
with zipfile.ZipFile(inner_zip_path, 'r') as zip_ref:
    zip_ref.extractall(inner_extracted_dir)

# 4. Find the CSV files within the inner extracted directory
csv_files = [f for f in os.listdir(inner_extracted_dir) if f.endswith('.csv')]

if not csv_files:
    raise FileNotFoundError(f"Error: No CSV file found in {inner_extracted_dir}. Contents: {os.listdir(inner_extracted_dir)}")

# Print the list of CSV files found
print(f"CSV files found in {inner_extracted_dir}: {csv_files}")


Extracting main ZIP: /content/Project4_Ag_Prediction of Agriculture Crop Production In India.zip to /content/extracted_data
Attempting to find inner ZIP at: /content/extracted_data/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip
Extracting inner ZIP: /content/extracted_data/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip to /content/extracted_data/final_data
CSV files found in /content/extracted_data/final_data: ['produce.csv', 'datafile (1).csv', 'datafile (2).csv', 'datafile (3).csv', 'datafile.csv']


## Present Options to User

### Subtask:
Inform the user about the available CSV files and explicitly ask them to provide the name of the CSV file they wish to proceed with for crop production prediction.


The following CSV files were found in the extracted archive: `['produce.csv', 'datafile (1).csv', 'datafile (2).csv', 'datafile (3).csv', 'datafile.csv']`.

Which of these CSV files would you like to use for crop production prediction? Please provide the exact filename (e.g., `produce.csv`).


## Final Task

### Subtask:
Summarize that the CSV files have been presented and the system is awaiting your selection to continue with loading the chosen dataset and building the prediction model.


## Summary:

### Data Analysis Key Findings
*   The primary ZIP archive, `Project4_Ag_Prediction of Agriculture Crop Production In India.zip`, was successfully extracted.
*   An embedded ZIP file, also named `Project4_Ag_Prediction of Agriculture Crop Production In India.zip`, was subsequently located and extracted into a dedicated `final_data` directory.
*   Five distinct CSV files were identified within the `final_data` directory: `produce.csv`, `datafile (1).csv`, `datafile (2).csv`, `datafile (3).csv`, and `datafile.csv`.
*   These identified CSV files have been presented to the user for selection.

### Insights or Next Steps
*   The system is now awaiting user input to select one of the provided CSV filenames to proceed with loading the chosen dataset.
*   The next step will involve loading the user-selected CSV file into a Pandas DataFrame and beginning the crop production prediction model building process.


In [9]:
# User selected 'datafile.csv'
selected_csv_file = 'datafile.csv'

df_file_path = os.path.join(inner_extracted_dir, selected_csv_file)
print(f"Loading selected CSV file: {df_file_path}")
df = pd.read_csv(df_file_path)

if df is None or df.empty:
    raise ValueError(f"Failed to load a non-empty DataFrame from {selected_csv_file}. Check file path and contents.")

# Ensure column names are standardized to lowercase for easier lookup
df.columns = df.columns.str.lower()

# Define the target column - '2009-10' seems appropriate for production data in this dataset
target_column = '2009-10'

if target_column not in df.columns:
    raise ValueError(f"The assumed target column '{target_column}' was not found in the selected dataset '{selected_csv_file}'. Available columns: {df.columns.tolist()}")

# Clean data (drop rows with any NaN values after identifying the target)
df = df.dropna()

# If after dropping NaNs, the DataFrame becomes empty, it's problematic
if df.empty:
    raise ValueError(f"DataFrame became empty after dropping NaN values from {selected_csv_file}. Consider a different cleaning strategy or dataset.")

# Separate target and features
y = df[target_column]
X = df.drop(target_column, axis=1)

# Convert text columns in features to dummy variables
# Select only 'object' (string) or 'category' columns for one-hot encoding
X_object_cols = X.select_dtypes(include=['object', 'category']).columns
if not X_object_cols.empty:
    X_encoded = pd.get_dummies(X, columns=X_object_cols, drop_first=True)
else:
    X_encoded = X.copy() # No object columns to encode

# Ensure target 'y' is numerical for LinearRegression
if pd.api.types.is_numeric_dtype(y):
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

    # Train model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Accuracy
    print("Model Accuracy:", model.score(X_test, y_test))
else:
    raise TypeError(f"Target variable '{target_column}' is not numeric (type: {y.dtype}). LinearRegression requires a numeric target.")


Loading selected CSV file: /content/extracted_data/final_data/datafile.csv
Model Accuracy: 0.300741168303997


# Task
The CSV files found in the extracted archive are `produce.csv`, `datafile (1).csv`, `datafile (2).csv`, `datafile (3).csv`, and `datafile.csv`.

Which of these CSV files would you like to use for crop production prediction? Please provide the exact filename.

## Extract and List CSV Files

### Subtask:
Iterate through the `final_data` directory and print the names of all `.csv` files found, without attempting to load them yet. This will help identify all potential datasets.


**Reasoning**:
The subtask requires printing the names of all CSV files in the `final_data` directory without loading them. The existing code already identifies these files and stores their names in the `csv_files` list. I will remove the DataFrame loading and subsequent processing steps, and add a print statement for the `csv_files` list.



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import zipfile
import os

# Define the path to the main ZIP file
main_zip_path = '/content/Project4_Ag_Prediction of Agriculture Crop Production In India.zip'
extracted_base_dir = '/content/extracted_data' # Use a base directory for all extractions
os.makedirs(extracted_base_dir, exist_ok=True)

# 1. Extract the main ZIP file
print(f"Extracting main ZIP: {main_zip_path} to {extracted_base_dir}")
with zipfile.ZipFile(main_zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_base_dir)

# 2. Locate the inner ZIP file
# Based on the error message, the inner zip is within a directory inside the main extracted folder.
# The structure suggested is: extracted_base_dir/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip
intermediate_dir_name = 'Project4_Ag_Prediction of Agriculture Crop Production In India'
inner_zip_name = 'Project4_Ag_Prediction of Agriculture Crop Production In India.zip'
inner_zip_path = os.path.join(extracted_base_dir, intermediate_dir_name, inner_zip_name)

print(f"Attempting to find inner ZIP at: {inner_zip_path}")
if not os.path.exists(inner_zip_path):
    # This block would execute if the structure is different than expected.
    # For robustness, we could list contents of extracted_base_dir and search for .zip files
    # For now, we'll assume the path is correct as per error message.
    raise FileNotFoundError(f"Error: Inner ZIP file not found at the expected path: {inner_zip_path}. Please check the contents of your ZIP file.")

# 3. Extract the inner ZIP file
inner_extracted_dir = os.path.join(extracted_base_dir, 'final_data')
os.makedirs(inner_extracted_dir, exist_ok=True)
print(f"Extracting inner ZIP: {inner_zip_path} to {inner_extracted_dir}")
with zipfile.ZipFile(inner_zip_path, 'r') as zip_ref:
    zip_ref.extractall(inner_extracted_dir)

# 4. Find the CSV files within the inner extracted directory
csv_files = [f for f in os.listdir(inner_extracted_dir) if f.endswith('.csv')]

if not csv_files:
    raise FileNotFoundError(f"Error: No CSV file found in {inner_extracted_dir}. Contents: {os.listdir(inner_extracted_dir)}")

# Print the list of CSV files found
print(f"CSV files found in {inner_extracted_dir}: {csv_files}")

Extracting main ZIP: /content/Project4_Ag_Prediction of Agriculture Crop Production In India.zip to /content/extracted_data
Attempting to find inner ZIP at: /content/extracted_data/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip
Extracting inner ZIP: /content/extracted_data/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip to /content/extracted_data/final_data
CSV files found in /content/extracted_data/final_data: ['produce.csv', 'datafile (1).csv', 'datafile (2).csv', 'datafile (3).csv', 'datafile.csv']


## Present Options to User

### Subtask:
Inform the user about the available CSV files and explicitly ask them to provide the name of the CSV file they wish to proceed with for crop production prediction.


The following CSV files were found in the extracted archive: `['produce.csv', 'datafile (1).csv', 'datafile (2).csv', 'datafile (3).csv', 'datafile.csv']`.

Which of these CSV files would you like to use for crop production prediction? Please provide the exact filename (e.g., `produce.csv`).

## Final Task

### Subtask:
Summarize that the CSV files have been presented and the system is awaiting your selection to continue with loading the chosen dataset and building the prediction model.


## Summary:

### Data Analysis Key Findings
*   The primary ZIP archive, `Project4_Ag_Prediction of Agriculture Crop Production In India.zip`, was successfully extracted.
*   An embedded ZIP file, also named `Project4_Ag_Prediction of Agriculture Crop Production In India.zip`, was subsequently located and extracted into a dedicated `final_data` directory.
*   Five distinct CSV files were identified within the `final_data` directory: `produce.csv`, `datafile (1).csv`, `datafile (2).csv`, `datafile (3).csv`, and `datafile.csv`.
*   These identified CSV files have been presented to the user for selection.

### Insights or Next Steps
*   The system is now awaiting user input to select one of the provided CSV filenames to proceed with loading the chosen dataset.
*   The next step will involve loading the user-selected CSV file into a Pandas DataFrame and beginning the crop production prediction model building process.


In [25]:
import pandas as pd
import numpy as np
import zipfile
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# STEP 1: File Paths
main_zip_path = '/content/Project4_Ag_Prediction of Agriculture Crop Production In India.zip'
extracted_base_dir = '/content/extracted_data'
os.makedirs(extracted_base_dir, exist_ok=True)

# STEP 2: Extract Main ZIP
print("Extracting main ZIP file...")

with zipfile.ZipFile(main_zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_base_dir)

print("Main ZIP extracted successfully")

# STEP 3: Find Inner ZIP
print("Searching for inner ZIP file...")

inner_zip_path = None
for root, dirs, files in os.walk(extracted_base_dir):
    for file in files:
        if file.endswith('.zip'):
            inner_zip_path = os.path.join(root, file)
            print("Found inner ZIP:", inner_zip_path)
            break

if inner_zip_path is None:
    raise FileNotFoundError("No inner ZIP file found!")

# STEP 4: Extract Inner ZIP
final_data_dir = '/content/final_data'
os.makedirs(final_data_dir, exist_ok=True)

with zipfile.ZipFile(inner_zip_path, 'r') as zip_ref:
    zip_ref.extractall(final_data_dir)

print("Inner ZIP extracted successfully")

# STEP 5: Show CSV Files
csv_files = [f for f in os.listdir(final_data_dir) if f.endswith('.csv')]

print("\nAvailable CSV files:")
for i, f in enumerate(csv_files):
    print(f"{i+1}. {f}")

# STEP 6: Select File
file_no = int(input("\nEnter the number of the CSV file to use: "))
selected_csv = csv_files[file_no - 1]

print("You selected:", selected_csv)

# STEP 7: Load Data
csv_path = os.path.join(final_data_dir, selected_csv)
df = pd.read_csv(csv_path)

# Standardize column names: strip spaces and convert to lowercase
df.columns = df.columns.str.strip().str.lower()

print("\nDataset Preview:")
display(df.head())

print("\nDataset Columns:")
print(df.columns)

# STEP 8: Clean Data
df = df.dropna()

# Convert categorical columns to numeric
df_encoded = pd.get_dummies(df, drop_first=True)

# STEP 9: Prepare X and Y
# Dynamically set target column based on selected CSV file
if selected_csv == 'datafile.csv':
    target_col = '2009-10'
elif selected_csv == 'datafile (1).csv':
    target_col = 'yield (quintal/ hectare)' # Based on previous inspection
elif selected_csv == 'datafile (2).csv':
    target_col = 'production 2009-10' # Based on previous successful run
elif selected_csv == 'produce.csv' or selected_csv == 'datafile (3).csv':
    # These files did not have a clear 'production' or 'yield' column in previous runs
    # For this demonstration, we'll raise an error or set a default if applicable
    raise ValueError(f"Selected file '{selected_csv}' does not have a recognized target column for prediction. Available columns: {df_encoded.columns.tolist()}")
else:
    # Default or ask user for target column if a new file is encountered
    target_col = '2009-10' # Assuming '2009-10' as a common year column for other datafiles

print(f"\nAttempting to use '{target_col}' as the target column.")

if target_col not in df_encoded.columns:
    raise ValueError(f"Target column '{target_col}' not found! Available columns: {df_encoded.columns.tolist()}")

X = df_encoded.drop(target_col, axis=1)
y = df_encoded[target_col]

# STEP 10: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# STEP 11: Train Model
model = LinearRegression()
model.fit(X_train, y_train)

# STEP 12: Accuracy + Predictions (Rounded)
accuracy = model.score(X_test, y_test)
predictions = model.predict(X_test)

print(f"\nModel Accuracy: {accuracy:.4f}")

rounded_predictions = np.round(predictions, 4)
print("\nSample Predictions (rounded):")
print(rounded_predictions[:10])

Extracting main ZIP file...
Main ZIP extracted successfully
Searching for inner ZIP file...
Found inner ZIP: /content/extracted_data/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip
Inner ZIP extracted successfully

Available CSV files:
1. produce.csv
2. datafile (1).csv
3. datafile (2).csv
4. datafile (3).csv
5. datafile.csv

Enter the number of the CSV file to use: 2
You selected: datafile (1).csv

Dataset Preview:


Unnamed: 0,crop,state,cost of cultivation (`/hectare) a2+fl,cost of cultivation (`/hectare) c2,cost of production (`/quintal) c2,yield (quintal/ hectare)
0,ARHAR,Uttar Pradesh,9794.05,23076.74,1941.55,9.83
1,ARHAR,Karnataka,10593.15,16528.68,2172.46,7.47
2,ARHAR,Gujarat,13468.82,19551.9,1898.3,9.59
3,ARHAR,Andhra Pradesh,17051.66,24171.65,3670.54,6.42
4,ARHAR,Maharashtra,17130.55,25270.26,2775.8,8.72



Dataset Columns:
Index(['crop', 'state', 'cost of cultivation (`/hectare) a2+fl',
       'cost of cultivation (`/hectare) c2',
       'cost of production (`/quintal) c2', 'yield (quintal/ hectare)'],
      dtype='object')

Attempting to use 'yield (quintal/ hectare)' as the target column.

Model Accuracy: 0.9066

Sample Predictions (rounded):
[-15.1392   2.0413  49.2709 807.5678 -22.3956 156.9517  42.2256 -41.0444
 -69.3945  37.653 ]


In [42]:
import pandas as pd
import numpy as np
import zipfile
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

#STEP 1:File Paths
main_zip_path = '/content/Project4_Ag_Prediction of Agriculture Crop Production In India.zip'
extract_dir = '/content/extracted_data'
os.makedirs(extract_dir, exist_ok=True)

#STEP 2: Extract Main ZIP
print("Extracting main ZIP file...")

with zipfile.ZipFile(main_zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Main ZIP extracted successfully")

#STEP 3: Find Inner ZIP
print("Searching for inner ZIP file...")

inner_zip = None
for root, dirs, files in os.walk(extract_dir):
    for file in files:
        if file.endswith('.zip'):
            inner_zip = os.path.join(root, file)
            print("Found inner ZIP:", inner_zip)
            break

if inner_zip is None:
    raise FileNotFoundError("No inner ZIP file found!")

#STEP 4: Extract Inner ZIP
final_dir = os.path.join(extract_dir, 'final_data')
os.makedirs(final_dir, exist_ok=True)

with zipfile.ZipFile(inner_zip, 'r') as zip_ref:
    zip_ref.extractall(final_dir)

print("Inner ZIP extracted successfully")


#STEP 5: Find CSV files
csv_files = [f for f in os.listdir(final_dir) if f.endswith('.csv')]

print("\nAvailable CSV files:")
for i, f in enumerate(csv_files, 1):
    print(f"{i}. {f}")

# User Choice
choice = int(input("\nEnter the number of the CSV file to use: ")) - 1
selected_csv = csv_files[choice]
print(f"\nYou selected: {selected_csv}")

#STEP 6: Load CSV
file_path = os.path.join(final_dir, selected_csv)
df = pd.read_csv(file_path)

print("\nDataset Preview:")
display(df.head())

print("\nDataset Columns:")
print(df.columns)

#STEP 7: Initial Data Preprocessing
df.columns = df.columns.str.strip().str.lower()
df = df.loc[:, ~df.columns.str.contains('unnamed')]

#STEP 8: Auto Model Builder (Cleaning and X/Y preparation based on file type)
if selected_csv.lower() == 'produce.csv':
    print("\nDetected: Time-series style dataset ('produce.csv')")

    # Identify numeric (year) columns and categorical columns
    numeric_year_cols = [col for col in df.columns if col.startswith('3-')] # Assuming years start with '3-'
    categorical_cols = [col for col in df.columns if col not in numeric_year_cols]

    # Convert numeric columns to numeric, coercing errors
    for col in numeric_year_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Fill NaNs specifically in numeric columns (time-series data often has leading NaNs)
    df[numeric_year_cols] = df[numeric_year_cols].ffill(axis=1) # Using .ffill() to address FutureWarning

    # Drop rows where the target column or critical categorical data might still be NaN
    # For produce.csv, 'particulars' is a key identifier
    subset_to_check_for_nans = [numeric_year_cols[-1]] + ['particulars'] if 'particulars' in categorical_cols else [numeric_year_cols[-1]]
    df_cleaned = df.dropna(subset=subset_to_check_for_nans, how='any')

    if df_cleaned.empty:
        raise ValueError("DataFrame became empty after cleaning for 'produce.csv'. Consider a different cleaning strategy.")

    # Choose the last year column as target 'y'
    target_col = numeric_year_cols[-1]
    y = df_cleaned[target_col]

    # Features 'X' will be previous year columns + one-hot encoded categorical columns
    X_numeric = df_cleaned[numeric_year_cols[:-1]] # All year columns except the last one
    X_categorical = df_cleaned[categorical_cols]

    # One-hot encode categorical features
    X_categorical_encoded = pd.get_dummies(X_categorical, drop_first=True)

    # Combine numeric and encoded categorical features
    X = pd.concat([X_numeric, X_categorical_encoded], axis=1)


elif selected_csv.lower() in ['datafile.csv', 'datafile (1).csv', 'datafile (2).csv', 'datafile (3).csv']:
    print(f"\nDetected: Standard numeric/mixed dataset ('{selected_csv}')")

    # Drop rows with any NaNs in the *entire* DataFrame (a common initial cleaning step for this type)
    df_cleaned = df.dropna()

    if df_cleaned.empty:
        raise ValueError(f"DataFrame became empty after dropping NaN values from '{selected_csv}'. Consider a different cleaning strategy.")

    # Separate numeric and categorical for encoding
    numeric_cols = df_cleaned.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df_cleaned.select_dtypes(include=['object', 'category']).columns

    # One-hot encode categorical columns
    df_encoded = pd.get_dummies(df_cleaned, columns=categorical_cols, drop_first=True)

    # Dynamically set target column based on selected CSV file
    if selected_csv == 'datafile.csv':
        target_col = '2009-10'
    elif selected_csv == 'datafile (1).csv':
        target_col = 'yield (quintal/ hectare)'
    elif selected_csv == 'datafile (2).csv':
        target_col = 'production 2009-10'
    elif selected_csv == 'datafile (3).csv':
        # datafile (3).csv was problematic earlier, explicitly raise error if no good target is found
        raise ValueError(f"Selected file '{selected_csv}' does not have a recognized target column for prediction. Available columns: {df_encoded.columns.tolist()}")
    else: # Fallback, though current if/elif covers all expected files
        raise ValueError(f"No specific target column defined for '{selected_csv}'.")

    if target_col not in df_encoded.columns:
        raise ValueError(f"Target column '{target_col}' not found! Available columns: {df_encoded.columns.tolist()}")

    X = df_encoded.drop(columns=[target_col])
    y = df_encoded[target_col]

else: # Should not be reached with current csv_files list and handling, but good for robustness
    print("\nUnsupported dataset type. Machine Learning not possible.")
    print("Showing cleaned dataset instead:")
    print(df.head())
    raise SystemExit


# Check if X and y are empty after all cleaning (final check before split)
if X.empty or y.empty:
    raise ValueError("Features (X) or Target (y) are empty after preprocessing. Cannot train model.")

# Ensure no NaNs remain in X before training
X.fillna(0, inplace=True)

#STEP 9: Train Model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#STEP 10: Model Training and Evaluation
model = LinearRegression()
model.fit(X_train, y_train)

accuracy = model.score(X_test, y_test)
predictions = model.predict(X_test)

print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nSample Predictions (rounded):")
print(np.round(predictions[:10], 4))


Extracting main ZIP file...
Main ZIP extracted successfully
Searching for inner ZIP file...
Found inner ZIP: /content/extracted_data/Project4_Ag_Prediction of Agriculture Crop Production In India/Project4_Ag_Prediction of Agriculture Crop Production In India.zip
Inner ZIP extracted successfully

Available CSV files:
1. produce.csv
2. datafile (1).csv
3. datafile (2).csv
4. datafile (3).csv
5. datafile.csv

Enter the number of the CSV file to use: 5

You selected: datafile.csv

Dataset Preview:


Unnamed: 0,Crop,2004-05,2005-06,2006-07,2007-08,2008-09,2009-10,2010-11,2011-12
0,Rice,100.0,101.0,99.0,105.0,112.0,121.0,117.0,110.0
1,Wheat,100.0,101.0,112.0,115.0,117.0,127.0,120.0,108.0
2,Coarse Cereals,100.0,107.0,110.0,115.0,113.0,123.0,122.0,136.0
3,Pulses,100.0,108.0,134.0,124.0,124.0,146.0,137.0,129.0
4,Vegetables,100.0,109.0,103.0,118.0,113.0,124.0,128.0,115.0



Dataset Columns:
Index(['Crop', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09',
       '2009-10', '2010-11', '2011-12'],
      dtype='object')

Detected: Standard numeric/mixed dataset ('datafile.csv')

Model Accuracy: 0.3007

Sample Predictions (rounded):
[108.2461 100.1593 106.5931]
