In [1]:
%pip install python-docx

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from docx import Document
from docx.shared import Inches
import os
import warnings

# Suppress warnings for cleaner output.
# NOTE(review): with no category/module argument this installs a blanket 'ignore' filter, so
# every warning raised after this point (including deprecation notices from the data and
# plotting libraries) is hidden for the rest of the session. Consider narrowing it.
warnings.filterwarnings('ignore')

def clean_data(df):
    """
    Cleans the dataframe by selecting the analysis columns, converting data types
    and handling missing values.

    The input dataframe is never modified; all work happens on a copy.

    Processing order matters: numeric conversion happens *before* rows with missing
    critical values are dropped, so that values which cannot be parsed as numbers
    (and therefore become NaN) are removed as well instead of leaking into the
    cleaned data.

    Args:
        df (pd.DataFrame): The input dataframe.

    Returns:
        pd.DataFrame: The cleaned dataframe, restricted to the relevant columns.
            The original row index labels of the surviving rows are preserved.

    Raises:
        KeyError: If any of the required columns is absent from ``df``.
    """
    print("Starting data cleaning...")

    # --- Column Selection and Explanation ---
    # The analysis focuses on a subset of columns. Key cost columns are defined by the CACP.
    # 'cul_cost_c2' is the most comprehensive cost, including all paid-out costs (like seeds, fertilizer, hired labor)
    # plus imputed costs like the rental value of owned land and the value of unpaid family labor.
    # It is used as the 'Total Cultivation Cost' in this analysis.
    relevant_columns = [
        'year', 'state_name', 'crop_name', 'crop_type', 'cul_cost_c2',
        'main_product_value', 'by_product_value', 'derived_yield',
        'opr_cost_hmn_lab_family', 'opr_cost_hmn_lab_attached', 'opr_cost_hmn_lab_casual',
        'opr_cost_ani_lab_hired', 'opr_cost_ani_lab_owned', 'opr_cost_mch_lab_hired',
        'opr_cost_mch_lab_owned', 'opr_cost_seed', 'opr_cost_fertilizer', 'opr_cost_manure',
        'opr_cost_insecticides' # Removed 'opr_cost_irrigation' as it's not in the index
    ]
    # Columns that are labels rather than measurements; these are left untouched.
    label_columns = ['year', 'state_name', 'crop_name', 'crop_type']

    # Fail early with an explicit message instead of pandas' generic "not in index" error.
    missing_columns = [col for col in relevant_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Input data is missing required columns: {missing_columns}")

    # Work on an explicit copy so the caller's dataframe is left unchanged.
    df_copy = df[relevant_columns].copy()

    # Convert every measurement column to numeric, coercing unparseable entries to NaN.
    # This is applied regardless of the current dtype: already-numeric columns pass
    # through unchanged, and text columns are converted whatever string dtype they use.
    numeric_columns = [col for col in relevant_columns if col not in label_columns]
    df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric, errors='coerce')

    # Fill remaining NaNs in operational cost columns with 0, assuming no cost was recorded
    opr_cost_cols = [col for col in df_copy.columns if 'opr_cost' in col]
    df_copy[opr_cost_cols] = df_copy[opr_cost_cols].fillna(0)

    # Drop rows with missing values in critical columns. Done last so that values
    # coerced to NaN above are dropped too.
    critical_cols = ['cul_cost_c2', 'main_product_value', 'by_product_value', 'derived_yield', 'crop_type']
    df_copy = df_copy.dropna(subset=critical_cols)

    print("Data cleaning complete.")
    return df_copy

def feature_engineering(df):
    """
    Creates new features for better analysis.

    The input dataframe is not modified; the derived columns are added to a copy,
    which makes the function safe to call repeatedly (e.g. when re-running a cell).

    Derived columns:
        net_return: main + by-product value minus Cost C2.
        total_opr_cost: row-wise sum of all raw ``opr_cost_*`` columns.
        fix_cost: Cost C2 minus ``total_opr_cost`` (a residual).
        total_human_labor_cost / total_animal_labor_cost / total_machine_labor_cost:
            sums of the corresponding labor cost components.

    Args:
        df (pd.DataFrame): The input dataframe.

    Returns:
        pd.DataFrame: A new dataframe with the additional feature columns.
    """
    print("Performing feature engineering...")
    # Copy so the caller's frame (e.g. the cleaned data) keeps its original columns.
    df = df.copy()

    # Calculate Net Return using the comprehensive Cost C2
    df['net_return'] = (df['main_product_value'] + df['by_product_value']) - df['cul_cost_c2']

    # Calculate Total Operational Costs.
    # Match on the 'opr_cost_' prefix rather than the substring 'opr_cost': the derived
    # column 'total_opr_cost' contains that substring and would otherwise be summed into
    # itself if this function is applied to an already-featured dataframe.
    opr_cost_cols = [col for col in df.columns if str(col).startswith('opr_cost_')]
    df['total_opr_cost'] = df[opr_cost_cols].sum(axis=1)

    # Calculate Fixed Cost (as the residual of Comprehensive Cost C2 minus operational costs)
    df['fix_cost'] = df['cul_cost_c2'] - df['total_opr_cost']

    # Calculate Total Labor and Machinery Costs
    df['total_human_labor_cost'] = df['opr_cost_hmn_lab_family'] + df['opr_cost_hmn_lab_attached'] + df['opr_cost_hmn_lab_casual']
    df['total_animal_labor_cost'] = df['opr_cost_ani_lab_hired'] + df['opr_cost_ani_lab_owned']
    df['total_machine_labor_cost'] = df['opr_cost_mch_lab_hired'] + df['opr_cost_mch_lab_owned']

    print("Feature engineering complete.")
    return df

def generate_visualizations(df, output_dir='plots'):
    """
    Generates and saves all visualizations.

    Each chart is drawn with matplotlib/seaborn, written to ``output_dir`` as a PNG
    and its figure closed again, so no figure is left open (or rendered inline)
    once the function returns.

    Note:
        The chart descriptions stored in the result are fixed text. They are not
        recomputed from ``df``, so statements in them naming specific states, crops
        or years should be re-checked whenever the underlying dataset changes.

    Args:
        df (pd.DataFrame): The dataframe to visualize. It must contain the cleaned
            columns plus the derived columns added by ``feature_engineering``
            (e.g. 'net_return', 'total_opr_cost', 'fix_cost').
        output_dir (str): The directory to save plots. Created if it does not exist.

    Returns:
        dict: Maps each saved plot's file path to a dict with the keys 'title' and
            'description', in the order the plots were generated.
    """
    print(f"Generating visualizations... Plots will be saved in '{output_dir}/' directory.")
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_dir, exist_ok=True)

    plots_info = {}
    sns.set_style("whitegrid")

    # --- Plotting Functions ---
    def save_plot(filename, title, description):
        """Helper to save the current figure, close it, and store its info."""
        filepath = os.path.join(output_dir, filename)
        plt.savefig(filepath, bbox_inches='tight')
        plt.close()
        plots_info[filepath] = {'title': title, 'description': description}
        print(f"  - Saved {filename}")

    # NOTE(review): several seaborn calls below pass `palette` without `hue`; recent seaborn
    # releases appear to deprecate that combination — confirm against the installed version.

    # 1. Distribution of Crops by Type
    plt.figure(figsize=(10, 8))
    crop_type_counts = df['crop_type'].value_counts()
    plt.pie(crop_type_counts, labels=crop_type_counts.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette("viridis", len(crop_type_counts)))
    plt.title('1. Distribution of Data by Crop Type', fontsize=16)
    save_plot('1_crop_type_distribution.png', 'Distribution by Crop Type',
              "This pie chart shows the proportion of data entries for each major crop category. Cereals constitute the largest portion, highlighting their significance in Indian agriculture as recorded in this dataset.")

    # 2. Number of Data Entries per State
    plt.figure(figsize=(12, 8))
    state_counts = df['state_name'].value_counts()
    sns.barplot(y=state_counts.index, x=state_counts.values, palette='plasma')
    plt.title('2. Number of Data Entries per State', fontsize=16)
    plt.xlabel('Number of Entries')
    plt.ylabel('State')
    save_plot('2_state_distribution.png', 'Data Entries per State',
              "This bar chart displays the geographic distribution of the dataset, showing the number of agricultural data entries from each Indian state. States like Uttar Pradesh and Madhya Pradesh have the highest representation.")

    # 3. Average Cultivation Cost (C2) by State
    plt.figure(figsize=(12, 8))
    avg_cost_state = df.groupby('state_name')['cul_cost_c2'].mean().sort_values(ascending=False)
    sns.barplot(y=avg_cost_state.index, x=avg_cost_state.values, palette='magma')
    plt.title('3. Average Total Cultivation Cost (C2) by State', fontsize=16)
    plt.xlabel('Average Cost (INR per Hectare)')
    plt.ylabel('State')
    save_plot('3_avg_cost_by_state.png', 'Average Cultivation Cost by State',
              "This chart ranks states by their average total cost of cultivation (Cost C2). It reveals significant regional disparities in farming expenses, with states like Andhra Pradesh showing the highest costs.")

    # 4. Top 10 Costliest Crops to Cultivate
    plt.figure(figsize=(12, 8))
    avg_cost_crop = df.groupby('crop_name')['cul_cost_c2'].mean().sort_values(ascending=False).head(10)
    sns.barplot(y=avg_cost_crop.index, x=avg_cost_crop.values, palette='rocket')
    plt.title('4. Top 10 Costliest Crops to Cultivate (Cost C2)', fontsize=16)
    plt.xlabel('Average Cost (INR per Hectare)')
    plt.ylabel('Crop')
    save_plot('4_top_10_costliest_crops.png', 'Top 10 Costliest Crops',
              "This bar chart identifies the ten crops with the highest average cultivation cost. High-value crops like Sugarcane and Potato dominate this list, indicating they are capital-intensive to grow.")

    # 5. Operational vs. Fixed Costs by Crop Type
    # DataFrame.plot() opens its own figure unless it is handed an Axes. Creating the
    # figure/axes explicitly and passing ax= keeps everything on one figure, so no
    # empty figure is left open after save_plot() closes the current one.
    _, ax = plt.subplots(figsize=(12, 8))
    cost_composition = df.groupby('crop_type')[['total_opr_cost', 'fix_cost']].mean()
    cost_composition.plot(kind='bar', stacked=True, ax=ax, color=sns.color_palette("coolwarm", 2))
    plt.title('5. Average Operational vs. Fixed Costs by Crop Type', fontsize=16)
    plt.ylabel('Cost (INR per Hectare)')
    plt.xlabel('Crop Type')
    plt.xticks(rotation=45)
    save_plot('5_op_vs_fix_cost.png', 'Operational vs. Fixed Costs',
              "This stacked bar chart breaks down the total cultivation cost into operational (variable) and fixed components for each crop type. It clearly shows that operational costs form the bulk of expenses across all categories.")

    # 6. Breakdown of Total Labor & Machinery Costs
    labor_costs = df[['total_human_labor_cost', 'total_animal_labor_cost', 'total_machine_labor_cost']].mean()
    plt.figure(figsize=(10, 8))
    plt.pie(labor_costs, labels=labor_costs.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette("Set2"))
    plt.title('6. Average Composition of Labor & Machinery Costs', fontsize=16)
    save_plot('6_labor_breakdown.png', 'Labor & Machinery Cost Breakdown',
              "This pie chart illustrates the average share of human, animal, and machine labor in the total labor and machinery expenditure. Human labor is the most significant component, followed closely by machinery, signifying a shift towards mechanization.")

    # 7. Breakdown of Key Input Costs
    input_costs = df[['opr_cost_seed', 'opr_cost_fertilizer', 'opr_cost_manure', 'opr_cost_insecticides']].mean()
    plt.figure(figsize=(10, 6))
    sns.barplot(x=input_costs.index, y=input_costs.values, palette='cubehelix')
    plt.title('7. Average Cost of Key Material Inputs', fontsize=16)
    plt.ylabel('Average Cost (INR per Hectare)')
    plt.xlabel('Input Type')
    save_plot('7_input_costs.png', 'Key Input Costs',
              "This bar chart compares the average per-hectare expenditure on four critical inputs: seeds, fertilizer, manure, and insecticides. Fertilizer and seeds typically represent the highest material input costs for farmers.")

    # 8. Distribution of Yield by State
    plt.figure(figsize=(15, 10))
    sns.boxplot(y='state_name', x='derived_yield', data=df, palette='Spectral')
    plt.title('8. Distribution of Crop Yield by State', fontsize=16)
    plt.xlabel('Yield (Quintal per Hectare)')
    plt.ylabel('State')
    plt.xlim(0, df['derived_yield'].quantile(0.95)) # Trim outliers for better visualization
    save_plot('8_yield_by_state.png', 'Yield Distribution by State',
              "This box plot shows the distribution of crop yields for each state. It highlights not only the average productivity (the line in the box) but also the variability and presence of outliers. States like Punjab and Haryana show high median yields.")

    # 9. Top 10 Highest Yielding Crops
    plt.figure(figsize=(12, 8))
    avg_yield_crop = df.groupby('crop_name')['derived_yield'].mean().sort_values(ascending=False).head(10)
    sns.barplot(y=avg_yield_crop.index, x=avg_yield_crop.values, palette='viridis')
    plt.title('9. Top 10 Highest Yielding Crops', fontsize=16)
    plt.xlabel('Average Yield (Quintal per Hectare)')
    plt.ylabel('Crop')
    save_plot('9_top_10_yielding_crops.png', 'Top 10 Highest Yielding Crops',
              "This chart ranks the top ten crops by their average yield. Unsurprisingly, crops like Sugarcane and Potato, which are bulky, are at the top of this list.")

    # 10. Revenue Composition: Main vs. By-Product
    # Same explicit-axes approach as plot 5, to avoid leaving an empty figure behind.
    _, ax = plt.subplots(figsize=(12, 8))
    revenue_composition = df.groupby('crop_type')[['main_product_value', 'by_product_value']].mean()
    revenue_composition.plot(kind='bar', stacked=True, ax=ax, color=sns.color_palette("Paired"))
    plt.title('10. Average Revenue Composition by Crop Type', fontsize=16)
    plt.ylabel('Revenue (INR per Hectare)')
    plt.xlabel('Crop Type')
    plt.xticks(rotation=45)
    save_plot('10_revenue_composition.png', 'Revenue Composition',
              "This stacked bar chart illustrates the average revenue generated from the main product versus the by-product for each crop type. For most crops, the main product is the primary revenue source, but for some, like certain cereals, the by-product (fodder) is also significant.")

    # 11. Cost vs. Yield Relationship
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df, x='cul_cost_c2', y='derived_yield', hue='crop_type', alpha=0.6)
    plt.title('11. Cultivation Cost vs. Yield', fontsize=16)
    plt.xlabel('Total Cultivation Cost (INR per Hectare)')
    plt.ylabel('Yield (Quintal per Hectare)')
    # Axis limits at the 95th percentile hide extreme outliers from view (data is not removed).
    plt.xlim(0, df['cul_cost_c2'].quantile(0.95))
    plt.ylim(0, df['derived_yield'].quantile(0.95))
    save_plot('11_cost_vs_yield.png', 'Cost vs. Yield',
              "This scatter plot explores the relationship between total cultivation cost and yield, with points colored by crop type. Generally, higher investment leads to higher yield, but the relationship is not always linear and shows considerable variation, suggesting differences in efficiency.")

    # 12. Distribution of Net Return
    plt.figure(figsize=(12, 7))
    sns.histplot(df['net_return'], kde=True, bins=50, color='green')
    plt.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Break-even Point')
    plt.title('12. Distribution of Net Return', fontsize=16)
    plt.xlabel('Net Return (INR per Hectare)')
    plt.ylabel('Frequency')
    plt.legend()
    # Show only the central 90% of the distribution.
    plt.xlim(df['net_return'].quantile(0.05), df['net_return'].quantile(0.95))
    save_plot('12_net_return_distribution.png', 'Net Return Distribution',
              "This histogram shows the distribution of net returns (profit/loss). The red dashed line at zero represents the break-even point. A significant portion of the distribution falls to the left of this line, indicating that a substantial number of farming operations recorded in the dataset resulted in a financial loss.")

    # 13. Top 10 Most Profitable Crops
    plt.figure(figsize=(12, 8))
    avg_profit_crop = df.groupby('crop_name')['net_return'].mean().sort_values(ascending=False).head(10)
    sns.barplot(y=avg_profit_crop.index, x=avg_profit_crop.values, palette='summer')
    plt.title('13. Top 10 Most Profitable Crops', fontsize=16)
    plt.xlabel('Average Net Return (INR per Hectare)')
    plt.ylabel('Crop')
    save_plot('13_top_10_profitable_crops.png', 'Top 10 Most Profitable Crops',
              "This chart highlights the ten crops with the highest average net return. Commercial crops like Sugarcane and Potato are among the most profitable, assuming favorable conditions.")

    # 14. Top 10 Least Profitable Crops
    plt.figure(figsize=(12, 8))
    avg_loss_crop = df.groupby('crop_name')['net_return'].mean().sort_values(ascending=True).head(10)
    sns.barplot(y=avg_loss_crop.index, x=avg_loss_crop.values, palette='autumn')
    plt.title('14. Top 10 Least Profitable Crops', fontsize=16)
    plt.xlabel('Average Net Return (INR per Hectare)')
    plt.ylabel('Crop')
    save_plot('14_top_10_least_profitable_crops.png', 'Top 10 Least Profitable Crops',
              "This bar chart shows the ten crops that, on average, resulted in the largest financial losses. It underscores the high financial risk associated with certain crops, where costs frequently exceed revenues.")

    # 15. Average Net Return by State
    plt.figure(figsize=(12, 8))
    avg_profit_state = df.groupby('state_name')['net_return'].mean().sort_values(ascending=False)
    sns.barplot(y=avg_profit_state.index, x=avg_profit_state.values, palette='PiYG')
    plt.title('15. Average Net Return by State', fontsize=16)
    plt.xlabel('Average Net Return (INR per Hectare)')
    plt.ylabel('State')
    save_plot('15_avg_profit_by_state.png', 'Average Net Return by State',
              "This chart ranks states by their average net return from agriculture. It reveals which states have, on average, more profitable farming sectors. States like Haryana and Punjab often show higher profitability, while others face systemic challenges.")

    # 16. Net Return Distribution by Crop Type
    plt.figure(figsize=(14, 8))
    sns.violinplot(y='crop_type', x='net_return', data=df, palette='muted')
    plt.axvline(x=0, color='black', linestyle='--', linewidth=1)
    plt.title('16. Net Return Distribution by Crop Type', fontsize=16)
    plt.xlabel('Net Return (INR per Hectare)')
    plt.ylabel('Crop Type')
    plt.xlim(df['net_return'].quantile(0.05), df['net_return'].quantile(0.95))
    save_plot('16_profit_dist_by_croptype.png', 'Net Return Distribution by Crop Type',
              "This violin plot provides a deeper look at profitability by showing the full distribution of net returns for each crop type. The width of the violin indicates the frequency of data points at that profit level. It effectively visualizes both the profitability and the financial risk (spread) associated with different crop categories.")

    # 17. Trend of Costs and Returns Over Years
    df_time = df.groupby('year')[['cul_cost_c2', 'net_return']].mean().reset_index()
    plt.figure(figsize=(14, 7))
    plt.plot(df_time['year'], df_time['cul_cost_c2'], marker='o', label='Average Cultivation Cost (C2)')
    plt.plot(df_time['year'], df_time['net_return'], marker='s', label='Average Net Return')
    plt.title('17. Trend of Average Costs and Net Returns Over Time', fontsize=16)
    plt.xlabel('Year')
    plt.ylabel('Amount (INR per Hectare)')
    plt.legend()
    plt.grid(True)
    save_plot('17_trends_over_time.png', 'Trends of Costs and Returns',
              "This line chart tracks the average total cultivation cost and average net return from 1999 to 2022. It illustrates the 'cost-price squeeze,' showing that while costs have steadily risen, net returns have been much more volatile and have not kept pace.")

    # 18. Correlation Heatmap of Key Variables
    plt.figure(figsize=(16, 12))
    corr_cols = ['cul_cost_c2', 'derived_yield', 'main_product_value', 'net_return', 'total_opr_cost',
                 'total_human_labor_cost', 'total_machine_labor_cost', 'opr_cost_fertilizer'] # Removed 'opr_cost_irrigation'
    corr_matrix = df[corr_cols].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('18. Correlation Matrix of Key Financial and Operational Variables', fontsize=16)
    save_plot('18_correlation_heatmap.png', 'Correlation Heatmap',
              "This heatmap shows the correlation coefficients between key variables. Red indicates a positive correlation, while blue indicates a negative one. It helps to statistically identify relationships, such as the strong positive link between operational costs and total cultivation costs, and the relationship between yield and revenue.")

    # --- In-Focus Crop Analysis ---
    def plot_crop_cost_structure(crop_name, plot_num):
        """Save a pie chart of the mean major operational costs for a single crop."""
        crop_df = df[df['crop_name'] == crop_name]
        opr_cost_cols = ['total_human_labor_cost', 'total_machine_labor_cost', 'opr_cost_fertilizer', 'opr_cost_seed', 'opr_cost_insecticides'] # Removed 'opr_cost_irrigation'
        cost_structure = crop_df[opr_cost_cols].mean()
        cost_structure = cost_structure[cost_structure > 0] # Only show non-zero costs
        if cost_structure.empty:
            # No rows for this crop (means are NaN) or no positive costs: an empty pie
            # with a "primary drivers" caption would be misleading, so skip the chart.
            print(f"  - Skipped cost structure for {crop_name}: no positive cost data available")
            return
        plt.figure(figsize=(10, 8))
        plt.pie(cost_structure, labels=cost_structure.index, autopct='%1.1f%%', startangle=90, colors=sns.color_palette("YlOrRd"))
        plt.title(f'{plot_num}. Operational Cost Structure for {crop_name}', fontsize=16)
        save_plot(f'{plot_num}_cost_structure_{crop_name.lower()}.png', f'Cost Structure for {crop_name}',
                  f"This pie chart breaks down the major operational costs for {crop_name}. It highlights the primary drivers of expense for cultivating this specific crop.")

    # 19. Paddy Cost Structure
    plot_crop_cost_structure('Paddy', 19)
    # 20. Wheat Cost Structure
    plot_crop_cost_structure('Wheat', 20)
    # 21. Cotton Cost Structure
    plot_crop_cost_structure('Cotton', 21)

    # 22. Net Return for Paddy across Top States
    plt.figure(figsize=(12, 8))
    paddy_df = df[df['crop_name'] == 'Paddy']
    # "Top" states are chosen by number of Paddy rows in the data, not by production volume.
    top_paddy_states = paddy_df['state_name'].value_counts().nlargest(5).index
    paddy_profit_state = paddy_df[paddy_df['state_name'].isin(top_paddy_states)].groupby('state_name')['net_return'].mean().sort_values(ascending=False)
    sns.barplot(y=paddy_profit_state.index, x=paddy_profit_state.values, palette='Greens_r')
    plt.title('22. Average Net Return for Paddy in Top 5 Producing States', fontsize=16)
    plt.xlabel('Average Net Return (INR per Hectare)')
    plt.ylabel('State')
    save_plot('22_paddy_profit_by_state.png', 'Paddy Profitability by State',
              "This chart compares the average profitability of Paddy cultivation across the top 5 producing states (by data volume). It shows that even for the same crop, profitability varies dramatically by region.")

    # 23. Fertilizer Cost vs. Yield for Wheat
    plt.figure(figsize=(12, 8))
    wheat_df = df[df['crop_name'] == 'Wheat']
    sns.scatterplot(data=wheat_df, x='opr_cost_fertilizer', y='derived_yield', alpha=0.7, color='orange')
    plt.title('23. Fertilizer Cost vs. Yield for Wheat', fontsize=16)
    plt.xlabel('Fertilizer Cost (INR per Hectare)')
    plt.ylabel('Yield (Quintal per Hectare)')
    plt.xlim(0, wheat_df['opr_cost_fertilizer'].quantile(0.95))
    plt.ylim(0, wheat_df['derived_yield'].quantile(0.95))
    save_plot('23_wheat_fert_vs_yield.png', 'Wheat: Fertilizer vs. Yield',
              "This scatter plot specifically examines the relationship between money spent on fertilizer and the resulting yield for Wheat. It helps analyze the efficiency of input use for a key crop.")

    print("All visualizations generated successfully.")
    return plots_info

def create_word_report(plots_info, filename='Agricultural_EDA_Report.docx'):
    """
    Creates a Word document with all the plots and their descriptions.

    The document starts with a title and introduction, followed by a glossary of
    cost concepts, and then one section (heading, description, picture) per plot.

    Plot sections are ordered by the numeric prefix of each plot's file name
    (``1_...``, ``2_...``, ..., ``10_...``). A plain string sort is not used because
    it would place ``10_...`` before ``1_...`` and ``2_...``. Files without a numeric
    prefix are placed after the numbered ones, sorted by path.

    A plot that cannot be added (for example, a missing image file) is reported on
    stdout and skipped; the remaining plots are still written.

    Args:
        plots_info (dict): Dictionary containing plot filepaths and descriptions.
            Each value is a dict with the keys 'title' and 'description'.
        filename (str): The name of the output Word document.
    """
    def _plot_order(item):
        """Sort key: numbered plots first in numeric order, then the rest by path."""
        filepath = item[0]
        prefix = os.path.basename(filepath).split('_', 1)[0]
        if prefix.isdecimal():
            return (0, int(prefix), filepath)
        return (1, 0, filepath)

    print(f"Creating Word report: {filename}...")
    document = Document()
    document.add_heading('Exploratory Data Analysis of Indian Agricultural Costs and Yields', 0)
    document.add_paragraph(
        'This report presents a comprehensive Exploratory Data Analysis (EDA) of the cost of cultivation in India, based on the provided dataset. '
        'The following sections visualize and explain key findings related to costs, yields, profitability, and regional variations.'
    )

    # --- NEW GLOSSARY SECTION ---
    document.add_heading('Glossary: Understanding Agricultural Cost Concepts', level=1)
    document.add_paragraph(
        "The dataset uses standard cost concepts defined by India's Commission for Agricultural Costs and Prices (CACP). "
        "Understanding these is key to the analysis. The primary metric used in this report is Cost C2, as it is the most comprehensive measure of a farmer's expenses."
    )

    definitions = {
        "Cost A1": "All actual paid-out expenses in cash and kind. This includes hired labor, seeds, fertilizers, irrigation, etc.",
        "Cost A2": "Cost A1 + rent paid for any land leased by the farmer.",
        "Cost A2+FL": "Cost A2 + the imputed value of unpaid labor contributed by the farmer and their family.",
        "Cost B1": "Cost A1 + interest on the value of owned fixed capital assets (e.g., machinery), excluding land.",
        "Cost B2": "Cost B1 + the rental value of the farmer's own land and rent paid for leased land.",
        "Cost C1": "Cost B1 + imputed value of family labor.",
        "Cost C2 (Comprehensive Cost)": "Cost B2 + imputed value of family labor. This is the most comprehensive cost, accounting for all paid and imputed expenses. It is the 'Total Cultivation Cost' used throughout this report.",
        "Cost C3": "Cost C2 + 10% of Cost C2 to account for the farmer's managerial functions."
    }

    # One bullet per term, with the term itself in bold.
    for term, definition in definitions.items():
        p = document.add_paragraph(style='List Bullet')
        p.add_run(term).bold = True
        p.add_run(f": {definition}")

    document.add_paragraph() # Add space
    # --- END OF NEW SECTION ---

    for filepath, info in sorted(plots_info.items(), key=_plot_order):
        try:
            document.add_heading(info['title'], level=1)
            document.add_paragraph(info['description'])
            document.add_picture(filepath, width=Inches(6.0))
            document.add_paragraph() # Add some space
        except Exception as e:
            # Best effort: report the failure and continue with the remaining plots.
            print(f"Could not add {filepath} to document. Error: {e}")

    document.save(filename)
    print(f"Word report '{filename}' created successfully.")

def main():
    """Run the EDA pipeline end to end: load, clean, engineer features, plot, report."""
    # Locations of the input dataset and of every artifact the pipeline writes
    report_file = 'Agricultural_EDA_Report.docx'
    output_dir = 'eda_plots'
    cleaned_csv_file = 'cleaned_cost_of_cultivation.csv'
    csv_file = 'cost-of-cultivation.csv'

    # Nothing can be done without the raw dataset, so bail out early
    dataset_present = os.path.exists(csv_file)
    if not dataset_present:
        for message in (
            f"Error: The file '{csv_file}' was not found.",
            "Please make sure the dataset is in the same directory as the script.",
        ):
            print(message)
        return

    # --- Pipeline ---
    # Step 1: read the raw CSV
    print(f"Loading raw data from '{csv_file}'...")
    raw_frame = pd.read_csv(csv_file)

    # Step 2: clean it and persist the cleaned version
    cleaned_frame = clean_data(raw_frame)
    print(f"Saving cleaned data to '{cleaned_csv_file}'...")
    cleaned_frame.to_csv(cleaned_csv_file, index=False)

    # Step 3: derive the analysis features
    featured_frame = feature_engineering(cleaned_frame)

    # Step 4: render every chart to disk
    plot_catalog = generate_visualizations(featured_frame, output_dir)

    # Step 5: assemble the Word report from the rendered charts
    create_word_report(plot_catalog, report_file)

    # Closing summary of everything that was produced
    closing_lines = (
        "\n--- EDA process complete! ---",
        f"The cleaned data has been saved as '{cleaned_csv_file}'.",
        f"All plots have been saved to the '{output_dir}' folder.",
        f"The final summary report has been saved as '{report_file}'.",
    )
    for line in closing_lines:
        print(line)

# Run the full pipeline when this code is executed directly (as a script or a notebook cell),
# but not when it is imported as a module.
if __name__ == '__main__':
    main()


Loading raw data from 'cost-of-cultivation.csv'...
Starting data cleaning...
Data cleaning complete.
Saving cleaned data to 'cleaned_cost_of_cultivation.csv'...
Performing feature engineering...
Feature engineering complete.
Generating visualizations... Plots will be saved in 'eda_plots/' directory.
  - Saved 1_crop_type_distribution.png
  - Saved 2_state_distribution.png
  - Saved 3_avg_cost_by_state.png
  - Saved 4_top_10_costliest_crops.png
  - Saved 5_op_vs_fix_cost.png
  - Saved 6_labor_breakdown.png
  - Saved 7_input_costs.png
  - Saved 8_yield_by_state.png
  - Saved 9_top_10_yielding_crops.png
  - Saved 10_revenue_composition.png
  - Saved 11_cost_vs_yield.png
  - Saved 12_net_return_distribution.png
  - Saved 13_top_10_profitable_crops.png
  - Saved 14_top_10_least_profitable_crops.png
  - Saved 15_avg_profit_by_state.png
  - Saved 16_profit_dist_by_croptype.png
  - Saved 17_trends_over_time.png
  - Saved 18_correlation_heatmap.png
  - Saved 19_cost_structure_paddy.png
  - Save

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>