# Analysis crop production in india



In [33]:
import numpy as np
import pandas as pd 
import os

In [34]:
df = pd.read_csv('crop_production.csv')
df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [35]:
df.columns

Index(['State_Name', 'District_Name', 'Crop_Year', 'Season', 'Crop', 'Area',
       'Production'],
      dtype='object')

In [36]:
df.shape

(246091, 7)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246091 entries, 0 to 246090
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     246091 non-null  object 
 1   District_Name  246091 non-null  object 
 2   Crop_Year      246091 non-null  int64  
 3   Season         246091 non-null  object 
 4   Crop           246091 non-null  object 
 5   Area           246091 non-null  float64
 6   Production     242361 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.1+ MB


In [38]:
df.describe()

Unnamed: 0,Crop_Year,Area,Production
count,246091.0,246091.0,242361.0
mean,2005.643018,12002.82,582503.4
std,4.952164,50523.4,17065810.0
min,1997.0,0.04,0.0
25%,2002.0,80.0,88.0
50%,2006.0,582.0,729.0
75%,2010.0,4392.0,7023.0
max,2015.0,8580100.0,1250800000.0


# Data Cleaning

In [39]:
df.isnull().sum()

State_Name          0
District_Name       0
Crop_Year           0
Season              0
Crop                0
Area                0
Production       3730
dtype: int64

In [40]:
df.dropna(inplace = True)

In [68]:
from sklearn.impute import SimpleImputer
# Impute missing values in the target variable
df['Production'] = SimpleImputer(strategy='mean').fit_transform(df['Production'].values.reshape(-1, 1))
df['Production'] = np.log1p(df['Production'])
df.isnull().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

In [61]:
df.shape

(246091, 7)

In [62]:
df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


# Exploratory Analysis & Visualization

**1.Overall Crop Production by state**

**2.Productivity of different states**

**3.Overall Crop Production Through years**

**4.Crops which are produced the most**

**5.Average Crop Area through years**

In [63]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
ohe = OneHotEncoder(drop='first')
scale = StandardScaler()

preprocesser = ColumnTransformer(
        transformers = [
            ('StandardScale', scale, [0, 1, 2, 3]),
            ('OHE', ohe, [4, 5]),
        ],
        remainder='passthrough'
)

In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

file_path = 'crop_production.csv'
df = pd.read_csv(file_path)

# Separate numerical and categorical columns
numeric_cols = ['Crop_Year', 'Area']
categorical_cols = ['State_Name', 'District_Name', 'Season', 'Crop']

# Create preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values in numeric columns
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values in categorical columns
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split the data
X = df.drop('Production', axis=1)
y = df['Production']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0, shuffle=True)

# Apply preprocessing
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)


In [65]:
print(df['Production'].isnull().sum())

3730


In [66]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')  # You can use other strategies like 'median', 'most_frequent', etc.
df['Production'] = imputer.fit_transform(df['Production'].values.reshape(-1, 1))

In [50]:
print(np.isfinite(df['Production']).all())

True


In [59]:
print(np.isnan(X_train_preprocessed).any())
print(np.isfinite(X_train_preprocessed).all())

False
True


In [70]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,r2_score


models = {
    'lr':LinearRegression(),
    'lss':Lasso(),
    'Rid':Ridge(),
    'Dtr':DecisionTreeRegressor()
}
for name, md in models.items():
    md.fit(X_train_preprocessed,y_train)
    y_pred = md.predict(X_test_preprocessed)
    
    print(f"{name} : mae : {mean_absolute_error(y_test,y_pred)} score : {r2_score(y_test,y_pred)}")

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [71]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr.fit(X_train_preprocessed,y_train)
dtr.predict(X_test_preprocessed)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').