In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler,MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Location of the raw crop-production CSV (machine-specific absolute path).
RAW_CSV_PATH = r"C:\Users\Tippu\Downloads\Crop Production data.csv"

# `data` is the working frame that the following cells transform; `data1` is an
# untouched copy of the raw file that is reused when the final table is assembled.
data = pd.read_csv(RAW_CSV_PATH)
data1 = data.copy()

In [None]:
# Schema before the conversion.
data.info()

# Parse the year column into datetimes (the original note says it starts out as
# int64); format='%Y' tells pandas each value is a bare four-digit year.
year_as_datetime = pd.to_datetime(data['Crop_Year'], format='%Y')
data['Crop_Year'] = year_as_datetime

data['Crop_Year'].unique()  # not rendered: only a cell's last expression is shown
# Schema after the conversion, to confirm the new dtype.
data.info()


In [None]:
# Move Crop_Year from a column into the index.
# The guard keeps the cell re-runnable: once Crop_Year is the index it is no
# longer a column, so an unconditional set_index would raise KeyError on a
# second execution.
if 'Crop_Year' in data.columns:
    data.set_index('Crop_Year', inplace=True)

# Quick look at the categorical spread; only the final expression (the shape)
# is rendered as the cell output.
data['State_Name'].value_counts()
data['Season'].nunique()
data.shape

In [None]:
# Summary statistics of the target column.
production = data['Production']
production.describe()

In [None]:
# Crop_Year is the index at this point, so duplicated() compares only the
# remaining columns. Original note: no duplicates are present.
duplicate_row_count = data.duplicated().sum()  # computed but not rendered

# Missing values per column. Original note: null values are present in Production.
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Print the distinct values of every object-dtype column, one array per column.
text_columns = data.select_dtypes(include='object')
for column_name in text_columns.columns:
    print(text_columns[column_name].unique())

In [None]:
# Scatter plot of cultivated area against production
# (the original note describes the relationship as non-linear).
fig, ax = plt.subplots()
ax.scatter(data['Area'], data['Production'])
ax.set_xlabel('Area')
ax.set_ylabel('Production')
ax.set_title('Production vs. Area')
# Use pyplot.show() instead of fig.show(): with a non-GUI backend such as the
# notebook's inline backend, Figure.show() cannot open a window and only emits
# a warning.
plt.show()

In [None]:
# Pairwise Pearson correlation between the numeric columns (Area, Production).
# Original interpretation: this implies that the yield per unit area varies.
corr_ = data.corr(method='pearson', numeric_only=True)
print(corr_)

'''
Pearson correlation coefficient is a measure of the linear association between two variables.Range[-1,1]
#pcc is outlier sensitive.
But even if a Pearson correlation coefficient tells us that two variables are uncorrelated,
they could still have some type of nonlinear relationship.
'''

In [None]:
# Idea (not applied): derive yield as production per unit area.
#data['yield']=data['Production']/data['Area']
# Poor farming practices, adverse weather conditions, or pest infestations can lower the yield, so total production can fall even when the cultivated area stays the same.

In [None]:
'''
Factors Affecting Area and Yield
Land Availability: The amount of cultivable land available for agriculture.
Soil Fertility: The quality and fertility of the soil affecting crop growth and
yield.
Water Resources: Availability of water for irrigation.
Climate: Suitable climate conditions for the specific crop.
Technological Advancements: Use of modern farming techniques and equipment.
Agricultural Practices: Quality of seeds, use of fertilizers and pesticides, 
crop rotation practices.
Government Policies: Policies affecting land use, subsidies, and support for 
farmers.
Economic Factors: Market prices, input costs, and profitability influencing 
farmers’ decisions on the area to be cultivated.'''

In [None]:
# One interactive box plot per numeric variable, drawn in this order.
box_plot_specs = (
    ('Production', 'box plot for production variable'),
    ('Area', 'box plot for area variable'),
)
for column_name, plot_title in box_plot_specs:
    fig = px.box(data, x=column_name, title=plot_title)
    fig.show()

In [None]:
# Number of distinct values in each categorical column, printed on one line in
# the order listed. (Original notes record 646 districts and 33 states.)
categorical_columns = ('District_Name', 'State_Name', 'Season', 'Crop')
print(*(data[name].nunique() for name in categorical_columns))

# Memory footprint of the frame in MiB; deep=True also measures the contents of
# object columns instead of just their pointers.
memory_usage = data.memory_usage(deep=True)
total_memory_usage = memory_usage.sum()
a = total_memory_usage / 2**20
a

In [None]:
# One-hot encode every object-dtype column. drop='first' omits the first
# category of each feature so the indicator columns are not collinear.
cat = data.select_dtypes(include='object').columns
onc = OneHotEncoder(categories='auto', drop='first')
encoded_sparse = onc.fit_transform(data[cat])
on1 = encoded_sparse.toarray()  # materialise the encoder output as a dense array
feature_names = onc.get_feature_names_out(cat)
on2 = pd.DataFrame(data=on1, columns=feature_names)
on2  # not rendered: only a cell's last expression is shown

# Memory footprint of the encoded frame, in MiB.
m1 = on2.memory_usage(deep=True)
tm = m1.sum()
tmm1 = tm / 2**20
tmm1

In [None]:
# Join the numeric columns with the one-hot columns.
# on2 was built from a bare array, so it first takes over data's index; the
# column-wise concat can then line the two frames up row by row.
on2.index = data.index
d1 = data.drop(columns=cat)
d2 = pd.concat(objs=[d1, on2], axis=1)

# Memory footprint of the combined frame, in MiB.
m2 = d2.memory_usage(deep=True)
tm1 = m2.sum()
tmm2 = tm1 / 2**20
tmm2

In [None]:
# Two-stage scaling of the numeric columns. Both fitted scaler instances are
# kept because later cells call inverse_transform on them.
col = ['Area', 'Production']

# Stage 1: RobustScaler on the raw Area / Production values.
inst_rb = RobustScaler()
rb_scaler = inst_rb.fit_transform(d2[col])
rb_scaler2 = pd.DataFrame(data=rb_scaler, columns=col)
rb_scaler2.describe()  # not rendered: only a cell's last expression is shown

# Stage 2: MinMaxScaler on the stage-1 output.
inst_mn = MinMaxScaler()
mn_scaler = inst_mn.fit_transform(rb_scaler2[col])
mn_scaler1 = pd.DataFrame(data=mn_scaler, columns=col)
mn_scaler1.describe()

In [None]:
# mn_scaler1 was built from a bare array and carries a default positional
# index; give d2 the same index so the column-wise concat aligns row by row.
d2.index = mn_scaler1.index

# d3 holds both the raw and the scaled Area/Production, so those two column
# names each appear twice (raw first, scaled last).
d3 = pd.concat([d2, mn_scaler1], axis=1)
d3.sample(5)  # not rendered: only a cell's last expression is shown

# Keep the last occurrence of every column name. This drops the raw
# Area/Production and keeps the scaled ones by name, instead of assuming the
# raw pair sits in the first two positions.
d4 = d3.loc[:, ~d3.columns.duplicated(keep='last')]
d4.describe()

In [None]:
# Memory footprint of the model-ready frame d4, in MiB.
m3 = d4.memory_usage(deep=True)
tm3 = m3.sum()
tmm3 = tm3 / 2**20
tmm3

In [None]:
# Index labels of the rows whose Production is missing.
# NOTE: `id` shadows the built-in id(); the name is kept because the next cell
# reads it to build the training frame.
id = d4[d4['Production'].isna()].index

# Rows to impute. These are index *labels*, so select with .loc; .iloc is
# positional and would only agree while labels happen to equal row positions.
ms_df = d4.loc[id]
ms_df.sample(5)

In [None]:
# Training rows: every row of d4 whose label is not among the missing-target rows.
is_missing_row = d4.index.isin(id)
train_df = d4[~is_missing_row]
train_df  # not rendered: only a cell's last expression is shown

# Target vector and feature matrix for the regressor.
y_df = train_df['Production']
x_df = train_df.drop(columns=['Production'])
x_df.sample(5)


In [None]:
# Hold-out split of the labelled rows. random_state pins the shuffle so the
# same train/test partition is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x_df, y_df, test_size=0.2, shuffle=True, random_state=42
)

# Distance-weighted k-nearest-neighbours regressor (not a KNNImputer) used to
# predict the missing Production values.
# It is fitted on all labelled rows (x_df / y_df), not only on x_train, so the
# split above does not affect the fitted model.
reg = KNeighborsRegressor(n_neighbors=5, weights='distance', n_jobs=-1)
model_reg = reg.fit(x_df, y_df)


In [None]:
# Predict Production for the rows where it is missing.
# The all-NaN target column must be removed before predicting.
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
ms_df = ms_df.drop(columns=['Production'], errors='ignore')
pred = model_reg.predict(ms_df)
pred1 = pd.DataFrame(pred, columns=['Production'])
pred1.describe()

In [None]:
# Give the predictions the row labels of the rows they were made for, then pair
# them with the (still scaled) Area of those rows.
pred1.index = ms_df.index
y1 = pd.concat([ms_df['Area'], pred1], axis=1)

# Undo the scaling in the reverse order it was applied: MinMax first ...
after_minmax = inst_mn.inverse_transform(y1[col])
y_org1 = pd.DataFrame(after_minmax, columns=y1.columns)
# ... then Robust, which brings Area and Production back to their original scale.
after_robust = inst_rb.inverse_transform(y_org1[col])
y_org2 = pd.DataFrame(after_robust, columns=y1.columns, index=y1.index)
y_org2

In [None]:
# Replace the scaled Area of the imputed rows with the original-scale Area and
# the predicted Production from y_org2.
ms1 = ms_df.drop(columns=['Area'])
ms2 = pd.concat(objs=[ms1, y_org2], axis=1)
ms2

In [None]:
# Encoded feature columns of the training rows (Area / Production removed).
tf = train_df.drop(['Area', 'Production'], axis=1)

# Bring the training rows' Area / Production back to their original scale by
# undoing MinMax first and Robust second.
# inverse_transform returns a bare array, so each rebuilt frame is given
# train_df's index explicitly. Without it the frames would get a fresh 0..n-1
# index that no longer matches the original row labels (train_df has gaps where
# the missing-Production rows were removed), and the later row-wise concat with
# the imputed rows would yield duplicate, misaligned labels.
train_df_modified = pd.DataFrame(
    inst_mn.inverse_transform(train_df[col]),
    columns=col,
    index=train_df.index,
)
train_df_modified1 = pd.DataFrame(
    inst_rb.inverse_transform(train_df_modified[col]),
    columns=col,
    index=train_df.index,
)
train_df_modified1

In [None]:
# Raw copy of the data without its numeric columns; these are re-attached from
# the processed values in the next cell.
data2 = data1.drop(columns=['Area', 'Production'])

# Stack the imputed rows on top of the training rows, then order by row label.
t_m2 = pd.concat(objs=[y_org2, train_df_modified1], axis=0)
t1_ = t_m2.sort_index()

In [None]:
#concatinate the processed data
data_final=pd.concat([data2,t1_],axis=1)
data_final.isna().sum()      #no null values are present after the preprocessing
data_final

In [None]:
# Persist the cleaned table in both formats, written to the working directory.
# The Excel export keeps the index as its first column; the CSV export omits it.
data_final.to_excel(excel_writer='crop_cleaned.xlsx')
data_final.to_csv(path_or_buf='crop_cleaned.csv', index=False)