In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression

In [None]:
# Synthetic regression problem: 100 samples, 100 features, of which only 10
# are informative. A fixed random_state makes every run of the notebook
# generate the same data, so the features selected below are reproducible.
X, y = make_regression(
    n_samples=100,
    n_features=100,
    n_informative=10,
    random_state=42,
)
X.shape

In [None]:
# Display the target vector returned by make_regression.
y

In [None]:
# Build a univariate selector: features are scored with f_regression and
# the 10 highest-scoring ones are kept.
fs = SelectKBest(score_func=f_regression, k=10)

In [None]:
# Fit the selector on (X, y), then keep only the selected columns of X.
X_selected = fs.fit(X, y).transform(X)
X_selected.shape

In [None]:
# Display the fitted selector object.
fs

In [None]:
import pandas as pd

In [None]:
# Load the agriculture dataset (path is relative to the notebook's working
# directory) and display it.
df = pd.read_csv('agriculture_data.csv')
df

In [None]:
# Separate predictors from the target: X is every column except the target,
# y is the target column itself.
target_column = 'Crop_Yield'
X = df.drop(columns=target_column)
y = df[target_column]
y

In [None]:
# fs = SelectKBest(f_regression, k=2)
# X_selected = fs.fit_transform(X, y)
# X_selected

In [None]:
import seaborn as sns

# Pairwise correlations between the four inputs and the yield,
# drawn as an annotated heatmap (two decimals per cell).
df_data = df.loc[:, ['Temperature', 'Rainfall', 'Soil_pH', 'Fertilizer', 'Crop_Yield']]
df_copy = df_data.corr()
ax = sns.heatmap(
    data=df_copy,
    cmap="YlGnBu",
    fmt=".2f",
    linewidths=0.5,
    annot=True,
)

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Outlier inspection and missing-value handling

# Columns inspected for outliers
features = ['Temperature', 'Rainfall', 'Soil_pH', 'Fertilizer', 'Crop_Yield']

# Draw the box plots on log-transformed values, but keep those values in a
# temporary frame. Writing them back into `df` would make this cell
# non-idempotent (every re-run would take another log) and would stack with
# the log transform applied to skewed features in the Feature Transformation
# section.
log_features = np.log(df[features])
log_features.boxplot(figsize=(8, 4))

plt.title('Box Plot for Outlier Detection')
plt.ylabel('Values')
plt.xticks(rotation=45)

# Replace missing values in every numeric column with that column's median.
# Re-running this is harmless: once filled, there is nothing left to replace.
numerical_cols = df.select_dtypes(include=['number']).columns
for col in numerical_cols:
    median = df[col].median()
    df[col] = df[col].fillna(median)

df

In [None]:
# Show the columns listed in `features` as they stand after the previous cell.
df[features]

## Feature Transformation

In [None]:
# Numeric columns holding at least one exact zero
has_zero = df.eq(0).any()
numerical_0s = df.loc[:, has_zero].select_dtypes(include='number').columns

# Drop those zero-containing columns from the set under consideration
numerical_cols = numerical_cols.difference(numerical_0s)

# Skewness of each remaining column
skewness = df[numerical_cols].skew()

# Columns whose |skewness| exceeds 1 (treated here as highly skewed).
# NOTE: this selection is not read again in this cell; the list that is
# actually transformed is hard-coded just below.
skewed_columns = skewness.loc[skewness.abs() > 1]

# Features to transform
skewed_features = ['Temperature']

# Log-transform each listed feature in place to reduce its skew
for col in skewed_features:
    df[col] = np.log(df[col])

skewness

In [None]:
# Summary statistics of the Temperature column
temp_data = df['Temperature']
temp_mean, temp_median = temp_data.mean(), temp_data.median()
temp_std = temp_data.std()
temp_mode = temp_data.mode()  # a Series: every value tied for most frequent

# Pearson's second skewness coefficient: 3 * (mean - median) / std
mean_median_gap = temp_mean - temp_median
skew = 3 * mean_median_gap / temp_std

print(f"The pearson's second skewness coefficient distribution is {skew}")
print(f"Mean: {temp_mean}")
print(f"Median: {temp_median}")
print(f"Mode: {temp_mode}")

In [None]:
# Arithmetic mean of a three-element sample, computed with NumPy
mean = np.array([1, 2, 3]).mean()
mean

## Kurtosis

In [None]:
# Re-read the CSV so `agric_data` holds the file's contents independently of
# the changes made to `df` earlier in the notebook.
agric_data = pd.read_csv('agriculture_data.csv')

# Kurtosis of every numeric column in the freshly loaded data
raw_kurtosis = agric_data.select_dtypes(include='number').kurtosis()
print(f"Kurtosis for agric dataset: {raw_kurtosis}")
print("-----------------------------------")

# Kurtosis of the retained numeric columns of the working frame `df`
working_kurtosis = df[numerical_cols].kurtosis()
print(f"Kurtosis for Numerical cols: {working_kurtosis}")

## Visualizing Skewness and Kurtosis

In [None]:
# KDE of the Temperature column with mean, median and mode marked
temperatures = agric_data['Temperature']
kde_ax = sns.kdeplot(temperatures)

plt.title("KDE plot of temperature")
plt.xlabel("Temp (°C)")

# Locate the mode as the peak of the plotted density curve.
# Series.mode() returns every value tied for most frequent, so indexing it
# with a fixed label (previously 55) raises KeyError unless at least 56
# values tie, and otherwise marks an arbitrary one of them.
# This must run before the axvline calls, which add further lines to the axes.
density_x, density_y = kde_ax.lines[-1].get_data()
kde_mode = density_x[np.argmax(density_y)]

# Add vertical lines at the position of mean, median, and mode
plt.axvline(temperatures.mean(), label="Mean")
plt.axvline(temperatures.median(), color="black", label="Median")
plt.axvline(kde_mode, color="green", label="Mode")
plt.legend()

# Sample mode(s) of Temperature, for comparison with the density peak
print(f"Mode: {temperatures.mode()}")
print("--------------------------")
# Per-column modes of the whole dataset
agric_data.mode().squeeze()

In [None]:
# Histogram of Temperature with a KDE curve overlaid
sns.histplot(data=agric_data, x='Temperature', kde=True)
plt.title("A Histogram of Temperature")

In [None]:
#skewness fxn

def skew_fxn(data):
    """Return the adjusted Fisher-Pearson sample skewness of ``data``.

    Computes ``G1 = n / ((n - 1) * (n - 2)) * sum(((x - mean) / s) ** 3)``
    where ``s`` is the *sample* standard deviation (``ddof=1``). The
    ``n / ((n - 1)(n - 2))`` factor is only correct together with the sample
    standard deviation; with the population one (NumPy's default ``ddof=0``)
    the result is inflated by ``(n / (n - 1)) ** 1.5``. For data without
    missing values this is the same estimator as ``pandas.Series.skew``.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric observations (list, ndarray, Series, ...).

    Returns
    -------
    float
        The skewness, or ``nan`` when fewer than three observations are
        given (the correction factor is undefined there). Missing values
        are not dropped, so any NaN in ``data`` yields ``nan``.
    """
    values = np.asarray(data, dtype=float)
    n = values.size
    if n < 3:
        # (n - 1) * (n - 2) would be zero
        return float("nan")

    mean = values.mean()
    std = values.std(ddof=1)  # sample std, to match the correction factor
    first_part = n / ((n - 1) * (n - 2))
    second_part = np.sum(((values - mean) / std) ** 3)
    return first_part * second_part

# Apply the hand-written skewness function to the Temperature column of agric_data.
skew_fxn(agric_data["Temperature"])

In [None]:
# kurtosis fxn
def kurtosis_fxn(data):
    """Return the population excess kurtosis of ``data``.

    Each observation is standardized with the mean and the standard
    deviation as returned by ``np.mean`` / ``np.std`` (NumPy's default is
    the population form, ``ddof=0``). The fourth powers are averaged over
    ``len(data)`` and 3 is subtracted, so a normal distribution scores
    about 0. No small-sample bias correction is applied, so the value
    generally differs from ``pandas.Series.kurtosis``.
    """
    count = len(data)
    centre = np.mean(data)
    spread = np.std(data)
    standardized = (data - centre) / spread
    fourth_power_total = sum(standardized ** 4)
    return 1 / count * fourth_power_total - 3

# Apply the hand-written kurtosis function to the Temperature column of agric_data.
kurtosis_fxn(agric_data["Temperature"])