In [18]:
# Load the soil data set from the Kaggle input directory into a data frame.
dataset <- read.csv("/kaggle/input/soil-data/new soil data.csv")

In [19]:
# Print a per-column summary of the loaded data (quartiles and mean for
# numeric columns, length/class/mode for character columns).
summary(dataset)

 District_Name      SOIL.TYPE.1             Area           Rain_Fall   
 Length:29          Length:29          Min.   : 321976   Min.   :1156  
 Class :character   Class :character   1st Qu.: 760658   1st Qu.:1312  
 Mode  :character   Mode  :character   Median :1051768   Median :1368  
                                       Mean   :1101690   Mean   :1381  
                                       3rd Qu.:1443499   3rd Qu.:1421  
                                       Max.   :2361199   Max.   :1734  
      Temp          Humidity       Production     
 Min.   :25.36   Min.   :65.04   Min.   : 330153  
 1st Qu.:26.46   1st Qu.:71.16   1st Qu.: 716721  
 Median :26.69   Median :71.86   Median :1352362  
 Mean   :26.77   Mean   :72.87   Mean   :1413243  
 3rd Qu.:27.34   3rd Qu.:75.68   3rd Qu.:2047906  
 Max.   :27.75   Max.   :80.05   Max.   :3938172  

In [None]:
# Taking care of missing values

# Replace every NA in `x` with one summary statistic computed from the
# non-missing values of the same vector.
#   x    : vector to impute.
#   stat : summary function that accepts `na.rm` (e.g. mean, median).
# Returns a vector of the same length as `x`; observed values are unchanged.
impute_na <- function(x, stat) {
  ifelse(is.na(x), stat(x, na.rm = TRUE), x)
}

# Area is filled with the column mean; every other numeric column is filled
# with its median. Each column is imputed exactly once.
dataset$Area <- impute_na(dataset$Area, mean)
for (column in c("Rain_Fall", "Temp", "Humidity", "Production")) {
  dataset[[column]] <- impute_na(dataset[[column]], median)
}

In [None]:
library(NbClust)
library(cluster)

# Use the district names as row labels so cluster assignments print by name.
rownames(dataset) <- dataset$District_Name

# Select columns for analysis
data <- dataset[, c("Rain_Fall", "Temp", "Humidity")]

# Scale the numerical features (zero mean, unit variance per column)
df <- as.data.frame(scale(data))

# kmeans() picks random starting centres; fix the seed so the elbow curve,
# cluster labels and cluster sizes are reproducible between runs.
set.seed(123)

# Compute the total within-cluster sum of squares for each candidate k.
# vapply() guarantees a numeric vector with one value per k.
k_values <- 2:10
ss <- vapply(
  k_values,
  function(k) kmeans(df, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)

# Plot the elbow curve
plot(k_values, ss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters (k)",
     ylab = "Within-cluster sum of squares")

# Number of clusters read off the elbow curve by eye
optimal_k <- 3  # Adjust this value based on the elbow curve

# Perform k-means clustering with the chosen number of clusters
km.res <- kmeans(df, centers = optimal_k, nstart = 25)

# Visualize the clustering results (pairwise scatterplots of the scaled
# features, coloured by cluster)
plot(df, col = km.res$cluster, pch = 19, main = "K-means Clustering")

# Print the k-means results
print(km.res)

# Check which record belongs to which cluster
cluster_labels <- km.res$cluster
print(cluster_labels)

# Cluster sizes
cluster_sizes <- table(km.res$cluster)
print(cluster_sizes)

In [None]:
# Evaluate 2..10 clusters with NbClust using complete-linkage hierarchical
# clustering on Euclidean distances.
nb_clusters <- NbClust(df, diss = NULL, distance = "euclidean",
                       min.nc = 2, max.nc = 10, method = "complete")

# Best.nc has one column per validity index; its first row is the number of
# clusters proposed by that index. Taking only element [1] would use a single
# index, so apply the majority rule across all indices instead. Proposals
# below min.nc (e.g. 0 from the graphical indices) and NAs are ignored; ties
# resolve to the smallest k.
proposed_k <- nb_clusters$Best.nc[1, ]
proposed_k <- proposed_k[!is.na(proposed_k) & proposed_k >= 2]
k_votes <- table(proposed_k)
num_clusters <- as.integer(names(k_votes)[which.max(k_votes)])

# kmeans() uses random starting centres; fix the seed for reproducible labels.
set.seed(123)

# Compute k-means with the selected number of clusters
km.res <- kmeans(df, num_clusters, nstart = 25)

# Visualize the clustering results
clus_plot <- clusplot(df, km.res$cluster, color = TRUE, shade = TRUE,
                      labels = 0, lines = 0)
clus_plot

# Print the k-means results
km.res

# Check which record belongs to which cluster
cluster_labels <- km.res$cluster
print(cluster_labels)

# Cluster sizes
cluster_sizes <- table(km.res$cluster)
print(cluster_sizes)

In [None]:
# Covariance matrix of the scaled features. stats::cov is called explicitly
# because the result is stored under the same name as the function.
cov <- stats::cov(df)

# Decompose once so eigenvalues and eigenvectors come from the same call.
eig <- eigen(cov)

# Apply ONE descending ordering to both the values and the vector columns so
# that eigvecs[, i] always belongs to eigvals[i].
eig_order <- order(eig$values, decreasing = TRUE)
eigvals <- eig$values[eig_order]
eigvecs <- eig$vectors[, eig_order, drop = FALSE]

# Plot the two largest eigenvalues (variance along the first two components)
plot(eigvals[1:2], type = "b", xlab = "Principal Component", ylab = "Eigenvalue")

In [None]:
# Perform PCA on the scaled features. The argument is spelled `scale.` in
# full rather than relying on partial matching of `scale`.
my_pca <- prcomp(df, scale. = TRUE, center = TRUE, retx = TRUE)

# Print names and summary
print(names(my_pca))
print(my_pca)
print(summary(my_pca))

# Print principal components (scores of every observation)
print("The Principal Components are:")
print(dim(my_pca$x))
print(my_pca$x)

# Plotting the resultant principal components
biplot(my_pca, main = "Biplot", scale = 0)

# Add points to the biplot
# NOTE(review): biplot() overlays a second set of axes for the loadings, so
# these points may be drawn on the loading scale rather than the score
# scale - verify the overlay visually.
points(my_pca$x[, 1], my_pca$x[, 2], col = "red")

# Standard deviation of each principal component
print(my_pca$sdev)

# Variance of each principal component
my_pca.var <- my_pca$sdev^2
print(my_pca.var)

In [None]:
# Share of the total variance carried by each principal component
propve <- my_pca.var / sum(my_pca.var)
print(propve)

# Scree plot: proportion of variance per component
plot(
  propve,
  type = "b",
  ylim = c(0, 1),
  xlab = "principal component",
  ylab = "Proportion of Variance Explained",
  main = "Scree Plot"
)

# Running total of the explained variance
cum_propve <- cumsum(propve)
plot(
  cum_propve,
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained"
)

# Index of the first component at which the running total reaches 90 %
# (NA when the threshold is never reached)
top_pc <- match(TRUE, cum_propve >= 0.9)
print(paste("Top PCA is", top_pc))

In [None]:
# Load the second data set (fdata.csv) from the Kaggle input directory and
# print a per-column summary of it.
fdata <- read.csv("/kaggle/input/odisha-data/fdata.csv")
summary(fdata)

In [None]:
# Preview the first rows of fdata.
head(fdata)

In [None]:
# Preview the last rows of fdata.
tail(fdata)

In [None]:
# Taking care of missing values

# Replace every NA in `x` with one summary statistic computed from the
# non-missing values of the same vector.
#   x    : vector to impute.
#   stat : summary function that accepts `na.rm` (e.g. mean, median).
# Returns a vector of the same length as `x`; observed values are unchanged.
impute_na <- function(x, stat) {
  ifelse(is.na(x), stat(x, na.rm = TRUE), x)
}

# Area is filled with the column mean; the other numeric columns are filled
# with their median.
fdata$Area <- impute_na(fdata$Area, mean)
for (column in c("Rainfall", "Avg_temp", "Humidity", "Production")) {
  fdata[[column]] <- impute_na(fdata[[column]], median)
}

# Report whether any missing value is left anywhere in fdata. is.null() is
# always FALSE for a data frame, so anyNA() is used to actually test for NAs.
null_values <- anyNA(fdata)
null_values

In [None]:
# Keep only the modelling columns: five predictors plus the response.
d <- fdata[, c("Area", "Avg_temp", "Rainfall", "Humidity", "Soil", "Production")]

# Encoding categorical data
# Soil descriptions are matched exactly (including spacing); any value that
# is not in this list becomes NA.
# NOTE(review): several entries differ only in whitespace or word order and
# may describe the same soil combination - confirm against the raw data.
soil_levels <- c('RED AND YELLOW SOIL&RED SOIL', 'BLACK SOIL &RED SOIL',
                 'ALLUVIAL SOIL & LITERITE SOIL & SALINE SOIL',
                 'LITERITE SOIL & RED SOIL & RED AND BLACK SOIL',
                 'ALLUVIAL SOIL  & LITERITE SOIL &SALINE SOIL',
                 'SALINE SOIL & LITERITE SOIL & ALLUVIAL SOIL',
                 'ALLUVIAL SOIL  & LITERITE SOIL & BLACK',
                 'RED SOIL & RED AND YELLOW SOIL', 'RED SOIL & LITERITE SOIL',
                 'LITERITE SOIL & BROWN FOREST',
                 'ALLUVIAL SOIL & BLACK & BROWN FOREST & SALINE SOIL',
                 'ALLUVIAL SOIL & LITERITE SOIL', 'ALLUVIAL SOIL & SALINE SOIL',
                 'ALLUVIAL SOIL & LITERITE SOIL & RED SOIL', 'RED SOIL',
                 'RED SOIL & LITERITE SOIL & ALLUVIAL SOIL',
                 'ALLUVIAL SOIL & BLACK& RED SOIL', 'BLACK SOIL  & RED SOIL',
                 'RED AND BLACK SOIL & RED AND YELLOW SOIL',
                 'RED SOIL & BLACK SOIL & BROWN FOREST',
                 'RED SOIL & LITERITE SOIL & RED AND YELLOW SOIL')
# Labels are the positions 1..21 of the descriptions above.
d$Soil <- factor(d$Soil, levels = soil_levels, labels = seq_along(soil_levels))

# Splitting the dataset into the Training set and Test set (80 % / 20 %)
library(caTools)
# sample.split() draws at random; fix the seed so the split, and every model
# fitted on it, is reproducible.
set.seed(123)
split <- sample.split(d$Production, SplitRatio = 0.8)
training_set <- subset(d, split == TRUE)
test_set <- subset(d, split == FALSE)

# Fitting Multiple Linear Regression to the Training set
regressor <- lm(formula = Production ~ .,
                data = training_set)

# Predicting the Test set results
y_pred <- predict(regressor, newdata = test_set)

# Other useful functions
confint(regressor, level = 0.95) # CIs for model parameters
fitted(regressor) # predicted values
anova(regressor) # anova table

# Refit the model with every predictor on the complete data set `d`.
# No predictor is removed here, so this is the full model, not the result of
# a backward elimination.
# NOTE(review): this overwrites the training-set fit above - confirm that
# fitting on all rows is intended.
regressor <- lm(formula = Production ~ Area + Avg_temp + Rainfall + Humidity + Soil,
                data = d)
summary(regressor)

# diagnostic plots
layout(matrix(c(1, 2, 3, 4), 2, 2)) # optional 4 graphs/page
plot(regressor)
layout(1) # restore the single-panel layout for later plots

# Plot matrix of all variables.
newdata <- training_set[, 1:6]
summary(newdata)
plot(newdata, pch = 16, col = "blue", main = "Matrix Scatterplot of Area,Avg_teap,Rainfall,Humidity,Soil and production")

In [None]:
library(cluster)

# Convert every column to numeric (factor columns become their integer
# codes) and collect the columns into a numeric matrix. vapply() guarantees
# one numeric column of nrow(d) values per input column.
d <- vapply(d, as.numeric, numeric(nrow(d)))

# Drop rows that contain any missing value.
d <- na.omit(d)

# Compute CLARA with 3 clusters
# NOTE(review): the columns are clustered on their raw scales - consider
# `stand = TRUE` if large-valued columns should not dominate the distances.
clara.res <- clara(d, 3, samples = 50, pamLike = TRUE)

# Plot cluster assignments. plot() on a matrix draws its first column
# against its second column.
plot(d, col = clara.res$clustering, pch = 19,
     main = "CLARA Clustering Results",
     xlab = "Variable 1", ylab = "Variable 2")

In [None]:
library(caret)
library(MASS)

# Convert training and test sets to data frames if needed
training_set <- as.data.frame(training_set)
test_set <- as.data.frame(test_set)

# Only the predictors are centred and scaled; the response is kept out of the
# preprocessing so it is not transformed.
predictor_cols <- setdiff(names(training_set), "Production")

# Estimate preprocessing parameters on the training predictors
preproc.parameter <- preProcess(training_set[, predictor_cols, drop = FALSE],
                                method = c("center", "scale"))

# Transform the data using the estimated parameters
train.transform <- predict(preproc.parameter,
                           training_set[, predictor_cols, drop = FALSE])
test.transform <- predict(preproc.parameter,
                          test_set[, predictor_cols, drop = FALSE])

# lda() is a classifier and needs a categorical response. Production is
# continuous, so it is binned into ordered classes using quantile cut points
# taken from the training set only; the outer cut points are opened to
# -Inf/Inf so test values outside the training range still get a class.
# NOTE(review): three equal-frequency classes is an assumption - adjust the
# labels/breaks to the classes the analysis actually needs.
production_labels <- c("Low", "Medium", "High")
production_breaks <- quantile(
  training_set$Production,
  probs = seq(0, 1, length.out = length(production_labels) + 1),
  na.rm = TRUE
)
production_breaks[1] <- -Inf
production_breaks[length(production_breaks)] <- Inf
train.transform$Production <- cut(training_set$Production,
                                  breaks = production_breaks,
                                  labels = production_labels)
test.transform$Production <- cut(test_set$Production,
                                 breaks = production_breaks,
                                 labels = production_labels)

# Fit the model
model <- lda(Production ~ ., data = train.transform)

# Make predictions
predictions <- predict(model, test.transform)
head(predictions$class)
head(predictions$posterior)
head(predictions$x)

# Model accuracy: share of test rows whose predicted class equals the
# observed class
accuracy <- mean(predictions$class == test.transform$Production)
accuracy

# Model summary
summary(model)

In [None]:
# Plot 1: Scatter plot of LDA scores
# Colours must be positive integer codes, so the response is mapped to factor
# codes (one colour index per distinct value) instead of being passed as-is.
plot(predictions$x, col = as.integer(as.factor(test.transform$Production)),
     pch = 19, xlab = "LD1", ylab = "LD2")

In [None]:
library(tseries)
library(forecast)

# Treat the Rainfall column as a time series with 5 observations per period.
ts_data <- ts(fdata$Rainfall, frequency = 5)
plot(ts_data, main = "Time Series Plot")

# Classical decomposition of the series
decomp <- decompose(ts_data)
plot(decomp)

# ADF test
adf_result <- adf.test(ts_data)

# Interpret the result at the 5 % level
is_stationary <- adf_result$p.value < 0.05
if (!is_stationary) {
  print("The time series is non-stationary.")
} else {
  print("The time series is stationary.")
  plot(ts_data, main = "Stationarity Check")
}

# ACF and PACF plots of the series
acf(ts_data)
pacf(ts_data)

# Automatic ARIMA model selection
model <- auto.arima(ts_data)
summary(model)

# Residual analysis
residuals <- residuals(model)
acf(residuals)
pacf(residuals)
plot(residuals)

# Forecasting
forecast <- forecast(model, h = 12)  # Change 'h' to the desired forecast horizon
plot(forecast)