In [8]:
# Load necessary libraries
# install.packages("readxl")  # For reading Excel files
# install.packages("dplyr")   # For data manipulation

library(readxl)
library(dplyr)

In [16]:
# Set working directory.
# Forward slashes are used because "\ " (backslash followed by a space) is
# not a valid escape sequence in an R string literal and stops the line from
# parsing; R accepts "/" as the path separator on Windows.
# NOTE(review): assumes the folder is literally named "OEL 1" on drive D: -
# confirm against the actual directory layout.
setwd("D:/OEL 1")

In [10]:
# Read the online retail workbook and show which columns it contains
data <- read_excel(path = "online_retail_II.xlsx")
print(names(data))

[1] "Invoice"     "StockCode"   "Description" "Quantity"    "InvoiceDate"
[6] "Price"       "Customer ID" "Country"    


In [11]:
# Keep only the rows that have a value in the `Customer ID` column
cleaned_data <- filter(data, !is.na(`Customer ID`))

In [12]:
# Fill missing Price and Quantity entries with the median of the
# non-missing values in the same column
cleaned_data$Price <- replace(
  cleaned_data$Price,
  is.na(cleaned_data$Price),
  median(cleaned_data$Price, na.rm = TRUE)
)
cleaned_data$Quantity <- replace(
  cleaned_data$Quantity,
  is.na(cleaned_data$Quantity),
  median(cleaned_data$Quantity, na.rm = TRUE)
)

In [13]:
# Export the cleaned data to a CSV file without a row-name column
write.csv(
  x = cleaned_data,
  file = "processed_online_retail.csv",
  row.names = FALSE
)

In [14]:
# Show per-column summary statistics for the cleaned data
cleaned_data %>%
  summary()

   Invoice           StockCode         Description           Quantity       
 Length:417534      Length:417534      Length:417534      Min.   :-9360.00  
 Class :character   Class :character   Class :character   1st Qu.:    2.00  
 Mode  :character   Mode  :character   Mode  :character   Median :    4.00  
                                                          Mean   :   12.76  
                                                          3rd Qu.:   12.00  
                                                          Max.   :19152.00  
  InvoiceDate                         Price            Customer ID   
 Min.   :2009-12-01 07:45:00.00   Min.   :    0.000   Min.   :12346  
 1st Qu.:2010-03-26 11:26:00.00   1st Qu.:    1.250   1st Qu.:13983  
 Median :2010-07-08 19:12:00.00   Median :    1.950   Median :15311  
 Mean   :2010-07-01 01:12:19.78   Mean   :    3.888   Mean   :15361  
 3rd Qu.:2010-10-14 14:08:00.00   3rd Qu.:    3.750   3rd Qu.:16799  
 Max.   :2010-12-09 20:01:00.00   Max.   

In [15]:
# Task 4

In [17]:
# Load necessary libraries
library(caTools)
library(caret)

Loading required package: ggplot2

Loading required package: lattice



In [18]:
# Read the diabetes CSV; this rebinds `data`, replacing the retail dataset
# loaded earlier in the notebook
data <- read.csv(file = "diabetes.csv")

In [19]:
# Print the column names, types and first few values of the data frame
str(object = data)

'data.frame':	768 obs. of  9 variables:
 $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
 $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
 $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
 $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
 $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
 $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
 $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
 $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
 $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...


In [20]:
# Treat Outcome as a categorical variable so it is modelled as a class
# label rather than a number
data[["Outcome"]] <- as.factor(data[["Outcome"]])

In [21]:
# Partition the rows into training and testing sets with SplitRatio = 0.8
set.seed(69)  # fixed seed so the same split is produced on every run
split <- sample.split(data$Outcome, SplitRatio = 0.8)
# which() selects exactly the rows where the flag is TRUE (or FALSE) and
# ignores NA, matching the row selection subset() performs
train_data <- data[which(split), ]
test_data  <- data[which(!split), ]

In [22]:
# Fit a binomial GLM (logistic regression) of Outcome on all other columns
# of the training set
model <- glm(
  formula = Outcome ~ .,
  family = binomial,
  data = train_data
)

In [23]:
# Score the held-out rows; type = "response" returns fitted probabilities
# instead of values on the link scale
predictions <- predict(
  object = model,
  newdata = test_data,
  type = "response"
)

In [24]:
# Label a row 1 when its predicted probability exceeds 0.5, otherwise 0
predicted_classes <- ifelse(
  test = predictions > 0.5,
  yes = 1,
  no = 0
)

In [25]:
# Convert to factor for comparison.
# The levels are taken from test_data$Outcome rather than inferred from the
# predicted values: if every prediction lands in a single class, as.factor()
# would produce a one-level factor and the later `==` against
# test_data$Outcome would stop with "level sets of factors are different".
predicted_classes <- factor(
  predicted_classes,
  levels = levels(test_data$Outcome)
)

In [26]:
# Accuracy = share of test rows whose predicted class equals the true class
accuracy <- mean(test_data[["Outcome"]] == predicted_classes)

In [27]:
# Report the accuracy as a percentage rounded to two decimal places
print(
  paste(
    "Model Accuracy:",
    round(accuracy * 100, digits = 2),
    "%"
  )
)

[1] "Model Accuracy: 75.97 %"
