<h1 align=\"center\"><font color='green'><font size=\"6\">  Linear Regression vs Random Forest vs SVM</font> </h1>

<div style="background-color: #90EE90;">.</div>  </div>

In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd

In [2]:
# Generating synthetic data to predict membership of grocery based on age, transaction & items bought
np.random.seed(42)
data_size = 200  

customer_age = np.random.randint(18, 70, size=data_size)  # Customer ages
total_spend = np.random.randint(5, 500, size=data_size)  # Total amount spend in the transaction
items_purchased = np.random.randint(1, 50, size=data_size)  # Number of items purchased
Membership = np.random.choice([0, 1], size=data_size, p=[0.6, 0.4])  
#0 means they are not a member while 1 means they are a member


In [3]:
# Creating a dataframe of the dataset
grocery = pd.DataFrame({
    'customer_age': customer_age,
    'total_spend': total_spend,
    'items_purchased': items_purchased,
    'Membership': Membership,

})


In [4]:
# Displaying the first five rows of the dataset
grocery.head()

Unnamed: 0,customer_age,total_spend,items_purchased,Membership
0,56,202,33,0
1,69,460,5,1
2,46,415,48,0
3,32,141,19,1
4,60,322,4,0


In [5]:
from sklearn.model_selection import train_test_split
#all the libraries should be imported in the beginning/first cell 
#but to make comparison, libraries has been imported when necessary for better understanding

In [6]:
#Spliting the dataset into features (customer age, transaction & items bought) and target (Membership).
# Assigning columns to Features(X) and target(y) 
X = grocery[['customer_age','total_spend','items_purchased']]
y = grocery['Membership']

#Performing 80/20 split to create training and test datasets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<div class = "alert alert-block alert-success">
     <font size = "5">
   Linear Regression



-  Assumes a linear relationship between the independent and dependent variables.
 - A simple equation with a slope and intercept.
 -  Easy to interpret, fast to train, and computationally efficient.
 -  Can be sensitive to outliers, might not capture complex non-linear relationships.

###### If you have a clear linear relationship and interpretability is important, Linear Regression is used.

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

#Training a Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [8]:
#Predicting using the linear regression model
y_pred_lr = lr_model.predict(X_test)
y_pred_lr

array([0.39276994, 0.3495367 , 0.40704909, 0.41468259, 0.3640612 ,
       0.38519071, 0.45938591, 0.43757463, 0.4150235 , 0.41543865,
       0.37085544, 0.34106957, 0.49530449, 0.41936654, 0.46051583,
       0.46218257, 0.37263591, 0.43986782, 0.36787398, 0.47259995,
       0.48851065, 0.48063503, 0.35002417, 0.42843344, 0.41252797,
       0.49020159, 0.50901527, 0.43227796, 0.42222225, 0.34799364,
       0.39338305, 0.43423196, 0.40991635, 0.44364972, 0.47308053,
       0.47003646, 0.45864268, 0.37590624, 0.38385449, 0.4007923 ])

In [9]:
#Converting predictions to binary classification (threshold = 0.5)
y_pred_lr_class = [1 if i >= 0.5 else 0 for i in y_pred_lr]
y_pred_lr_class

#The output indicates that the y_pred_lr list contains values that are primarily less than 0.5. 

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [10]:
# Evaluating Linear Regression Model
accuracy_lr = accuracy_score(y_test, y_pred_lr_class)
print(f"Linear Regression Accuracy: {accuracy_lr:.2f}")

Linear Regression Accuracy: 0.68


In [11]:
#Applying the model created in new data 
new_data = pd.DataFrame({'customer_age':[28],'total_spend':[45],'items_purchased':[5]})
predict_linear = lr_model.predict(new_data)

print(f"The customer aged 28 buying 5 items and spending $45 will receive: {predict_linear} membership")

The customer aged 28 buying 5 items and spending $45 will receive: [0.43285756] membership


<div class = "alert alert-block alert-success">
     <font size = "5">
  Random Forest



 -  No strong assumptions about the data distribution.
 - An ensemble of decision trees, each trained on a random subset of the data.
 - Handles non-linear relationships well, less prone to overfitting, can handle mixed data types.
 -  Can be computationally expensive for large datasets, might be less interpretable than linear models.
 
 ######  Random Forest is used If you have complex non-linear relationships and want a robust model.


In [12]:
from sklearn.ensemble import RandomForestClassifier

#Training a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [13]:
#Predicting using the random forest model
y_pred_rf = rf_model.predict(X_test)
y_pred_rf

array([1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0])

In [14]:
#Evaluating Random Forest Model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")

Random Forest Accuracy: 0.62


In [15]:
#Applying the model created in new data 
new_data1 = pd.DataFrame({'customer_age':[28],'total_spend':[45],'items_purchased':[5]})
predict_random = rf_model.predict(new_data1)

print(f"The customer aged 28 buying 5 items and spending $45 will receive: {predict_random} membership")

The customer aged 28 buying 5 items and spending $45 will receive: [0] membership


<div class = "alert alert-block alert-success">
     <font size = "5">
   Support Vector Machines (SVM)



 -  Assumes that data can be separated by a hyperplane.
 - Finds the optimal hyperplane to separate data points.
 -  Effective for high-dimensional data, can handle non-linear relationships using kernels.
 -  Can be computationally expensive for large datasets, sensitive to parameter tuning.
 
######  SVM is used If you have high-dimensional data and need to handle non-linear relationships, but computational resources are a concern.

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [17]:
#Standardizing the data (SVM is sensitive to the scale of features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
#Training an SVM classifier
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)

In [19]:
#Predicting using the SVM model
y_pred_svm = svm_model.predict(X_test_scaled)
y_pred_svm

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [20]:
#Evaluating SVM Model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.2f}")

SVM Accuracy: 0.70


In [21]:
#Applying the model created in new data 
new_data2 = pd.DataFrame({'customer_age':[28],'total_spend':[45],'items_purchased':[5]})
predict_svm = svm_model.predict(new_data2)

print(f"The customer aged 28 buying 5 items and spending $45 will receive: {predict_svm} membership")

The customer aged 28 buying 5 items and spending $45 will receive: [0] membership




In [22]:
new_data3 = np.array([[28,45,5]])
predict_svm = svm_model.predict(new_data3)

print(f"The customer aged 28 buying 5 items and spending $45 will receive: {predict_svm} membership")

The customer aged 28 buying 5 items and spending $45 will receive: [0] membership


In [23]:
print("Model Comparison:")
print(f"Linear Regression Accuracy: {accuracy_lr:.2f}")
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")
print(f"SVM Accuracy: {accuracy_svm:.2f}")

Model Comparison:
Linear Regression Accuracy: 0.68
Random Forest Accuracy: 0.62
SVM Accuracy: 0.70
