In [9]:
import pandas as pd
import numpy as np

In [10]:
df = pd.read_excel("Data.xlsx")
print(df)

   Region   Age   Income Online Shopper
0   India  49.0  86400.0             No
1  Brazil  32.0  57600.0            Yes
2     USA  35.0  64800.0             No
3  Brazil  43.0  73200.0             No
4     USA  45.0      NaN            Yes
5   India  40.0  69600.0            Yes
6  Brazil   NaN  62400.0             No
7   India  53.0  94800.0            Yes
8     USA  55.0  99600.0             No
9   India  42.0  80400.0            Yes


In [11]:
#checking for the missing values
print(df.isnull().sum())

Region            0
Age               1
Income            1
Online Shopper    0
dtype: int64


In [12]:
#checking for the duplicated rows
duplicated_row = df[df.duplicated()]
print(duplicated_row)

Empty DataFrame
Columns: [Region, Age, Income, Online Shopper]
Index: []


In [13]:
#checking for datatypes of columns
print(df.dtypes)

Region             object
Age               float64
Income            float64
Online Shopper     object
dtype: object


DATA CLEANING

In [19]:
#Converting categorical data to numerical data
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

# Convert 'Region' column to numerical using one-hot encoding
df = pd.get_dummies(df,columns=['Region'], prefix='Region')

# Convert 'Online Shopper' column to numerical using binary encoding
binary_encoder = ce.BinaryEncoder(cols=['Online Shopper'])
df = binary_encoder.fit_transform(df)
print(df)


    Age   Income  Online Shopper_0  Online Shopper_1  Region_Brazil  \
0  49.0  86400.0                 0                 1          False   
1  32.0  57600.0                 1                 0           True   
2  35.0  64800.0                 0                 1          False   
3  43.0  73200.0                 0                 1           True   
4  45.0      NaN                 1                 0          False   
5  40.0  69600.0                 1                 0          False   
6   NaN  62400.0                 0                 1           True   
7  53.0  94800.0                 1                 0          False   
8  55.0  99600.0                 0                 1          False   
9  42.0  80400.0                 1                 0          False   

   Region_India  Region_USA  
0          True       False  
1         False       False  
2         False        True  
3         False       False  
4         False        True  
5          True       False  
6       

In [21]:
#Filling the missing values
df_filled= df.fillna(df.mean())
print(df_filled)

         Age        Income  Online Shopper_0  Online Shopper_1  Region_Brazil  \
0  49.000000  86400.000000                 0                 1          False   
1  32.000000  57600.000000                 1                 0           True   
2  35.000000  64800.000000                 0                 1          False   
3  43.000000  73200.000000                 0                 1           True   
4  45.000000  76533.333333                 1                 0          False   
5  40.000000  69600.000000                 1                 0          False   
6  43.777778  62400.000000                 0                 1           True   
7  53.000000  94800.000000                 1                 0          False   
8  55.000000  99600.000000                 0                 1          False   
9  42.000000  80400.000000                 1                 0          False   

   Region_India  Region_USA  
0          True       False  
1         False       False  
2         False   

In [27]:
#Splitting the data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
x = df_filled.drop(['Online Shopper_0','Online Shopper_1'], axis=1)
y = df_filled['Online Shopper_1']
#split the data into training and testing sets
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2,random_state=42)
# Scale the features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(x_train_scaled, y_train)
#make predictions
y_pred = model.predict(x_test_scaled)
y_test_binary = y_test.replace({'Online Shopper_0': 0, 'Online Shopper_1': 1})
y_pred_binary = pd.Series(y_pred).replace({'Online Shopper_0': 0, 'Online Shopper_1': 1})


# Evaluate the model
accuracy = accuracy_score(y_test_binary, y_pred_binary)
print("Accuracy:", accuracy)





Accuracy: 0.0
