In [21]:
#Importing original dataset via CSV (https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset)
import pandas as pd
def load_dataset(csv):
  """Read a CSV file into a pandas DataFrame.

  Parameters
  ----------
  csv : str or path-like
      Location of the CSV file to load.

  Returns
  -------
  pandas.DataFrame
      The parsed contents of `csv`.
  """
  # Read the path that was passed in rather than a hard-coded file name,
  # so the helper works for any dataset the caller points it at.
  df = pd.read_csv(csv)
  return df
csv = 'online_shoppers_intention.csv'
df = load_dataset(csv)

#Feature Engineering
#Converting Categorical into Integers and Use One-Hot Coding
# Work on a copy: a plain `sessions = df` is only an alias, so the column
# assignments below would silently rewrite the raw frame as well.
sessions = df.copy()
sessions['Revenue'] = sessions['Revenue'].astype(int)#converting revenue into 0 and 1
# NOTE(review): astype(int) truncates toward zero, so any fractional SpecialDay
# value would become 0. Confirm this binarisation is intended for this column.
sessions['SpecialDay'] = sessions['SpecialDay'].astype(int)
# One-hot encode VisitorType and replace the original text column with the dummies
visitor_type_dummies = pd.get_dummies(sessions['VisitorType'], prefix='VisitorType').astype(int)
sessions = pd.concat([sessions, visitor_type_dummies], axis=1).drop(columns='VisitorType')
sessions['Weekend'] = sessions['Weekend'].astype(int)

# Month labels -> month numbers. Series.map yields NaN for any label missing
# from the mapping, so fail loudly instead of passing NaNs on to later cells.
month_numbers = {'Feb':2, 'Mar':3,'May':5, 'June':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}
month_as_number = sessions['Month'].map(month_numbers)
if month_as_number.isna().any():
  unmapped_months = sessions.loc[month_as_number.isna(), 'Month'].unique().tolist()
  raise ValueError(f"Month values without a numeric mapping: {unmapped_months}")
sessions['Month'] = month_as_number

#Adding Relevant Columns
# Total time spent and total pages viewed across the three page categories
sessions['SessionDuration']=sessions['Administrative_Duration']+sessions['Informational_Duration']+sessions['ProductRelated_Duration']
sessions['TotalPagesViewed']=sessions['Administrative']+sessions['Informational']+sessions['ProductRelated']
sessions['BounceRates'] = sessions['BounceRates'].astype(float)
sessions.head()

#sessions.to_csv('updated_online_shoppers_intention.csv', sep=',', index=False)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,...,Browser,Region,TrafficType,Weekend,Revenue,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor,SessionDuration,TotalPagesViewed
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0,...,1,1,1,0,0,0,0,1,0.0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0,...,2,1,2,0,0,0,0,1,64.0,2
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0,...,1,9,3,0,0,0,0,1,0.0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0,...,2,2,4,0,0,0,0,1,2.666667,2
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0,...,3,1,4,1,0,0,0,1,627.5,10


In [26]:
import os
from sklearn.model_selection import train_test_split

# Hold out the test set first so that no resampling step ever touches it
target_variable = "Revenue"

# Stratified 80/20 split: both parts keep the class ratio of the target column,
# and the fixed seed makes the partition repeatable.
split_options = dict(
    test_size=0.2,
    stratify=sessions[target_variable],
    random_state=42,
)
train_data, test_data = train_test_split(sessions, **split_options)

# The test-set folder may not exist yet on a fresh checkout
os.makedirs('Test_Set_Files', exist_ok=True)

# Test split goes into its own folder, training split next to the notebook
outputfile_path = os.path.join('Test_Set_Files', 'test_osi.csv')
test_data.to_csv(outputfile_path, sep=',', index=False)
train_data.to_csv("train_online_shoppers_intention.csv", sep=',', index=False)


In [27]:
import pandas as pd
from imblearn.over_sampling import SMOTENC, SMOTE

 # Define categorical variables
categorical_vars = [
  # columns already present in the source data
  'Month',
  'OperatingSystems',
  'Browser',
  'Region',
  'TrafficType',
  'Weekend',
  # one-hot columns derived from VisitorType
  'VisitorType_New_Visitor',
  'VisitorType_Other',
  'VisitorType_Returning_Visitor',
]

# Function to get column indices of categorical variables
def get_categorical_indices(df, categorical_vars):
  """Return the positional column index of each categorical variable found in `df`.

  Names are looked up in the order given by `categorical_vars`; names that
  are not columns of `df` are skipped rather than raising.
  """
  present = df.columns
  positions = []
  for name in categorical_vars:
    if name in present:
      positions.append(present.get_loc(name))
  return positions

# Function to apply SMOTENC or SMOTE based on the presence of categorical variables
def apply_smote(df, target_var, categorical_vars):
  """Oversample `df` with SMOTENC or SMOTE and return the resampled frame.

  The `target_var` column is split off as the label. SMOTENC is used when at
  least one name in `categorical_vars` is a feature column; otherwise plain
  SMOTE is used. Both samplers are seeded with random_state=42.

  Returns a new DataFrame holding the resampled feature columns, with the
  resampled target appended as the last column.
  """
  features = df.drop(columns=target_var)
  labels = df[target_var]
  cat_positions = get_categorical_indices(features, categorical_vars)

  if not cat_positions:
    # no categorical feature columns present
    sampler = SMOTE(random_state=42)
  else:
    # the sampler is told which column positions are categorical
    sampler = SMOTENC(random_state=42, categorical_features=cat_positions)

  features_res, labels_res = sampler.fit_resample(features, labels)

  balanced = pd.DataFrame(features_res, columns=features.columns)
  balanced[target_var] = labels_res
  return balanced

# List of file roots and folds for training data (update if necessary)
fileroots = ['all']
kfold = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

# to_csv does not create missing folders, so make sure the output folder exists
os.makedirs('SMOTE_files', exist_ok=True)

# The input file is the same for every (root, fold) pair and apply_smote uses a
# fixed random_state, so the resampled frame is identical on every iteration:
# read and resample once instead of once per fold.
# NOTE(review): `inputfile` below is only used to build the output name, so all
# SM_*_train.csv files hold the same data. If distinct per-fold training files
# are intended, they need to be created and read here instead.
trainingset = pd.read_csv('train_online_shoppers_intention.csv')

# Apply SMOTENC or SMOTE
smote_output = apply_smote(trainingset, 'Revenue', categorical_vars)

# Write one output file per root/fold combination
for root in fileroots:
  for k in kfold:
    inputfile = f'{root}_{k}_train.csv'

    # Save the output file in the desired directory
    outputfile = f'SM_{inputfile}'
    outputfile_path = os.path.join('SMOTE_files', outputfile)
    smote_output.to_csv(outputfile_path, sep=',', index=False)


In [28]:
# Verify the class balance of the resampled datasets

def load_dataset(csv):
  """Load the CSV file at `csv` and return it as a pandas DataFrame."""
  return pd.read_csv(csv)

fileroots = ['all']
kfold = [str(fold) for fold in range(1, 11)]

# For each resampled fold file, count the rows of each Revenue class
for k in kfold:
    csv = f'SM_{fileroots[0]}_{k}_train.csv'
    df = load_dataset(f'SMOTE_files/{csv}')

    # Boolean masks summed give the number of rows in each class
    revenue_counts = int((df['Revenue'] == 1).sum())
    non_revenue_counts = int((df['Revenue'] == 0).sum())

    # One-row table with a column per class
    table = pd.DataFrame(
        {'Revenue': [revenue_counts], 'Non-Revenue': [non_revenue_counts]}
    )

    print(f"Table for {csv}:")
    print(table)

Table for SM_all_1_train.csv:
   Revenue  Non-Revenue
0     8338         8338
Table for SM_all_2_train.csv:
   Revenue  Non-Revenue
0     8338         8338
Table for SM_all_3_train.csv:
   Revenue  Non-Revenue
0     8338         8338
Table for SM_all_4_train.csv:
   Revenue  Non-Revenue
0     8338         8338
Table for SM_all_5_train.csv:
   Revenue  Non-Revenue
0     8338         8338
Table for SM_all_6_train.csv:
   Revenue  Non-Revenue
0     8338         8338
Table for SM_all_7_train.csv:
   Revenue  Non-Revenue
0     8338         8338
Table for SM_all_8_train.csv:
   Revenue  Non-Revenue
0     8338         8338
Table for SM_all_9_train.csv:
   Revenue  Non-Revenue
0     8338         8338
Table for SM_all_10_train.csv:
   Revenue  Non-Revenue
0     8338         8338


In [None]:
# Class distribution of the training split as written before resampling.
# DataFrame.value_counts() with no arguments counts unique *rows* across all
# columns, which says nothing about class balance; count the target column.
df = load_dataset('train_online_shoppers_intention.csv')
df['Revenue'].value_counts()