## Data Preprocessing 

In [77]:
import pandas as pd
import numpy as np

file1 = 'ecommerce_customer_data_custom_ratios.csv'
file2 = 'ecommerce_customer_data_large.csv'

df_custom_ratios = pd.read_csv(file1)
df_large = pd.read_csv(file2)

df_large.columns = df_large.columns.str.replace('_large', '', regex=False)

# Inspect the first few rows
print("Custom Ratios Data:")
print(df_custom_ratios.head())
print("\nLarge Data:")
print(df_large.head())

df = pd.merge(df_custom_ratios, df_large, how='inner', on='Customer ID', suffixes=('_custom', '_large'))

# Print columns to identify potential conflicts
print("\nColumns after merge:")
print(df.columns)

# Combine columns with the same base name
for col in df.columns:
    if col.endswith('_custom') or col.endswith('_large'):
        base_col = col.rsplit('_', 1)[0]
        if base_col in df.columns:
            # Combine columns, prioritizing non-null values
            df[base_col] = df[base_col].combine_first(df[col])
        else:
            # Rename to base column if not present yet
            df[base_col] = df[col]
        # Drop the old columns with suffixes
        df = df.drop(columns=[col])

# Check for missing values
print("\nMissing Values Before Handling:")
print(df.isnull().sum())

# Handle missing values
# For numerical columns, fill with mean
numerical_cols = df.select_dtypes(include=[np.number]).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# For categorical columns, fill with mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Verify missing values are handled
print("\nMissing Values After Handling:")
print(df.isnull().sum())

df = df.drop_duplicates(subset=['Customer ID', 'Purchase Date'])

cleaned_file_path = 'cleaned_ecommerce_data.csv'
df.to_csv(cleaned_file_path, index=False)


print("\nData pre-processing completed and saved to", cleaned_file_path)


Custom Ratios Data:
   Customer ID        Purchase Date Product Category  Product Price  Quantity  \
0        46251  2020-09-08 09:38:32      Electronics             12         3   
1        46251  2022-03-05 12:56:35             Home            468         4   
2        46251  2022-05-23 18:18:01             Home            288         2   
3        46251  2020-11-12 13:13:29         Clothing            196         1   
4        13593  2020-11-27 17:55:11             Home            449         1   

   Total Purchase Amount Payment Method  Customer Age  Returns  \
0                    740    Credit Card            37      0.0   
1                   2739         PayPal            37      0.0   
2                   3196         PayPal            37      0.0   
3                   3509         PayPal            37      0.0   
4                   3452    Credit Card            49      0.0   

         Customer Name  Age  Gender  Churn  
0  Christine Hernandez   37    Male      0  
1  Chr

In [78]:
print(df.columns)

Index(['Customer ID', 'Purchase Date', 'Product Category', 'Product Price',
       'Quantity', 'Total Purchase Amount', 'Payment Method', 'Customer Age',
       'Returns', 'Customer Name', 'Age', 'Gender', 'Churn'],
      dtype='object')


In [79]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.get_dummies(df, columns=['Product Category', 'Payment Method', 'Gender'], drop_first=True)

# Feature scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['Product Price', 'Total Purchase Amount', 'Customer Age']])
df[['Product Price', 'Total Purchase Amount', 'Customer Age']] = scaled_features

# Splitting the data into training and testing sets
X = df.drop(['Customer ID', 'Purchase Date', 'Returns', 'Customer Name', 'Churn'], axis=1)
y = df['Churn']  # or 'Returns', depending on what you're predicting

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Linear Regression

In [80]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Define features (X) and target variable (y)
X = df.drop(['Total Purchase Amount', 'Customer ID', 'Purchase Date', 'Returns', 'Customer Name', 'Churn'], axis=1)
y = df['Total Purchase Amount']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 0.9993181643763727


## Cross Validation For Linear Regression

In [81]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation for linear regression
cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

# Print results
print(f'Cross-Validation Scores: {cross_val_scores}')
print(f'Mean CV MSE: {-cross_val_scores.mean()}')


Cross-Validation Scores: [-1.00296364 -1.00189746 -0.99144301 -0.99437972 -0.99612099]
Mean CV MSE: 0.9973609640724124


## Performance metrics for the model

In [82]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import numpy as np

# Assuming y_test are true values and y_pred are predicted values
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # Root Mean Squared Error
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
r2 = r2_score(y_test, y_pred)  # R^2 Score

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")


Mean Squared Error: 0.9993181643763727
Root Mean Squared Error: 0.9996590240558891
Mean Absolute Error: 0.8661135906321362
R² Score: 0.0026690511505077508


## Updating Linear Regression with Multiple Variables

In [83]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Select features (X) and target variable (y)
X = df[['Product Price', 'Quantity', 'Customer Age']]  # Multiple variables
y = df['Total Purchase Amount']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the model
multi_var_model = LinearRegression()
multi_var_model.fit(X_train, y_train)

# Predictions
y_pred = multi_var_model.predict(X_test)

# Performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Multi-variable MSE: {mse}")
print(f"R² Score: {r2}")


Multi-variable MSE: 0.9993799535324418
R² Score: 0.0026073848665875143
