In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Function to create and show plots
def plot_heatmap(data, index, columns, values, title, xlabel, ylabel,
                 aggfunc='count', fmt=None):
    """Pivot ``data`` and draw the resulting table as an annotated heatmap.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to aggregate.
    index, columns, values : str
        Column names forwarded to ``DataFrame.pivot_table``.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    aggfunc : str or callable, default 'count'
        Aggregation forwarded to ``pivot_table``.
    fmt : str or None, default None
        Annotation format for ``sns.heatmap``. When None, ``'d'`` is used if
        every pivoted column has an integer dtype, otherwise ``'.1f'``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    heatmap_data = data.pivot_table(index=index, columns=columns, values=values, aggfunc=aggfunc)
    if isinstance(aggfunc, str) and aggfunc == 'count':
        # An index/column pair with no rows comes back as NaN, which upcasts the
        # table to float and makes the integer format 'd' raise during annotation.
        # For counts, a missing pair simply means zero.
        heatmap_data = heatmap_data.fillna(0).astype(int)
    if fmt is None:
        all_integer = all(pd.api.types.is_integer_dtype(dtype) for dtype in heatmap_data.dtypes)
        fmt = 'd' if all_integer else '.1f'
    plt.figure(figsize=(10, 6))
    sns.heatmap(heatmap_data, annot=True, fmt=fmt, cmap='coolwarm', cbar=True)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:

# Load the dataset into the notebook-wide `data` frame.
# NOTE(review): this is an absolute, machine-specific path; it must be edited
# before the notebook can run anywhere else.
file_path = "/Users/shakirahbenson/Python Projects/pythonProject/E Commerce Dataset(E Comm).csv"  # Update this path
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Data cleaning: coerce the key numeric columns and impute gaps with the column mean.
numeric_columns = ['Tenure', 'SatisfactionScore', 'OrderCount']
for col in numeric_columns:
    # Entries that cannot be parsed as numbers become NaN so they are imputed below.
    data[col] = pd.to_numeric(data[col], errors='coerce')
    # Assign the filled column back rather than calling fillna(inplace=True) on the
    # column selection: the in-place form works on an intermediate object, triggers
    # a chained-assignment warning in pandas 2.x and does nothing under Copy-on-Write.
    data[col] = data[col].fillna(data[col].mean())

In [None]:
# Handle missing values: carry the previous row's value forward in every column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
# Values missing in the very first rows have nothing before them and stay NaN.
data = data.ffill()

In [None]:
# Inspect dataset
print("First 5 rows of the dataset:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
data.info()

In [None]:

# Restrict the correlation input to float64/int64 columns only.
numeric_dtypes = ['float64', 'int64']
numeric_data = data.select_dtypes(include=numeric_dtypes)
# Pairwise correlation over every selected numeric column.
correlation_matrix = numeric_data.corr()

In [None]:
# Remove columns that should not take part in the correlation analysis.
columns_to_exclude = ['CustomerID']  # Add or remove columns as needed
filtered_data = numeric_data.drop(columns_to_exclude, axis='columns')

In [None]:
# Compute the correlation matrix
# Recomputed on filtered_data so the excluded columns are left out; this replaces
# the correlation_matrix value assigned earlier from numeric_data.
correlation_matrix = filtered_data.corr()

In [None]:
# Heatmap of the correlation matrix
heatmap_options = dict(
    annot=True,             # write each coefficient into its cell
    cmap='coolwarm',
    annot_kws={"size": 8},  # small annotation font
    fmt=".2f",              # two decimal places
)
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix', fontsize=16)
# Same tick-label size on both axes.
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Visualizations
# Histogram (10 bins, with KDE overlay) of the SatisfactionScore column.
satisfaction_scores = data['SatisfactionScore']
plt.figure(figsize=(8, 6))
ax = sns.histplot(satisfaction_scores, bins=10, kde=True)
ax.set_title('Distribution of Customer Satisfaction Scores')
ax.set_xlabel('Satisfaction Score')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Horizontal count plot of PreferredPaymentMode, most frequent mode first.
payment_modes = data['PreferredPaymentMode']
modes_by_frequency = payment_modes.value_counts().index
plt.figure(figsize=(8, 6))
ax = sns.countplot(y=payment_modes, order=modes_by_frequency)
ax.set_title('Preferred Payment Modes')
ax.set_xlabel('Count')
ax.set_ylabel('Payment Mode')
plt.show()

In [None]:
# Box plot of Tenure, one box per Churn value.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=data, x='Churn', y='Tenure')
ax.set_title('Tenure vs Churn')
ax.set_xlabel('Churn')
ax.set_ylabel('Tenure')
plt.show()

In [None]:
# Row counts per Churn value, split into one bar per Gender.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=data, x='Churn', hue='Gender')
ax.set_title('Gender Distribution by Churn')
ax.set_xlabel('Churn')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Ensure numeric columns are properly converted (same coercion and mean imputation
# as the data-cleaning step, repeated before the order-count analysis).
numeric_columns = ['OrderCount', 'Tenure', 'SatisfactionScore']
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')
    # Assign back instead of fillna(inplace=True) on the column selection, which
    # warns as chained assignment in pandas 2.x and is a no-op under Copy-on-Write.
    data[col] = data[col].fillna(data[col].mean())

In [None]:
# Handle any remaining missing values by forward fill; ffill() replaces the
# deprecated fillna(method='ffill') form.
data = data.ffill()

In [None]:
# Bucket OrderCount into labelled ranges for plotting. The lowest edge (0) is
# inclusive; values above the last edge (100) fall outside every bin and become NaN.
order_count_edges = [0, 5, 10, 20, 50, 100]
order_count_labels = ['0-5', '6-10', '11-20', '21-50', '51-100']
data['OrderCountBins'] = pd.cut(
    data['OrderCount'],
    bins=order_count_edges,
    labels=order_count_labels,
    include_lowest=True,
)

In [None]:
# Calculate churn rate for each bin (mean of the Churn column per OrderCountBins group).
# OrderCountBins is categorical: observed=False is passed explicitly so empty bins
# stay in the result (as NaN) and pandas does not warn about the changing default.
order_churn_rate = (
    data.groupby('OrderCountBins', observed=False)['Churn']
    .mean()
    .reset_index(name='ChurnRate')  # name the aggregated column directly, no inplace rename
)

In [None]:
# Bar chart of ChurnRate for each OrderCount range.
plt.figure(figsize=(10, 6))
sns.barplot(data=order_churn_rate, x='OrderCountBins', y='ChurnRate', palette='viridis')
plt.title('Churn Rate by Order Count', fontsize=16)
# Axis labels share one font size, tick labels another.
plt.xlabel('Order Count Range', fontsize=14)
plt.ylabel('Churn Rate', fontsize=14)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=12)
plt.show()

In [None]:

# Complaints and Churn: row counts per (Complain, Churn) pair, with Churn values
# spread into columns so each Complain value becomes one stacked bar.
pair_counts = data.groupby(['Complain', 'Churn']).size()
complaint_churn_data = pair_counts.unstack(level='Churn')
complaint_churn_data.plot.bar(stacked=True, figsize=(10, 6))
plt.title('Complaints and Churn Relationship')
plt.xlabel('Complaint Status')
plt.ylabel('Count')
# NOTE(review): legend labels are positional; presumably the Churn columns sort as 0 then 1 - confirm.
plt.legend(title='Churn', labels=['No Churn', 'Churn'])
plt.show()

In [None]:
# Heatmap helper that aggregates with 'sum' by default
def plot_heatmap(data, index, columns, values, title, xlabel, ylabel,
                 aggfunc='sum', fmt='.1f'):
    """Pivot ``data`` and draw the resulting table as an annotated heatmap.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to aggregate.
    index, columns, values : str
        Column names forwarded to ``DataFrame.pivot_table``.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    aggfunc : str or callable, default 'sum'
        Aggregation forwarded to ``pivot_table``; exposed as a parameter so a
        different aggregation does not require redefining the function.
    fmt : str, default '.1f'
        Annotation format passed to ``sns.heatmap``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    heatmap_data = data.pivot_table(index=index, columns=columns, values=values, aggfunc=aggfunc)
    plt.figure(figsize=(10, 6))
    sns.heatmap(heatmap_data, annot=True, fmt=fmt, cmap='coolwarm', cbar=True)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
# Gender vs preferred order category, aggregating the Churn column per cell.
plot_heatmap(
    data,
    index='Gender',
    columns='PreferedOrderCat',
    values='Churn',
    title='Preferred Order Category by Gender (Churn)',
    xlabel='Preferred Order Category',
    ylabel='Gender',
)

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Columns used as the clustering input.
clustering_features = ['Tenure', 'SatisfactionScore', 'OrderCount']
X = data.loc[:, clustering_features]

In [None]:
# Standardize the features: fit the scaler on X, then transform the same X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Perform K-Means clustering and store each row's cluster label as CustomerSegment.
# n_init is pinned because its default differs between scikit-learn releases
# (10 restarts historically, 'auto' later); leaving it implicit changes the
# segmentation across versions and emits a FutureWarning on some of them.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)  # You can tune the number of clusters
data['CustomerSegment'] = kmeans.fit_predict(X_scaled)

In [None]:
# Scatter of Tenure against SatisfactionScore, coloured by assigned segment.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=data,
    x='Tenure',
    y='SatisfactionScore',
    hue='CustomerSegment',
    palette='viridis',
    s=100,
)
plt.title('Customer Segmentation')
plt.xlabel('Tenure')
plt.ylabel('Satisfaction Score')
plt.legend(title='Segment')
plt.show()

In [None]:
# Mean of each clustering feature within every CustomerSegment.
segments = data.groupby('CustomerSegment')
cluster_summary = segments[clustering_features].mean()
print("Cluster Summary:", cluster_summary, sep="\n")

In [None]:
# Sum CashbackAmount per CustomerID.
# NOTE(review): the notebook treats CashbackAmount as revenue - confirm that this
# column is the intended revenue measure.
cashback_by_customer = data.groupby(by='CustomerID')['CashbackAmount']
customer_revenue = cashback_by_customer.sum()

In [None]:
# Average revenue per customer
# Mean of the per-customer CashbackAmount totals held in customer_revenue.
avg_revenue = customer_revenue.mean()

In [None]:
# Estimate churn probability using logistic regression
from sklearn.linear_model import LogisticRegression

In [None]:
# Predictor columns and the Churn target for the churn-probability model.
features = ['Tenure', 'OrderCount', 'SatisfactionScore', 'CouponUsed']
X, y = data[features], data['Churn']

In [None]:
# Fit a default-configured logistic regression on the full X / y (no hold-out split here).
model = LogisticRegression()
model.fit(X=X, y=y)

In [None]:
# Predict churn probability for the same rows the model was fitted on.
# NOTE(review): column 1 of predict_proba is the second entry of model.classes_;
# presumably that is the Churn == 1 class - confirm.
class_probabilities = model.predict_proba(X)
data['ChurnProbability'] = class_probabilities[:, 1]

In [None]:
# Expected CLV: CashbackAmount weighted by the probability of not churning.
retention_probability = 1 - data['ChurnProbability']
data['CLV'] = data['CashbackAmount'] * retention_probability

In [None]:
# Rank customers by CLV, highest first, and print the top ten.
clv_by_customer = data.loc[:, ['CustomerID', 'CLV']]
high_value_customers = clv_by_customer.sort_values('CLV', ascending=False)
print(high_value_customers.head(n=10))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Predictor matrix and Churn target for the classifier.
X, y = (
    data[['Tenure', 'OrderCount', 'SatisfactionScore', 'CouponUsed']],
    data['Churn'],
)

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Train a 100-tree random forest on the training split.
# This rebinds `model`, which earlier in the notebook held the logistic regression.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict churn
# Predicted labels for the held-out rows, from whichever estimator `model` is
# currently bound to.
y_pred = model.predict(X_test)

In [None]:
# Print the classification report for the held-out predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)