**Goal of this Analysis**
* Are there any null values or outliers? How will you handle them?

* What factors are significantly related to the number of web purchases?
 
* Which marketing campaign was the most successful?
 
* What does the average customer look like?
 
* Which products are performing best?
 
* Which channels are underperforming?

In [None]:
#Importing useful and important libraries

import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the marketing dataset from the Kaggle input directory
data_path = "/kaggle/input/marketing-dataset/marketing_data.csv"
df = pd.read_csv(data_path)

# Preview the first rows (shown as the cell output in a notebook)
df.head()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) as reported
# by pandas' DataFrame.describe()
df.describe()

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame
df.info()

# 1.Are there any null values or outliers? How will you handle them?

In [None]:
# The income header carries surrounding spaces; map it to a clean name
column_fixes = {' Income ': 'Income'}
df.rename(columns=column_fixes, inplace=True)

# Show the resulting column index to confirm the rename took effect
print(df.columns)


In [None]:
# A median cannot be computed on text, so make sure 'Income' is numeric
# before imputing. Currency symbols, thousands separators and whitespace are
# stripped; anything that still cannot be parsed becomes NaN.
# NOTE(review): currency-formatted text is an assumption about the CSV --
# this branch is skipped entirely when the column is already numeric.
if not pd.api.types.is_numeric_dtype(df['Income']):
    df['Income'] = pd.to_numeric(
        df['Income'].astype(str).str.replace(r'[$,\s]', '', regex=True),
        errors='coerce',
    )

# Check for null values
null_values = df.isnull().sum()
print("Null Values:")
print(null_values)

# 'Income' has null values; impute them with the median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection `df['Income']`: the in-place form
# acts on an intermediate object, is deprecated in recent pandas, and does
# not update `df` at all when Copy-on-Write is enabled.
df['Income'] = df['Income'].fillna(df['Income'].median())

# Alternative strategy: drop incomplete rows with df.dropna() instead of
# imputing them.


In [None]:
from scipy import stats

# Z-score of every value within its numerical column; NaNs are left out of
# each column's mean / standard deviation (nan_policy='omit').
z_scores = stats.zscore(df.select_dtypes(include=['int64', 'float64']), nan_policy='omit')

# Number of standard deviations beyond which a value counts as an outlier
threshold = 3

# A row is an outlier when ANY of its values lies more than `threshold`
# standard deviations from the column mean in EITHER direction, hence the
# absolute value (a plain `z > threshold` misses unusually low values).
# A NaN z-score compares False, so it never flags a row by itself.
is_outlier = (np.abs(np.asarray(z_scores)) > threshold).any(axis=1)

# Identify outliers
outliers = df[is_outlier]

# Remove outliers: the exact complement of `outliers`, so every row ends up
# in exactly one of the two frames
cleaned_df = df[~is_outlier]

# Display the cleaned DataFrame
print("Cleaned DataFrame:")
print(cleaned_df)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One box per numerical column, all drawn on a single 10x6 figure
numeric_columns = df.select_dtypes(include=['int64', 'float64'])

plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(data=numeric_columns)

# Tilt the column names so they stay legible
plt.setp(box_ax.get_xticklabels(), rotation=45)
box_ax.set_title('Box Plot of Numerical Columns')
plt.show()


# 2.What factors are significantly related to the number of web purchases?

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm

# Integer code for Education. Kept as a column for any later cell that
# refers to it, but NOT used as a regressor (see below).
label_encoder = LabelEncoder()
df['Education_Code'] = label_encoder.fit_transform(df['Education'])

# Numeric predictors of the number of web purchases
numeric_features = [
    'Income',
    'Recency',
    'NumDealsPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
]

# LabelEncoder numbers the categories in sorted label order, so feeding
# 'Education_Code' into a linear model would impose an arbitrary ranking and
# equal spacing on the education levels. One-hot encode instead; the first
# level is dropped to serve as the baseline and to avoid perfect
# collinearity with the intercept. dtype=float keeps the design matrix
# purely numeric for statsmodels.
education_dummies = pd.get_dummies(
    df['Education'], prefix='Education', drop_first=True, dtype=float
)

# Define independent variables (features) and dependent variable (target)
X = pd.concat([education_dummies, df[numeric_features]], axis=1)
y = df['NumWebPurchases']

# Add a constant (intercept term) to the independent variables
X = sm.add_constant(X)

# Fit the ordinary least squares regression model
model = sm.OLS(y, X).fit()

# Print the summary statistics of the model (coefficients, p-values, R^2)
print(model.summary())


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate age of each customer
current_year = 2024  # Reference year used to turn birth year into age
df['Age'] = current_year - df['Year_Birth']

# Define age ranges. The last edge is open-ended so that every age of 70 or
# more falls under '70+' instead of becoming NaN above a fixed cap.
age_bins = [0, 30, 40, 50, 60, 70, float('inf')]
age_labels = ['0-29', '30-39', '40-49', '50-59', '60-69', '70+']

# Categorize customers into age ranges.
# right=False makes the bins left-closed: [0, 30), [30, 40), ..., [70, inf).
# With the default right-closed bins an age of exactly 30 would be labelled
# '0-29', 40 would be labelled '30-39', and so on.
df['Age_Range'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

# Visualize the relationship between age range and number of web purchases
plt.figure(figsize=(10, 6))
sns.boxplot(x='Age_Range', y='NumWebPurchases', data=df)
plt.xlabel('Age Range')
plt.ylabel('Number of Web Purchases')
plt.title('Relationship between Age Range and Number of Web Purchases')
plt.show()


# 3.Which marketing campaign was the most successful?

In [None]:
# Acceptance flag columns, one per marketing campaign
campaign_columns = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']

# Response rate per campaign: accepted offers as a percentage of all rows
n_customers = len(df)
response_rates = {
    campaign: df[campaign].sum() / n_customers * 100
    for campaign in campaign_columns
}

# Pick the campaign with the highest rate (the first one wins on a tie)
most_successful_campaign, highest_response_rate = max(
    response_rates.items(), key=lambda item: item[1]
)

print(f"The most successful campaign is {most_successful_campaign} with a response rate of {highest_response_rate:.2f}%.")


In [None]:
import matplotlib.pyplot as plt

# Response rate (in percent) of each campaign, recomputed so this cell can
# run on its own
campaign_columns = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
response_rates = {
    campaign: df[campaign].sum() / len(df) * 100
    for campaign in campaign_columns
}

# Bar chart: one bar per campaign
plt.figure(figsize=(10, 6))
campaign_ax = plt.gca()
campaign_ax.bar(list(response_rates), list(response_rates.values()), color='skyblue')
campaign_ax.set_xlabel('Marketing Campaign')
campaign_ax.set_ylabel('Response Rate (%)')
campaign_ax.set_title('Response Rates of Marketing Campaigns')
plt.xticks(rotation=45)  # Tilt the campaign names for readability
plt.show()


# 4.What does the average customer look like?

In [None]:
# Children per customer = kids at home + teenagers at home
children_per_customer = df['Kidhome'] + df['Teenhome']

# Averages describing the "typical" customer
mean_age = df['Age'].mean()
mean_income = df['Income'].mean()
mean_num_children = children_per_customer.mean()
mean_num_web_purchases = df['NumWebPurchases'].mean()

# Further variables can be averaged here in the same way.

# Report the profile, one metric per line
profile_lines = [
    f"Average Age: {mean_age:.2f} years",
    f"Average Income: ${mean_income:.2f}",
    f"Average Number of Children: {mean_num_children:.2f}",
    f"Average Number of Web Purchases: {mean_num_web_purchases:.2f}",
]
print("\n".join(profile_lines))


In [None]:
import matplotlib.pyplot as plt

# Two stacked panels: age on top, income below
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))

# (axis, values, variable name, bar colour) for each histogram;
# missing incomes are dropped before plotting
histogram_specs = [
    (axes[0], df['Age'], 'Age', 'skyblue'),
    (axes[1], df['Income'].dropna(), 'Income', 'salmon'),
]

for panel, values, variable, fill in histogram_specs:
    panel.hist(values, bins=20, color=fill, edgecolor='black')
    panel.set_title(f'Distribution of {variable}')
    panel.set_xlabel(variable)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


# 5.Which products are performing best?

In [None]:
# Display name of each product category -> column holding the amount spent
product_columns = {
    'Wines': 'MntWines',
    'Fruits': 'MntFruits',
    'MeatProducts': 'MntMeatProducts',
    'FishProducts': 'MntFishProducts',
    'SweetProducts': 'MntSweetProducts',
    'GoldProducts': 'MntGoldProds',
}

# Total amount spent per category, summed over all customers
total_spending = {
    category: df[column_name].sum()
    for category, column_name in product_columns.items()
}

# Category with the largest total (the first one wins on a tie)
best_performing_product, highest_spending = max(
    total_spending.items(), key=lambda item: item[1]
)

print(f"The best-performing product category is {best_performing_product} with a total spending of ${highest_spending:,.2f}.")


In [None]:
import matplotlib.pyplot as plt

# Total amount spent per product category, recomputed so this cell can run
# on its own
product_columns = {
    'Wines': 'MntWines',
    'Fruits': 'MntFruits',
    'MeatProducts': 'MntMeatProducts',
    'FishProducts': 'MntFishProducts',
    'SweetProducts': 'MntSweetProducts',
    'GoldProducts': 'MntGoldProds',
}
total_spending = {
    category: df[column_name].sum()
    for category, column_name in product_columns.items()
}

# Bar chart: one bar per product category
plt.figure(figsize=(10, 6))
spending_ax = plt.gca()
spending_ax.bar(list(total_spending), list(total_spending.values()), color='skyblue')
spending_ax.set_xlabel('Product Category')
spending_ax.set_ylabel('Total Spending ($)')
spending_ax.set_title('Total Spending on Each Product Category')
plt.xticks(rotation=45)  # Tilt the category names for readability
plt.show()


# 6.Which channels are underperforming?

In [None]:
# Sales channel -> column counting the purchases made through it
channel_columns = {
    'Web': 'NumWebPurchases',
    'Catalog': 'NumCatalogPurchases',
    'Store': 'NumStorePurchases',
}

# Total purchases per channel, summed over all customers
total_purchases = {
    channel: df[column_name].sum()
    for channel, column_name in channel_columns.items()
}

# Channel with the fewest purchases (the first one wins on a tie)
underperforming_channel, lowest_purchases = min(
    total_purchases.items(), key=lambda item: item[1]
)

print(f"The underperforming channel is {underperforming_channel} with a total of {lowest_purchases} purchases.")


In [None]:
import matplotlib.pyplot as plt

# Total purchases per sales channel, recomputed so this cell can run on its
# own
channel_columns = {
    'Web': 'NumWebPurchases',
    'Catalog': 'NumCatalogPurchases',
    'Store': 'NumStorePurchases',
}
total_purchases = {
    channel: df[column_name].sum()
    for channel, column_name in channel_columns.items()
}

# Bar chart: one bar per channel
plt.figure(figsize=(8, 6))
channel_ax = plt.gca()
channel_ax.bar(list(total_purchases), list(total_purchases.values()), color='skyblue')
channel_ax.set_xlabel('Channel')
channel_ax.set_ylabel('Total Number of Purchases')
channel_ax.set_title('Total Number of Purchases Through Each Channel')
plt.xticks(rotation=45)  # Tilt the channel names for readability
plt.show()
