In [None]:
import os

import pandas as pd

In [None]:
# Load the dataset
# The CSV location can be overridden through the BANK_DATA_PATH environment variable so the
# notebook is not tied to a single machine; when the variable is unset or empty, the
# original local path is used.
DATA_PATH = os.environ.get("BANK_DATA_PATH") or (
    r"C:\Users\Sachini\Downloads\bank+marketing\bank-additional\bank-additional\bank-additional.csv"
)
df = pd.read_csv(DATA_PATH, sep=';')  # the file is semicolon-delimited

In [None]:
# Display basic info
# df.info() writes its report to stdout itself and returns None, so it must not be
# passed to print(): that would show the report first and then "Dataset Overview:" + "None".
print("Dataset Overview:")
df.info()

In [None]:
# Report, for each object-dtype column, how many entries equal the 'unknown' placeholder
print("\nMissing Values (including 'unknown'):\n")
text_columns = df.select_dtypes(include=['object']).columns
for name in text_columns:
    unknown_count = df[name].isin(['unknown']).sum()
    print(f"{name}: {unknown_count} missing values")

In [None]:
# Show the DataFrame as this cell's output (state before the "unknown" -> "missing" replace cell below)
df

In [None]:
# Relabel every "unknown" entry as "missing"; df itself is modified (inplace=True)
df.replace(to_replace="unknown", value="missing", inplace=True)

In [None]:
# Show the DataFrame again as this cell's output, after the replace() cell above
df

In [None]:
# Summary statistics: compute the describe() table first, then print it under a header
describe_table = df.describe()
print("\nSummary Statistics:\n", describe_table)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

# Split the columns by dtype: object columns are treated as categorical,
# int/float columns as numerical
categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols = df.select_dtypes(include=['int', 'float']).columns

# Build one describe() table per group
numerical_summary = df.loc[:, numerical_cols].describe().round(2)
# Transposed so that each original column becomes one row of the table
categorical_summary = df.loc[:, categorical_cols].describe().transpose()

# Function to create and save table as an image with styling
def save_table_as_image(summary_df, filename, title):
    """Render a summary DataFrame as a styled matplotlib table and save it to a file.

    Parameters
    ----------
    summary_df : pandas.DataFrame
        Table to render. Its index is used for the row labels and its columns
        for the header row.
    filename : str
        Destination path handed to ``Figure.savefig``.
    title : str
        Title drawn above the table.

    Raises
    ------
    ValueError
        If ``summary_df`` has no rows or no columns (there is nothing to draw,
        and the figure height would be zero).
    """
    n_rows, n_cols = summary_df.shape
    if n_rows == 0 or n_cols == 0:
        raise ValueError("summary_df must contain at least one row and one column")

    # Height scales with the number of rows (0.7 in per row) but is capped at 10 in
    fig, ax = plt.subplots(figsize=(15, min(10, n_rows * 0.7)))
    try:
        # Hide axes so only the table is visible
        ax.axis('tight')
        ax.axis('off')

        # Create the table with a styled background
        table = ax.table(cellText=summary_df.values, colLabels=summary_df.columns,
                         rowLabels=summary_df.index, loc='center', cellLoc='center', colLoc='center',
                         colColours=["#f4f4f4"] * n_cols,  # Light gray header background
                         rowColours=["#e6e6e6"] * n_rows)  # Uniform gray row-label background

        # Use a fixed font size instead of matplotlib's automatic sizing
        table.auto_set_font_size(False)
        table.set_fontsize(10)

        # Set the title
        ax.set_title(title, fontsize=12, fontweight="bold", pad=10)

        # Save this specific figure (not whatever happens to be "current")
        fig.savefig(filename, bbox_inches='tight', dpi=300)
    finally:
        # Always release the figure, even if building or saving the table failed
        plt.close(fig)

# Each entry: (summary table, output PNG, title drawn above the table)
summary_tables = [
    (numerical_summary, 'numerical_summary.png', "Numerical Summary Statistics"),
    (categorical_summary, 'categorical_summary.png', "Categorical Summary Statistics"),
]

# Write both table images first ...
for table_df, png_name, heading in summary_tables:
    save_table_as_image(table_df, png_name, heading)

# ... then show them in the same order
for _, png_name, _ in summary_tables:
    display(Image(png_name))

In [None]:
# Collapse the three 'basic.*' schooling levels into a single 'basic' category
education_mapping = dict.fromkeys(('basic.4y', 'basic.6y', 'basic.9y'), 'basic')

# Rewrite the 'education' column through the mapping
df['education'] = df['education'].replace(education_mapping)

# Show which education labels remain after the merge
remaining_levels = df['education'].unique()
print(remaining_levels)

In [None]:
# Count plot of education level, split by the target 'y'; bars ordered by overall frequency
fig, ax = plt.subplots(figsize=(12, 5))
levels_by_frequency = df['education'].value_counts().index
sns.countplot(data=df, x='education', hue='y', order=levels_by_frequency, ax=ax)
plt.xticks(rotation=90)  # vertical tick labels
ax.set_title('Subscription Rate by Education Level')

# Write the figure to disk before showing it
fig.savefig('subscription_rate_by_education.png', bbox_inches='tight', dpi=300)

plt.show()

In [None]:
# One figure, two side-by-side panels
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: histogram of age, x-axis fixed to 0-100
df['age'].hist(bins=30, ax=ax_hist)
ax_hist.set_xlim(0, 100)
ax_hist.set_title('Age Distribution')

# Right panel: age boxplot per value of 'y', y-axis fixed to 0-100
sns.boxplot(data=df, x='y', y='age', ax=ax_box)
ax_box.set_ylim(0, 100)
ax_box.set_title('Age by Subscription Status')

# Tighten the layout, then write the figure to disk
fig.tight_layout()
fig.savefig('age_distribution_and_boxplot.png', bbox_inches='tight', dpi=300)

plt.show()

In [None]:
# Single-panel figure for the scatter plot
fig, ax = plt.subplots(figsize=(10, 6))

# Scatter of duration against campaign with a red fitted regression line
sns.regplot(data=df, x='duration', y='campaign', scatter=True,
            line_kws={"color": "red"}, ax=ax)

# Title and axis labels
ax.set_title('Scatter Plot of Duration vs. Campaign with Trendline')
ax.set_xlabel('Duration')
ax.set_ylabel('Campaign')

# Write the figure to disk, then show it
fig.savefig('scatter_duration_vs_campaign_trendline.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Count the 'poutcome' values among rows where y == "yes".
# A single .loc selection is used instead of chained indexing (df[mask]["poutcome"]).
subscribed_clients = df.loc[df["y"] == "yes", "poutcome"].value_counts()

# Plot pie chart
plt.figure(figsize=(6, 6))
subscribed_clients.plot.pie(autopct="%1.1f%%", colormap="viridis", startangle=90)
plt.ylabel("")  # no y-axis label next to the pie
plt.title("Poutcome Distribution Among Subscribed Clients")

# Save the figure with the same resolution and tight bounding box as the other
# saved figures in this notebook (previously this one used matplotlib's defaults)
plt.savefig("poutcome_pie_chart.png", dpi=300, bbox_inches="tight")
plt.show()