## EDA ##
1. data sourcing
2. data cleaning 
3. univariate analysis
4. bivariate analysis
5. derived metrics



In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


loading dataset

In [None]:
# Read the raw CSV that sits next to the notebook into a DataFrame.
data_source = pd.read_csv(filepath_or_buffer="./dataset.csv")

In [None]:
# First look at the data: show the first five rows.
data_source.head(n=5)

In [None]:
# Show the last five rows to check how the file ends.
data_source.tail(n=5)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
data_source.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data_source.describe()

In [None]:
# Column labels of the dataset.
data_source.columns

In [None]:
# Distinct values that occur in the 'gender' column.
data_source['gender'].unique()

In [None]:
# Number of distinct values in every column. nunique is a method, so it must
# be called; without the parentheses the cell only shows the bound method.
data_source.nunique()

# data cleaning #


handling some missing values

In [None]:
# Number of missing (NaN) entries in each column.
data_source.isnull().sum()

In [None]:
# Preview the rows that contain no missing values. dropna() returns a new
# DataFrame and the result is not assigned, so data_source itself is unchanged.
data_source.dropna()

In [None]:
# Per-column count of missing values, taken before any replacement is done
print(data_source.isna().sum())



In [28]:
# Impute missing 'MonthlyCharges' entries with the average of the column
data_source['MonthlyCharges'] = data_source['MonthlyCharges'].fillna(
    value=data_source['MonthlyCharges'].mean()
)


In [None]:
# Re-count the missing values per column to confirm the replacement worked
print(data_source.isna().sum())

In [None]:

# Impute missing values (NaN) across the whole DataFrame:
#   * numeric columns     -> the column mean
#   * categorical columns -> the column mode (its most frequent value),
#     using pandas' fillna() together with mode()

# Check for missing values before anything is replaced
print("Missing values before:\n", data_source.isnull().sum())

# Replace NaN in numeric columns with mean.
# select_dtypes() filters by dtype, not by column name, so the numeric columns
# are selected with 'number'; 'TotalCharges' is covered whenever it is numeric.
numeric_means = data_source.select_dtypes(include=['number']).mean()
data_source = data_source.fillna(numeric_means)

# Replace NaN with the mode for all categorical columns ('gender' included)
categorical_columns = data_source.select_dtypes(include=['object']).columns
for col in categorical_columns:
    col_modes = data_source[col].mode()
    # A column that is entirely NaN has no mode; leave it untouched
    if not col_modes.empty:
        data_source[col] = data_source[col].fillna(col_modes[0])

# Check for missing values after replacement
print("\nMissing values after:\n", data_source.isnull().sum())

# Print updated DataFrame
print("\nUpdated DataFrame:\n", data_source)

In [None]:
# Working copy without the 'customerID' identifier column; preview five rows.
data = data_source.drop(columns=['customerID'])
data.head(n=5)

outlier treatment

In [None]:
# Fixed-threshold check (example cut-off): rows whose 'MonthlyCharges' exceed 200
outliers = data_source.loc[data_source['MonthlyCharges'].gt(200)]
print("Outliers:\n", outliers)

In [None]:
# Z-score of every 'MonthlyCharges' value: distance from the column mean,
# measured in standard deviations
z_scores = (
    data_source['MonthlyCharges']
    .sub(data_source['MonthlyCharges'].mean())
    .div(data_source['MonthlyCharges'].std())
)

# Rows lying more than three standard deviations from the mean
outliers = data_source.loc[z_scores.abs().gt(3)]
print("Outliers:\n", outliers)

# Keep only the rows whose |Z-score| is at most 3
filtered_data = data_source.loc[z_scores.abs().le(3)]

# Show the data before and after the filter
print("Original Data:\n", data_source)
print("\nFiltered Data (Outliers Removed):\n", filtered_data)

In [None]:
# Quartiles of 'MonthlyCharges' and the inter-quartile range between them
Q1, Q3 = (data_source['MonthlyCharges'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences placed 1.5 * IQR beyond each quartile
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows that fall outside the fences
outliers = data_source.loc[
    data_source['MonthlyCharges'].lt(lower_bound)
    | data_source['MonthlyCharges'].gt(upper_bound)
]
print("Outliers:\n", outliers)

# Rows that fall inside the fences (both ends inclusive)
filtered_data = data_source.loc[
    data_source['MonthlyCharges'].between(lower_bound, upper_bound)
]

# Show the data before and after the filter
print("Original Data:\n", data_source)
print("\nFiltered Data (Outliers Removed):\n", filtered_data)


In [None]:
import matplotlib.pyplot as plt

# Plot boxplot for 'MonthlyCharges'
# vert=False draws the box horizontally; points beyond the whiskers are the
# candidate outliers.
plt.boxplot(data_source['MonthlyCharges'], vert=False)
plt.title('Box Plot for MonthlyCharges')
plt.show()


In [None]:
# Scatter plot to visualize outliers
# Each 'MonthlyCharges' value is plotted against its row index, so isolated
# points far from the main band stand out.
plt.scatter(data_source.index, data_source['MonthlyCharges'])
plt.title('Scatter Plot of MonthlyCharges')
plt.xlabel('Index')
plt.ylabel('MonthlyCharges')
plt.show()


handling invalid values

In [None]:
# Function to check validity
def validate_data(data, valid_payment_methods=None):
    """Run basic sanity checks on the dataset and report what failed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that has a 'MonthlyCharges' and a 'PaymentMethod' column.
    valid_payment_methods : iterable of str, optional
        Labels accepted in 'PaymentMethod'. When omitted, the example set
        {'Credit Card', 'Cash', 'Online'} is used.

    Returns
    -------
    list of str
        One message per failed check; empty when every check passes.
    """
    if valid_payment_methods is None:
        valid_payment_methods = {'Credit Card', 'Cash', 'Online'}
    allowed = set(valid_payment_methods)

    errors = []

    # A charge can never be negative
    if (data['MonthlyCharges'] < 0).any():
        errors.append("Negative values found in 'MonthlyCharges'")

    # Every label that occurs must belong to the allowed set
    observed = set(data['PaymentMethod'].unique())
    if not observed.issubset(allowed):
        errors.append("Unexpected values in 'PaymentMethod'")

    return errors

# Validate the current dataset and show whatever problems were detected
validation_errors = validate_data(data=data_source)
print("Validation Errors:", validation_errors)


In [None]:
# Drop rows that are exact repeats of an earlier row, keeping the first one
data_source = data_source.drop_duplicates(keep='first')



In [None]:
# Example: Replace inconsistent labels
# Trim whitespace and lower-case first so variants such as ' Male' collapse to
# one spelling, then map to the short codes. The result is assigned back to
# the column: an inplace replace() on a column selection is a chained
# assignment, which warns on recent pandas and does nothing under Copy-on-Write.
data_source['gender'] = (
    data_source['gender']
    .str.strip()
    .str.lower()
    .replace({'male': 'M', 'female': 'F'})
)

In [None]:
# Convert 'date_column' to datetime, mark invalid dates as NaT
# NOTE(review): 'date_column' looks like a placeholder name — confirm the
# dataset really has this column, otherwise the lookup raises KeyError.
data_source['date_column'] = pd.to_datetime(data_source['date_column'], errors='coerce')


univariate analysis


In [None]:
# Summary statistics for numerical variables
# (count, mean, std, min, quartiles and max of each column)
print(data_source.describe())


In [None]:
# Frequency counts for categorical variables
# Number of rows per distinct 'PaymentMethod' value.
print(data_source['PaymentMethod'].value_counts())


In [None]:
import matplotlib.pyplot as plt
# Histogram of 'MonthlyCharges' split into 30 bins; the black edges keep
# neighbouring bars visually separate.
data_source['MonthlyCharges'].hist(bins=30, edgecolor='black')
plt.title('Distribution of Monthly Charges')
plt.xlabel('Monthly Charges')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Box plot of 'MonthlyCharges' drawn through the DataFrame plotting helper.
data_source.boxplot(column='MonthlyCharges')
plt.title('Box Plot of Monthly Charges')
plt.show()


In [None]:
# Bar chart with one bar per 'PaymentMethod' value; bar height is the row count.
data_source['PaymentMethod'].value_counts().plot(kind='bar')
plt.title('Frequency of Payment Methods')
plt.xlabel('Payment Method')
plt.ylabel('Count')
plt.show()


In [None]:
# IQR rule on 'MonthlyCharges': values more than 1.5 * IQR beyond the
# quartiles are listed as outliers.
Q1, Q3 = (data_source['MonthlyCharges'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

outliers = data_source.loc[
    data_source['MonthlyCharges'].lt(lower_bound)
    | data_source['MonthlyCharges'].gt(upper_bound)
]
print(outliers)


In [None]:
import seaborn as sns

# Plot original data
# kde=True overlays a smoothed density curve on the histogram.
sns.histplot(data_source['MonthlyCharges'], kde=True)
plt.title('Original Distribution of Monthly Charges')
plt.show()

# After transformation (e.g., log)
# np.log1p computes log(1 + x), so a charge of 0 stays defined.
sns.histplot(np.log1p(data_source['MonthlyCharges']), kde=True)
plt.title('Transformed Distribution of Monthly Charges')
plt.show()


In [None]:
# Payment-method counts ordered from most to least frequent.
print(data_source['PaymentMethod'].value_counts().sort_values(ascending=False))


In [None]:

# Payment-method counts with missing values (NaN) counted as their own category.
print(data_source['PaymentMethod'].value_counts(dropna=False))


In [None]:
# Payment-method shares: each count divided by the total, so values sum to 1.
print(data_source['PaymentMethod'].value_counts(normalize=True))


bivariate analysis

Common Techniques for Bivariate Analysis:
Numerical vs Numerical:

Pearson's Correlation: Quantifies the linear relationship.
Scatter Plot: Visualizes the relationship between two continuous variables.
Line Plot: If one variable is time-based, you can use a line plot.
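Pair Plot: Draws a scatter plot for every pair of numerical variables.
Heat Map: Colours each cell of the correlation matrix by the strength of the correlation.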


Categorical vs Categorical:

Crosstab / Contingency Table: Shows the frequency distribution of the two categorical variables.
Stacked Bar Plot: Visualizes how one categorical variable is distributed across another.


Numerical vs Categorical:

Box Plot: Compares the distribution of numerical data across different categories.
Violin Plot: Similar to a box plot, but with a smoothed distribution.
Bar Plot: To show the average of the numerical variable for each category.
Facet Grid: Repeats the same plot in separate panels, one per category.


In [None]:
# FacetGrid of 'MonthlyCharges' histograms, one panel per 'Churn' value
# (only 'Churn' is used for faceting; 'PaymentMethod' plays no part here)
g = sns.FacetGrid(data_source, col='Churn', height=5)
# Draw a histogram with a KDE overlay in every panel
g.map(sns.histplot, 'MonthlyCharges', kde=True)
g.set_axis_labels('Monthly Charges', 'Frequency')
# {col_name} is filled in with the 'Churn' value of each panel
g.set_titles('Churn: {col_name}')
plt.show()


In [None]:
# Pairwise Pearson correlation between the numeric columns of `data`.
# String columns are excluded first: calling corr() on a frame that still
# holds them raises on pandas 2.x.
correlation_matrix = data.select_dtypes(include=['number']).corr()
print(correlation_matrix['MonthlyCharges'])  # Shows correlation between 'MonthlyCharges' and other numerical features


In [None]:
from sklearn.preprocessing import LabelEncoder


# Step 1: Identify non-numeric columns
non_numeric_columns = data_source.select_dtypes(exclude=['number']).columns
print("Non-numeric columns:", non_numeric_columns)

# Step 2: Convert the categorical column to numeric codes.
# This has to happen before the numeric columns are selected, otherwise the
# encoded column never reaches the correlation matrix.
# NOTE: the codes overwrite the original labels in data_source, and they are
# arbitrary integers, so correlations involving them need careful reading.
label_encoder = LabelEncoder()
data_source['PaymentMethod'] = label_encoder.fit_transform(data_source['PaymentMethod'])

# Step 3: Keep only the numeric columns (now including 'PaymentMethod')
data_numeric = data_source.select_dtypes(include=['number'])

# Step 4: Compute the correlation matrix on the numeric data
correlation_matrix = data_numeric.corr()
print(correlation_matrix)


In [51]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

In [None]:
# Plotting the heatmap of correlation matrix
plt.figure(figsize=(10, 8))
# annot=True writes each coefficient into its cell, formatted by fmt to two
# decimals; 'coolwarm' is a diverging blue-to-red colour map.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# 2. Scatter Plot for numerical variables
# One point per row: 'MonthlyCharges' on the x-axis, 'TotalCharges' on the y-axis.
sns.scatterplot(x='MonthlyCharges', y='TotalCharges', data=data_source)
plt.title('Scatter Plot: Monthly Charges vs Total Charges')
plt.xlabel('Monthly Charges')
plt.ylabel('Total Charges')
plt.show()

In [None]:
# 3. Contingency table: 'PaymentMethod' values as rows, 'Churn' values as
# columns, each cell holding the number of matching rows
crosstab = pd.crosstab(
    index=data_source['PaymentMethod'],
    columns=data_source['Churn'],
)
print(crosstab)

In [None]:
# Stacked bar plot for categorical vs categorical
# One bar per row of the crosstab; stacked=True piles its columns on top of
# each other inside the bar.
crosstab.plot(kind='bar', stacked=True)
plt.title('Payment Method vs Churn')
plt.xlabel('Payment Method')
plt.ylabel('Frequency')
plt.show()

In [None]:
# 4. Box Plot for numerical vs categorical
# One box of 'MonthlyCharges' per 'Churn' value, side by side for comparison.
sns.boxplot(x='Churn', y='MonthlyCharges', data=data_source)
plt.title('Monthly Charges by Churn Status')
plt.show()

In [None]:


# 5. Chi-square test of independence between 'PaymentMethod' and 'Churn',
# run on their contingency table
crosstab_churn = pd.crosstab(
    index=data_source['PaymentMethod'],
    columns=data_source['Churn'],
)
chi2, p, dof, expected = chi2_contingency(observed=crosstab_churn)
print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")



In [None]:
# Pair plot of the three selected columns: a scatter plot for every pair and
# a distribution plot on the diagonal.
# NOTE(review): 'TotalCharges' is only coerced to numeric in a later cell; if
# it loads as text, that conversion probably has to run first — confirm.
sns.pairplot(data_source[['MonthlyCharges', 'TotalCharges', 'tenure']])
plt.show()


In [None]:
# Violin plot: distribution of 'MonthlyCharges' for each 'Churn' value.
sns.violinplot(x='Churn', y='MonthlyCharges', data=data_source)
plt.title('Violin Plot: Monthly Charges by Churn')
plt.show()

In [None]:
# Heatmap of correlation_matrix on a 10x8 inch figure; each cell is annotated
# with its coefficient rounded to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Stacked bar chart of the crosstab: one bar per row, its columns stacked
# inside the bar.
crosstab.plot(kind='bar', stacked=True)
plt.title('Payment Method vs Churn')
plt.xlabel('Payment Method')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Countplot for 'PaymentMethod' to see the distribution of each category
sns.countplot(x='PaymentMethod', data=data_source)
plt.title('Count of Payment Method Categories')
plt.xlabel('Payment Method')
plt.ylabel('Count')
plt.show()


In [None]:
# KDE Plot for 'MonthlyCharges'
sns.kdeplot(data_source['MonthlyCharges'], shade=True)
plt.title('KDE of Monthly Charges')
plt.xlabel('Monthly Charges')
plt.ylabel('Density')
plt.show()


In [None]:
# Descriptive statistics for 'MonthlyCharges'
print(data_source['MonthlyCharges'].describe())


In [None]:
# Pie chart for 'PaymentMethod'
# NOTE(review): if the label-encoding cell ran before this one, the slices are
# labelled with integer codes instead of the method names — confirm intent.
payment_method_counts = data_source['PaymentMethod'].value_counts()
# autopct prints each slice's share as a percentage with one decimal
payment_method_counts.plot.pie(autopct='%1.1f%%', startangle=90, cmap='Set3', figsize=(6,6))
plt.title('Distribution of Payment Methods')
# Blank out the y-axis label
plt.ylabel('')
plt.show()


In [73]:
# Force both charge columns to a numeric dtype; any entry that cannot be
# parsed as a number becomes NaN
for charge_col in ('MonthlyCharges', 'TotalCharges'):
    data_source[charge_col] = pd.to_numeric(data_source[charge_col], errors='coerce')


In [74]:
# 2x2 covariance matrix of the two charge columns
covariance = data_source.loc[:, ['MonthlyCharges', 'TotalCharges']].cov()
print(covariance)


                MonthlyCharges  TotalCharges
MonthlyCharges      905.410934  4.440133e+04
TotalCharges      44401.333073  5.138252e+06
