In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/data.csv')

In [None]:
# Preview the first rows (head() defaults to five) to sanity-check the load.
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the raw dataset.
df.info()

In [None]:
# Descriptive statistics of the raw dataset (describe() summarises the
# numeric columns by default).
df.describe()

In [None]:
# Check the column names for stray spaces before selecting columns by name.
print(df.columns)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# List the distinct values of the target column bank_account
# (unique() also reports NaN when the column has missing entries).
df['bank_account'].unique()

In [None]:
# Create a new data frame without the rows whose target feature (bank_account)
# is NaN. `subset` limits the NaN check to the target column; a bare
# dropna(axis=0) would also discard rows that are only missing a value in
# some other feature.
df_wo_target_nan = df.dropna(subset=['bank_account'])

In [None]:
# Re-check dtypes and non-null counts on the NaN-filtered frame.
df_wo_target_nan.info()

In [None]:
# Descriptive statistics of the NaN-filtered frame.
df_wo_target_nan.describe()

In [None]:
# Transposed preview: each of the first rows becomes a column, so every
# feature is listed vertically.
df_wo_target_nan.head().T

In [None]:
# For every column, report how many distinct values it holds and list them,
# followed by a blank line to separate the columns in the output.
for column in df_wo_target_nan.columns:
    unique_values = df_wo_target_nan[column].unique()
    print(
        f"Column '{column}' has {len(unique_values)} unique value(s):",
        unique_values,
        "",
        sep="\n",
    )

In [None]:
from scipy.stats import chi2_contingency

### Test whether the categorical features (object dtype) are related to our target feature, using Cramér's V

#### Small Effect: 
Cramér's V values close to 0 (below about 0.1) indicate a weak or negligible association between the categorical variables.

#### Medium Effect: 
Cramér's V values around 0.1 to 0.3 suggest a moderate association. This indicates that the variables have some degree of dependency, but the association may not be very strong.

#### Large Effect: 
Cramér's V values of about 0.3 or higher indicate a relatively strong association between the categorical variables. This suggests a notable dependency or relationship between the variables.

In [None]:
# Feature vs bank_account: chi-square test of independence and Cramér's V.
#
# The same computation is applied to every feature below, so it lives in one
# helper instead of being repeated once per column.


def cramers_v_test(frame, column_name, target_column='bank_account'):
    """Run a chi-square independence test and compute Cramér's V for two columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data holding both columns.
    column_name : str
        Feature column; every distinct value is treated as one category.
    target_column : str, default 'bank_account'
        Column the feature is compared against.

    Returns
    -------
    tuple
        ``(chi2, p, cramers_v)``: the chi-square statistic, its p-value and
        Cramér's V. Cramér's V is NaN when either column has a single
        category, because the statistic is undefined for such a table.
    """
    # Create a contingency table
    contingency_table = pd.crosstab(frame[column_name], frame[target_column])

    # Perform chi-square test. SciPy applies Yates' continuity correction to
    # 2x2 tables by default, which shrinks chi2; Cramér's V is defined on the
    # uncorrected statistic, so the correction is switched off.
    chi2, p, _, _ = chi2_contingency(contingency_table, correction=False)

    # Calculate Cramér's V from the observations that are actually in the
    # table: crosstab leaves out rows with a missing value in either column,
    # so len(frame) could overcount.
    n = contingency_table.to_numpy().sum()
    min_dim = min(contingency_table.shape) - 1
    cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else np.nan

    return chi2, p, cramers_v


target_column = 'bank_account'
feature_columns = [
    'country',
    'year',
    'household_size',
    'age_of_respondent',
    'relationship_with_head',
    'marital_status',
    'education_level',
    'job_type',
]

for column_name in feature_columns:
    chi2, p, cramers_v = cramers_v_test(df_wo_target_nan, column_name, target_column)

    print(f"{column_name} vs {target_column}")
    print("Chi-square:", chi2)
    print("p-value:", p)
    print("Cramér's V:", cramers_v)
    print()

In [None]:
# Pairwise Cramér's V between every pair of columns of the NaN-filtered frame.
# NOTE(review): identifier-like columns (e.g. `uniqueid`, if every value is
# distinct) produce very large contingency tables and a Cramér's V that says
# little about real association; consider excluding them.
columns = df_wo_target_nan.columns
n_columns = len(columns)

# Initialize an empty matrix to store Cramér's V values
cramers_matrix = np.zeros((n_columns, n_columns))

# Cramér's V is symmetric, so each unordered pair is computed once and
# mirrored across the diagonal.
for i in range(n_columns):
    for j in range(i, n_columns):
        # Both columns come from the same filtered frame (the unfiltered `df`
        # must not be mixed in here).
        contingency_table = pd.crosstab(
            df_wo_target_nan.iloc[:, i], df_wo_target_nan.iloc[:, j]
        )

        # Yates' continuity correction (SciPy's default for 2x2 tables) is
        # disabled: Cramér's V is defined on the uncorrected chi-square, and
        # with the correction a binary column's association with itself comes
        # out just below 1.
        chi2, _, _, _ = chi2_contingency(contingency_table, correction=False)

        # Count the observations actually present in the table, since
        # crosstab leaves out rows with a missing value in either column.
        n = contingency_table.to_numpy().sum()
        min_dim = min(contingency_table.shape) - 1

        # Undefined (NaN) when one of the columns has a single category.
        cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else np.nan

        # Store the Cramér's V value in both mirrored cells
        cramers_matrix[i, j] = cramers_matrix[j, i] = cramers_v

# Create a DataFrame from the matrix with column names as indices and columns
cramers_df = pd.DataFrame(cramers_matrix, index=columns, columns=columns)

print(cramers_df)

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt


# Pairwise Cramér's V between every pair of columns, drawn as a heatmap.
# NOTE(review): this repeats the matrix computation of the previous cell; if
# that cell has already run, its `cramers_df` could be plotted directly.
columns = df_wo_target_nan.columns
n_columns = len(columns)

# Initialize an empty matrix to store Cramér's V values
cramers_matrix = np.zeros((n_columns, n_columns))

# Cramér's V is symmetric, so each unordered pair is computed once and
# mirrored across the diagonal.
for i in range(n_columns):
    for j in range(i, n_columns):
        # Both columns come from the same filtered frame (the unfiltered `df`
        # must not be mixed in here).
        contingency_table = pd.crosstab(
            df_wo_target_nan.iloc[:, i], df_wo_target_nan.iloc[:, j]
        )

        # Yates' continuity correction (SciPy's default for 2x2 tables) is
        # disabled because Cramér's V is defined on the uncorrected chi-square.
        chi2, _, _, _ = chi2_contingency(contingency_table, correction=False)

        # Count the observations actually present in the table, since
        # crosstab leaves out rows with a missing value in either column.
        n = contingency_table.to_numpy().sum()
        min_dim = min(contingency_table.shape) - 1

        # Undefined (NaN) when one of the columns has a single category.
        cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else np.nan

        # Store the Cramér's V value in both mirrored cells
        cramers_matrix[i, j] = cramers_matrix[j, i] = cramers_v

# Create a DataFrame from the matrix with column names as indices and columns
cramers_df = pd.DataFrame(cramers_matrix, index=columns, columns=columns)

# Create a heatmap using Seaborn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cramers_df, annot=True, fmt=".2f", cmap="crest", square=True, ax=ax)
ax.set_title("Cramér's V Heatmap")
plt.show()

In [None]:
# Pasted text output kept for reference: one value per column under the
# header `bank_account`.
# NOTE(review): this looks like the bank_account column of the Cramér's V
# matrix from an earlier run — confirm, and consider moving it to a markdown
# cell so it is not evaluated as a string expression.
'''                        bank_account  
country                     0.188737  
year                        0.144151  
uniqueid                    0.594507  
location_type               0.087163  
cellphone_access            0.209529  
household_size              0.068803  
age_of_respondent           0.139979  
gender_of_respondent        0.117110  
relationship_with_head      0.118065  
marital_status              0.090647  
education_level             0.388423  
job_type                    0.359027  
bank_account                0.999824'''