## Import Libraries

In [None]:

import matplotlib.pyplot as plt
import pathlib
import os
import pandas as pd
from collections import Counter
import importlib.util


## Import Data Files

In [None]:
# Both the constants module and the dataset directory are resolved relative
# to the parent of the current working directory.
project_root = os.path.dirname(os.getcwd())
source_file_path = os.path.join(project_root, 'parent', 'constants', '__init__.py')

# Load the constants module straight from its file path.
spec = importlib.util.spec_from_file_location('__init__', source_file_path)
source_file = importlib.util.module_from_spec(spec)
spec.loader.exec_module(source_file)

# Walk the dataset directory (named by the constants module) and keep the
# full path of every file whose extension is exactly '.csv'.
dataset_root = os.path.join(project_root, source_file.DATASET_DIR)
path = []
for dirname, _, filenames in os.walk(dataset_root):
    path.extend(
        os.path.join(dirname, filename)
        for filename in filenames
        if pathlib.Path(filename).suffix == '.csv'
    )

##### Read the imported files

In [None]:
# Pick out the training CSV (its file name comes from the constants module)
# from the list of discovered CSV paths.
train_set_files = [
    csv_path for csv_path in path
    if os.path.basename(csv_path) == source_file.TRAIN_SET
]

# Fail here with a clear message instead of leaving `train_set` undefined,
# which would only surface later as a NameError in another cell.
if not train_set_files:
    raise FileNotFoundError(
        f"No file named {source_file.TRAIN_SET!r} was found among the discovered CSV files"
    )

# If several files share that name, the last one discovered is used; this is
# the same file the previous loop ended up with, without reading the others.
train_set = pd.read_csv(train_set_files[-1])
   

## Exploratory Data Analysis (EDA)

##### First, get a brief idea of the data, i.e. its features

In [None]:
train_set.head(n=2)  # preview the first two rows of the dataset

##### Next, gather insights into the data, i.e. null values

In [None]:
# Print each column's dtype and entry count; useful for spotting columns
# that contain null values.
train_set.info()

# Visual divider after the report.
separator = "=" * 80
print("\n", separator, "\n")


##### Now find the actual percentage of null values for every feature

In [None]:
# Percentage of null values per feature: null count scaled by 100 and divided
# by the total number of rows.
null_counts = train_set.isnull().sum()
null_percentages = 100 * null_counts / len(train_set)
print(null_percentages)

# Visual divider after the report.
separator = "=" * 80
print("\n", separator, "\n")


##### Get better insight into the data, i.e. mean, standard deviation, percentiles, etc.

In [None]:
# Summary statistics of the data (count, mean, std, min, percentiles, max).
summary_statistics = train_set.describe()
print(summary_statistics)

# Visual divider after the report.
separator = "=" * 80
print("\n", separator, "\n")


##### Make a copy of the original data

In [None]:
# Keep the loaded frame as-is and do all further work on a separate copy.
train_set_mod = train_set.copy(deep=True)


## Data Visualisation

In [None]:


# Frequency of each distinct value in the categorical columns of interest.
# The actual column names come from the constants module.
gender_counts = train_set_mod[source_file.COLUMN2_ENCODE].value_counts()
location_counts = train_set_mod[source_file.COLUMN1_ENCODE].value_counts()
credit_card_counts = train_set_mod[source_file.PLOT1].value_counts()
active_member_counts = train_set_mod[source_file.PLOT2].value_counts()
exited_member_counts = train_set_mod[source_file.PLOT3].value_counts()

# One (counts, label) entry per bar chart, in display order (row by row).
# Each panel differs only in its data and label, so they share one loop
# instead of five copy-pasted stanzas.
count_panels = [
    (gender_counts, "Gender"),
    (location_counts, "Location"),
    (credit_card_counts, "Credit Card Holder"),
    (active_member_counts, "Active Member"),
    (exited_member_counts, "Exited Member"),
]

# Figure with 3 rows and 2 columns of subplots, flattened row-major so the
# panels can be paired with the grid cells in order.
fig, axes = plt.subplots(3, 2, figsize=(8, 10))
flat_axes = axes.ravel()

for ax, (counts, label) in zip(flat_axes, count_panels):
    counts.plot(kind='bar', ax=ax)
    ax.set_xlabel(label)
    ax.set_ylabel("Count")
    ax.set_title(f"Count of {label}")
    ax.grid(axis="y", linestyle="-.")

# Hide every grid cell that did not receive a panel (here only the last one).
for unused_ax in flat_axes[len(count_panels):]:
    unused_ax.axis('off')

# Adjust spacing between subplots so labels and titles do not overlap.
plt.tight_layout()

plt.show()

In [None]:

# (column, label) for each histogram, left to right.
histogram_panels = (
    ('Age', "Age"),
    ('CreditScore', "Credit Score"),
)

# One row with two side-by-side subplots.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for ax, (column, label) in zip(axes, histogram_panels):
    # 20-bin histogram of the column's values.
    ax.hist(train_set_mod[column], bins=20, color='skyblue', edgecolor='black')
    ax.set_xlabel(label)
    ax.set_ylabel("Frequency")
    ax.set_title(f"{label} Distribution")
    ax.grid(axis="y", linestyle="-.")

plt.tight_layout()
plt.show()