In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation notices from pandas/seaborn - confirm that is intended.
warnings.filterwarnings("ignore")

In [10]:
# Let's define some functions for EDA
# Importing libraries, helper functions, loading files and transformations identified during EDA

from IPython.core.interactiveshell import InteractiveShell #

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import f_oneway, pointbiserialr, pearsonr, spearmanr # statistical analysis

from sklearn.preprocessing import LabelEncoder,OneHotEncoder, MinMaxScaler # transformation

from tabulate import tabulate # tabulate printing

import seaborn as sns # plots
import matplotlib.pyplot as plt # plots

import warnings #

# Settings for the Jupyter environment

# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Switch off FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

#Helper functions

#anova Test
def anova_test(feature, target='Premium Amount', data=None):
    """Run a one-way ANOVA of ``target`` across the levels of ``feature``.

    Only training rows (``src == 'trn'``) in which both ``feature`` and
    ``target`` are present are used. The F-statistic, the p-value and a
    verdict at the 5% level are printed; nothing is returned.

    Args:
        feature: Name of the categorical column that defines the groups.
        target: Name of the numeric column compared across the groups.
        data: DataFrame to analyse. When omitted, the notebook-level ``df``
            is used.
    """
    frame = df if data is None else data

    # A missing target value would turn the whole statistic into NaN, which
    # the verdict below would then report as "no significant", so drop it.
    usable = (frame['src'] == 'trn') & frame[feature].notna() & frame[target].notna()
    df_lcl = frame.loc[usable]

    # One array of target values per level; empty levels (possible with a
    # categorical dtype) carry no information and are skipped.
    groups = [
        level_rows[target].values
        for _, level_rows in df_lcl.groupby(feature)
        if len(level_rows) > 0
    ]

    # f_oneway needs at least two samples; report instead of raising.
    if len(groups) < 2:
        print(f"ANOVA skipped: {feature} needs at least two groups with {target} values.")
        return

    # Perform ANOVA
    f_stat, p_value = f_oneway(*groups)

    print(f"F-statistic: {f_stat:.4f}")
    print(f"P-value: {p_value:.4f}")

    if p_value < 0.05:
        print(f"There is a \033[1msignificant correlation\033[0m between the {feature} and {target} variables.")
    else:
        print(f"There is \033[1mno significant\033[0m correlation between the {feature} and {target} variables.")

# Pearson correlation
def pearson_correlation(feature, target='Premium Amount', data=None):
    """Print the Pearson correlation between ``feature`` and ``target``.

    Only training rows (``src == 'trn'``) in which both columns are present
    are used. The coefficient, the p-value, a strength label (``high`` for
    |r| >= 0.8, ``moderate`` for |r| >= 0.5, otherwise ``weak``) and a
    significance verdict at the 5% level are printed; nothing is returned.

    Args:
        feature: Name of the numeric column to correlate with the target.
        target: Name of the numeric target column.
        data: DataFrame to analyse. When omitted, the notebook-level ``df``
            is used.
    """
    frame = df if data is None else data

    # Rows with a missing value on either side cannot contribute; keeping
    # them would make the coefficient NaN.
    usable = (frame['src'] == 'trn') & frame[feature].notna() & frame[target].notna()
    df_lcl = frame.loc[usable]

    # pearsonr is undefined for fewer than two observations.
    if len(df_lcl) < 2:
        print(f"Pearson correlation skipped: fewer than two complete rows for {feature} and {target}.")
        return

    # Calculate Pearson correlation
    correlation, p_value = pearsonr(df_lcl[feature].values, df_lcl[target].values)

    # Categorize the correlation strength
    magnitude = abs(correlation)
    if magnitude >= 0.8:
        strength = "high"
    elif magnitude >= 0.5:
        strength = "moderate"
    else:
        strength = "weak"

    # Print results
    print(f"Pearson Correlation Coefficient: {correlation:.4f}")
    print(f"P-value: {p_value:.4f}")
    print(f"The correlation is \033[1m{strength}\033[0m.")

    # Check statistical significance
    if p_value < 0.05:
        print("The correlation is statistically \033[1msignificant\033[0m.")
    else:
        print("The correlation is \033[1mNOT\033[0m statistically significant.")

# Spearman Correlation
def spearman_correlation(feature, target='Premium Amount', data=None):
    """Print Spearman's rank correlation between ``feature`` and ``target``.

    Only training rows (``src == 'trn'``) in which both columns are present
    are used. The coefficient, the p-value, a strength label (``strong`` for
    |rho| >= 0.8, ``moderate`` for |rho| >= 0.5, otherwise ``weak``) and a
    significance verdict at the 5% level are printed; nothing is returned.

    Args:
        feature: Name of the ordinal/numeric column to correlate.
        target: Name of the numeric target column.
        data: DataFrame to analyse. When omitted, the notebook-level ``df``
            is used.
    """
    frame = df if data is None else data

    # Rows with a missing value on either side cannot be ranked against each
    # other; keeping them would make the coefficient NaN.
    usable = (frame['src'] == 'trn') & frame[feature].notna() & frame[target].notna()
    df_lcl = frame.loc[usable]

    # A rank correlation needs at least two observations.
    if len(df_lcl) < 2:
        print(f"Spearman correlation skipped: fewer than two complete rows for {feature} and {target}.")
        return

    # Calculate Spearman's rank correlation
    correlation, p_value = spearmanr(df_lcl[feature].values, df_lcl[target].values)

    # Categorize the correlation strength
    magnitude = abs(correlation)
    if magnitude >= 0.8:
        strength = "strong"
    elif magnitude >= 0.5:
        strength = "moderate"
    else:
        strength = "weak"

    # Print results
    print(f"Spearman Correlation Coefficient: {correlation:.4f}")
    print(f"P-value: {p_value:.4f}")
    print(f"The correlation is \033[1m{strength}\033[0m.")

    # Check if correlation is impactful
    if p_value < 0.05:
        print("The correlation is statistically significant (\033[1mimpactful\033[0m).")
    else:
        print("The correlation is \033[1mNOT\033[0m statistically significant (not impactful).")

# Point biserial correlation
def point_biserial_correlation(feature, target='Premium Amount', data=None):
    """Print the point-biserial correlation between a binary ``feature`` and ``target``.

    Only training rows (``src == 'trn'``) in which both columns are present
    are used. The two labels of ``feature`` are encoded as 0 and 1 in sorted
    label order. The coefficient, the p-value and a verdict at the 5% level
    are printed; nothing is returned.

    Args:
        feature: Name of the binary column.
        target: Name of the numeric target column.
        data: DataFrame to analyse. When omitted, the notebook-level ``df``
            is used.
    """
    frame = df if data is None else data

    # A missing target value would turn the coefficient into NaN.
    usable = (frame['src'] == 'trn') & frame[feature].notna() & frame[target].notna()
    df_lcl = frame.loc[usable]

    # Encode the labels as integer codes in sorted label order (0, 1).
    codes, labels = pd.factorize(df_lcl[feature], sort=True)

    # The statistic is only defined for a two-level variable; the feature can
    # collapse to a single level once it is restricted to complete training rows.
    if len(labels) != 2:
        print(f"\nPoint-biserial correlation skipped: {feature} has {len(labels)} distinct value(s) in the training rows, expected 2.\n")
        return

    # Calculate point-biserial correlation
    correlation, p_value = pointbiserialr(codes, df_lcl[target].values)

    print(f"\nPoint-Biserial Correlation: {correlation:.4f}")
    print(f"P-value: {p_value:.4f}")

    if p_value < 0.05:
        print(f"There is a \033[1msignificant correlation\033[0m between the binary {feature} and '{target}' variables.\n")
    else:
        print("There is \033[1mNO\033[0m significant correlation.\n")

# Function to generate EDA for Ordinal features
def analyse_ordinal(feature):
    """Print and plot a short EDA report for an ordinal feature.

    The report reads the notebook-level ``df`` and consists of:
      1. a grid table with the percentage share of every level of
         ``feature`` within each ``src`` group,
      2. the Spearman correlation summary printed by ``spearman_correlation``,
      3. a violin plot of 'Premium Amount' per level for the training rows.

    Args:
        feature: Name of the ordinal column in ``df``.
    """
    # Value distribution
    print('\n')
    pivot = df.pivot_table(index='src', columns=feature, aggfunc='size', fill_value=0)

    # Row-wise shares in percent. They are deliberately NOT rounded to whole
    # numbers first: the two-decimal rendering below would otherwise always
    # end in ".00%".
    pivot_percentage = pivot.div(pivot.sum(axis=1), axis=0) * 100

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use whichever the installed version provides.
    if hasattr(pivot_percentage, 'map'):
        elementwise = pivot_percentage.map
    else:
        elementwise = pivot_percentage.applymap
    pivot_percentage_with_symbol = elementwise(lambda x: f"{x:.2f}%")

    print(tabulate(pivot_percentage_with_symbol, headers='keys', tablefmt='grid'))

    # Correlation
    print('\n')
    spearman_correlation(feature)
    print('\n')

    # Violin plot of the target for every level of the feature (train only)
    plt.figure(figsize=(20, 5))
    sns.violinplot(x=feature, y='Premium Amount', data=df.loc[df.src == 'trn', :])
    plt.title(f'Premium amount vs {feature}')

    # Adjust layout to avoid overlap
    plt.tight_layout()

    # Show the plot
    plt.show()


# Function to generate EDA for categorical features
def analyse_categorical(feature):
    """Print and plot a short EDA report for a categorical feature.

    The report reads the notebook-level ``df`` and consists of:
      1. a grid table with the percentage share of every category of
         ``feature`` within each ``src`` group,
      2. an association test against the target: point-biserial correlation
         when ``feature`` has exactly two distinct values in ``df``,
         one-way ANOVA otherwise,
      3. a violin plot of 'Premium Amount' per category for the training rows.

    Args:
        feature: Name of the categorical column in ``df``.
    """
    # Value distribution
    print('\n')
    pivot = df.pivot_table(index='src', columns=feature, aggfunc='size', fill_value=0)

    # Row-wise shares in percent. They are deliberately NOT rounded to whole
    # numbers first: the two-decimal rendering below would otherwise always
    # end in ".00%".
    pivot_percentage = pivot.div(pivot.sum(axis=1), axis=0) * 100

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use whichever the installed version provides.
    if hasattr(pivot_percentage, 'map'):
        elementwise = pivot_percentage.map
    else:
        elementwise = pivot_percentage.applymap
    pivot_percentage_with_symbol = elementwise(lambda x: f"{x:.2f}%")

    print(tabulate(pivot_percentage_with_symbol, headers='keys', tablefmt='grid'))

    # Correlation: binary features get point-biserial, everything else ANOVA
    print('\n')
    if df[feature].nunique() == 2:
        point_biserial_correlation(feature)
    else:
        anova_test(feature)
    print('\n')

    # Violin plot of the target for every category of the feature (train only)
    plt.figure(figsize=(20, 5))
    sns.violinplot(x=feature, y='Premium Amount', data=df.loc[df.src == 'trn', :])
    plt.title(f'Premium amount vs {feature}')

    # Adjust layout to avoid overlap
    plt.tight_layout()

    # Show the plot
    plt.show()
    
# Function to generate EDA for continuous features
def analyse_continues(feature):
    """Print and plot a short EDA report for a continuous feature.

    Reads the notebook-level ``df``. First prints the summary produced by
    ``pearson_correlation``, then draws one figure: histograms (with KDE) of
    ``feature`` for the train and the test rows on the top row, and a scatter
    plot of 'Premium Amount' against ``feature`` for the train rows below.

    Args:
        feature: Name of the continuous column in ``df``.
    """
    # Correlation summary, framed by blank lines
    print('\n')
    pearson_correlation(feature)
    print('\n')

    # One canvas on a 2x2 grid: two histograms on top, one wide scatter below
    canvas = plt.figure(figsize=(20, 8))
    grid = canvas.add_gridspec(2, 2)

    # Top row: distribution of the feature per data source
    for position, (source_tag, source_label) in enumerate((('trn', 'Train'), ('tst', 'Test'))):
        hist_axis = canvas.add_subplot(grid[0, position])
        sns.histplot(df.loc[df.src == source_tag, feature], kde=True, ax=hist_axis)
        hist_axis.set_title(f'Histogram of {feature} in {source_label}')

    # Bottom row, spanning both columns: target against feature on train rows
    scatter_axis = canvas.add_subplot(grid[1, :])
    sns.scatterplot(
        x=feature,
        y='Premium Amount',
        data=df.loc[df.src == 'trn', :],
        ax=scatter_axis,
    )
    scatter_axis.set_title(f'Premium amount vs {feature}')

    # Avoid overlapping labels, then render
    plt.tight_layout()
    plt.show()


In [11]:
def ouno(df):
    """
    Print a per-column overview of a DataFrame.

    The report shows the frame's shape followed by one row per column with
    its dtype, its number of distinct values (as returned by ``unique()``, so
    a missing value counts as one label) and its null count with percentage.
    Columns are listed from most to fewest nulls.

    Args:
        df: Pandas DataFrame.
    """
    rule = '+' * 100
    row_template = '{:<20} {:<25} {:<20} {:<25}'

    # Header
    print(rule)
    print('Shape of the dataframe is: ', df.shape)
    print(rule)
    print(row_template.format('Field Name', 'Object Type', 'Unique Label Count', 'Null Values (Percentage)'))
    print(rule)

    n_rows = len(df)
    # Column names ordered by descending number of nulls
    columns_by_nulls = df.isnull().sum().sort_values(ascending=False).index

    # One report row per column
    for column_name in columns_by_nulls:
        column = df[column_name]
        missing = column.isna().sum()
        missing_pct = (missing / n_rows) * 100
        print(row_template.format(
            column_name,
            str(column.dtype),
            len(column.unique()),
            f'{missing} ({missing_pct:.2f}%)',
        ))

    print(rule)

In [2]:
# Read the train and test CSV files (paths are relative to the working directory)
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [4]:
# Tag every row with its origin ('trn' / 'tst'), then stack both frames into one
train['src'], test['src'] = 'trn', 'tst'

df = pd.concat(objs=[train, test], ignore_index=True)

In [5]:
# Turn +inf / -inf into NaN, modifying df in place
df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

In [None]:
# Column overview for the train rows, then for the test rows
ouno(df.loc[df['src'] == "trn"])
ouno(df.loc[df['src'] == "tst"])

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Shape of the dataframe is:  (230130, 7)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Field Name           Object Type               Unique Label Count   Null Values (Percentage) 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
num_sold             float64                   4038                 8871 (3.85%)             
id                   int64                     230130               0 (0.00%)                
date                 object                    2557                 0 (0.00%)                
country              object                    6                    0 (0.00%)                
store                object                    3                    0 (0.00%)                
product              object                    5                    0 (0.00%)                