In [21]:
# 1. Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import sweetviz as sv

# 2. Data Manipulation
import numpy as np
import pandas as pd
import sklearn as sl
import ydata_profiling as yp


In [11]:
def ouno(df):
  """
  Print a per-column overview of a DataFrame.

  Prints the frame's shape, then one row per column showing its dtype, the
  number of distinct values, and the count and percentage of null values.
  Columns are listed from most to fewest nulls.

  Note: the distinct-value count is ``len(Series.unique())``, so a column that
  contains nulls has the null counted as one additional distinct value.

  Args:
      df: Pandas DataFrame.

  Returns:
      None. The report is written to standard output.
  """
  separator = '+' * 100
  row_format = '{:<20} {:<25} {:<20} {:<25}'

  print(separator)
  print('Shape of the dataframe is: ', df.shape)
  print(separator)
  print(row_format.format('Field Name', 'Object Type', 'Unique Label Count', 'Null Values (Percentage)'))
  print(separator)

  # Null count per column, computed once and sorted in descending order
  null_counts = df.isnull().sum().sort_values(ascending=False)
  total_rows = len(df)

  for col, null_values in null_counts.items():
    unique_labels_count = len(df[col].unique())
    object_type = str(df[col].dtype)  # Convert dtype to string
    # A frame with no rows has nothing to divide by; report 0% instead of nan
    null_percentage = (null_values / total_rows) * 100 if total_rows else 0.0
    # str(col) so non-string labels (e.g. tuples) accept the width specifier
    print(row_format.format(str(col), object_type, unique_labels_count, f'{null_values} ({null_percentage:.2f}%)'))

  print(separator)

In [4]:
# Read the raw train and test splits from CSV files in the working directory
train_df = pd.read_csv(filepath_or_buffer='train.csv')
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [5]:
# Preview the first rows of the training data.
# Goal: predict 'Premium Amount' from the other columns.
train_df.head(n=5)

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [6]:
# Preview the first rows of the test data
test_df.head(n=5)

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [7]:
# Summary statistics for the numeric training columns (default quartiles spelled out)
train_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount
count,1200000.0,1181295.0,1155051.0,1090328.0,1125924.0,835971.0,1199994.0,1062118.0,1199999.0,1200000.0
mean,599999.5,41.14556,32745.22,2.009934,25.61391,1.002689,9.569889,592.9244,5.018219,1102.545
std,346410.3,13.53995,32179.51,1.417338,12.20346,0.98284,5.776189,149.9819,2.594331,864.9989
min,0.0,18.0,1.0,0.0,2.012237,0.0,0.0,300.0,1.0,20.0
25%,299999.8,30.0,8001.0,1.0,15.91896,0.0,5.0,468.0,3.0,514.0
50%,599999.5,41.0,23911.0,2.0,24.57865,1.0,10.0,595.0,5.0,872.0
75%,899999.2,53.0,44634.0,3.0,34.52721,2.0,15.0,721.0,7.0,1509.0
max,1199999.0,64.0,149997.0,4.0,58.97591,9.0,19.0,849.0,9.0,4999.0


In [13]:
# Summary statistics for the numeric test columns (default quartiles spelled out)
test_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration
count,800000.0,787511.0,770140.0,726870.0,750551.0,557198.0,799997.0,708549.0,799998.0
mean,1600000.0,41.13644,32803.871471,2.009337,25.613036,1.004873,9.571891,592.904749,5.018949
std,230940.3,13.537829,32201.063749,1.415241,12.206882,0.982803,5.7722,150.116374,2.593759
min,1200000.0,18.0,2.0,0.0,1.646561,0.0,0.0,300.0,1.0
25%,1400000.0,30.0,8048.0,1.0,15.917353,0.0,5.0,468.0,3.0
50%,1600000.0,41.0,23981.0,2.0,24.580164,1.0,10.0,595.0,5.0
75%,1799999.0,53.0,44660.0,3.0,34.517766,2.0,15.0,721.0,7.0
max,1999999.0,64.0,149997.0,4.0,57.957351,9.0,19.0,849.0,9.0


In [12]:
# Per-column dtype, distinct-value and null overview of the training data
ouno(df=train_df)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Shape of the dataframe is:  (1200000, 21)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Field Name           Object Type               Unique Label Count   Null Values (Percentage) 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Previous Claims      float64                   11                   364029 (30.34%)          
Occupation           object                    4                    358075 (29.84%)          
Credit Score         float64                   551                  137882 (11.49%)          
Number of Dependents float64                   6                    109672 (9.14%)           
Customer Feedback    object                    4                    77824 (6.49%)            
Health Score         float64                   532658               74076 (6.17%)          

In [14]:
# Per-column dtype, distinct-value and null overview of the test data
ouno(df=test_df)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Shape of the dataframe is:  (800000, 20)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Field Name           Object Type               Unique Label Count   Null Values (Percentage) 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Previous Claims      float64                   11                   242802 (30.35%)          
Occupation           object                    4                    239125 (29.89%)          
Credit Score         float64                   551                  91451 (11.43%)           
Number of Dependents float64                   6                    73130 (9.14%)            
Customer Feedback    object                    4                    52276 (6.53%)            
Health Score         float64                   388703               49449 (6.18%)           

# From the above we can observe that the data is at the ID level.
# 'Occupation', 'Customer Feedback' and 'Marital Status' are categorical variables with null values.
# 'Previous Claims', 'Credit Score', 'Number of Dependents', 'Health Score', 'Annual Income' and 'Age' are ranked/numerical variables with null values.

In [18]:
# EDA with Sweetviz: analyze the training data, labelled 'Train' in the report,
# with 'Premium Amount' as the target feature
report = sv.analyze(source=[train_df, 'Train'], target_feat='Premium Amount')

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:00 -> (00:00 left)


In [19]:
# Write the Sweetviz report out as an HTML file
report.show_html(filepath='Report.html')

Report Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [20]:
!pip install -U ydata-profiling

Collecting ydata-profiling
  Downloading ydata_profiling-4.12.1-py2.py3-none-any.whl.metadata (20 kB)
Collecting visions<0.7.7,>=0.7.5 (from visions[type_image_path]<0.7.7,>=0.7.5->ydata-profiling)
  Downloading visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting phik<0.13,>=0.11.1 (from ydata-profiling)
  Downloading phik-0.12.4-cp312-cp312-win_amd64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting statsmodels<1,>=0.13.2 (from ydata-profiling)
  Downloading st


[notice] A new release of pip is available: 23.3.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
# Set up a ydata-profiling report for the training data
profile = yp.ProfileReport(df=train_df, title="Training Data Profiling Report")

In [25]:
# Write the profiling report to an HTML file
profile.to_file(output_file="insurance_train.html")

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'Function <code object pandas_auto_compute at 0x000002476468B4F0, file "c:\Users\ramma\AppData\Local\Programs\Python\Python312\Lib\site-packages\ydata_profiling\model\pandas\correlations_pandas.py", line 167>')
Summarize dataset: 100%|██████████| 112/112 [00:36<00:00,  3.05it/s, Completed]                                    
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.84s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.36s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 59.23it/s]


# It looks like the missing data needs to be handled (e.g. imputed) before going ahead with further EDA.
# The correlations are very weak or non-existent.