## VISUALIZE DIGITS DISTRIBUTION

In [1]:
import pandas as pd
import plotly.express as px

import warnings
# Ignore every warning for the rest of the kernel session (no category or
# message filter is given, so the rule matches all warnings).
# NOTE(review): presumably done to keep cell outputs clean; a blanket filter
# also hides warnings that may point at real problems — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Read every column as text (dtype=str) so the drawn numbers keep their
# leading zeros, then derive the draw year ('Nam') from the draw date ('Ngay').
df = (
    pd.read_csv("data.csv", dtype=str)
    .assign(Nam=lambda draws: pd.DatetimeIndex(draws['Ngay']).year)
)
df

Unnamed: 0,Thu,Ngay,G8,G7,G6,G5,G4,G3,G2,G1,DB,Nam
0,2,2023-04-10,96,067,3039 1834 0256,2658,12040 61730 61467 95275 06567 26533 97288,43472 46275,85665,26196,331863,2023
1,7,2023-04-08,41,530,1460 7105 3006,4405,45400 39392 18603 08026 38131 08344 69763,23994 55504,27272,00696,744278,2023
2,2,2023-04-03,90,064,6426 7669 7549,1024,07989 15931 86157 15220 80132 42218 77636,18219 22169,28645,90215,551507,2023
3,7,2023-04-01,94,822,2009 3534 1391,9752,02914 85150 64414 92389 87819 88191 03267,40381 81358,58896,52858,819946,2023
4,2,2023-03-27,17,533,1806 0536 9426,3721,26461 50275 84709 09153 63364 09323 91021,94553 70308,65629,71984,196165,2023
...,...,...,...,...,...,...,...,...,...,...,...,...
195,7,2021-02-13,33,789,8203 8964 1197,5336,16097 20404 53589 63340 01982 84821 97736,30527 26707,62044,72420,346015,2021
196,2,2021-02-08,73,711,6308 3409 7873,5700,75778 46773 41235 44775 22681 59093 92033,88973 93760,67790,94856,502117,2021
197,7,2021-02-06,14,519,4105 9528 7678,8254,59392 08342 78720 78075 14692 73513 65585,09862 83242,55007,90406,530464,2021
198,2,2021-02-01,80,115,6820 9332 9013,5348,19754 83442 81350 36206 86234 77258 66479,59402 81165,33303,08207,722174,2021


#### First look at the data

In [3]:
# Summarise only the object-dtype (text) columns: count, number of unique
# values, most frequent value and its frequency.
df.describe(include='object')

Unnamed: 0,Thu,Ngay,G8,G7,G6,G5,G4,G3,G2,G1,DB
count,200,200,200,200,200,200,200,200,200,200,200
unique,2,200,85,174,200,198,200,200,199,200,200
top,2,2023-04-10,52,494,3039 1834 0256,6185,12040 61730 61467 95275 06567 26533 97288,43472 46275,7226,26196,331863
freq,100,1,7,3,1,2,1,1,2,1,1


#### Automatic profiling with pandas-profiling (optional)

In [4]:
# from pandas_profiling import ProfileReport

# overall_report = ProfileReport(df, title="OVERALL REPORT")
# overall_report.to_file("overall_report.html")

#### Helper function to build a DataFrame of digits for visualization

In [5]:
def get_digit_df(df, GIAI):
    """Build a long-format frame with one row per drawn number and one column per digit.

    The prize column ``GIAI`` holds one or more whitespace-separated numbers
    per draw (e.g. ``'3039 1834 0256'``). Each number becomes its own row,
    carrying the draw's ``'Nam'`` (year) and ``'Thu'`` (day of week) values,
    and every character of the number is exposed as ``digit_1``, ``digit_2``, ...

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Nam'``, ``'Thu'`` and ``GIAI``; the
        ``GIAI`` column must hold strings. The frame is not modified.
    GIAI : str
        Name of the prize column to expand.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Nam'``, ``'Thu'``, ``GIAI`` (a single number per row) and
        ``digit_1`` ... ``digit_k`` (one-character strings), where ``k`` is the
        length of the first number. Rows are ordered turn by turn: the first
        number of every draw, then the second number of every draw, and so on.
        The index is a fresh 0..n-1 range. If ``df`` has no rows, or no
        numbers at all, the result is empty and has no ``digit_*`` columns.
    """
    prizes = df[['Nam', 'Thu', GIAI]]
    if prizes.empty:
        return prizes.reset_index(drop=True)

    # One column per turn; draws with fewer numbers are padded with missing values.
    numbers = prizes[GIAI].str.split(expand=True)

    # Stack the turns on top of each other. Using every split column (rather
    # than the count found in the first row) keeps numbers from draws that
    # happen to have more entries than the first one.
    per_turn = [
        prizes[['Nam', 'Thu']].assign(**{GIAI: numbers[turn].to_numpy()})
        for turn in numbers.columns
    ]
    digit_df = (
        pd.concat(per_turn, axis=0, ignore_index=True)
        .dropna(subset=[GIAI])  # discard the padding of shorter draws
        .reset_index(drop=True)
    )
    if digit_df.empty:
        return digit_df

    # Read the first number by position, not by label, so the function also
    # works on frames whose index does not start at 0 (e.g. filtered frames).
    num_of_digits = len(digit_df[GIAI].iloc[0])
    for position in range(num_of_digits):
        digit_df[f'digit_{position + 1}'] = digit_df[GIAI].str[position]

    return digit_df


In [6]:
# Prize column to analyse. Change this value to any other prize column of df
# (G7, G6, G5, G4, G3, G2, G1 or DB) to view that prize's digit distribution.
GIAI = 'G8'
# One row per drawn number of the selected prize, one column per digit.
df_digit = get_digit_df(df, GIAI)
df_digit

Unnamed: 0,Nam,Thu,G8,digit_1,digit_2
0,2023,2,96,9,6
1,2023,7,41,4,1
2,2023,2,90,9,0
3,2023,7,94,9,4
4,2023,2,17,1,7
...,...,...,...,...,...
195,2021,7,33,3,3
196,2021,2,73,7,3
197,2021,7,14,1,4
198,2021,2,80,8,0


#### Summary statistics for each digit column

In [7]:
# Summarise the text columns only; the digit columns are still strings at this
# point, so "top"/"freq" give the most frequent digit per position.
df_digit.describe(include='object')

Unnamed: 0,Thu,G8,digit_1,digit_2
count,200,200,200,200
unique,2,85,10,10
top,2,52,5,2
freq,100,7,28,30


#### Visualization with histograms for better understanding of distributions

In [8]:
# Number of digit columns = length of one drawn number of the selected prize.
num_of_digits = len(df_digit[GIAI][0].split()[0])

# Convert the one-character digit strings to nullable 8-bit integers before
# plotting, using a single column -> dtype mapping.
digit_dtypes = {f'digit_{pos}': 'Int8' for pos in range(1, num_of_digits + 1)}
df_digit = df_digit.astype(digit_dtypes)

Distribution with respect to Day of Week

In [9]:
# One histogram per digit position, coloured by day of week ('Thu').
# histnorm='percent' normalises each colour group separately, so groups with
# different numbers of draws can be compared.
for i in range(num_of_digits):
    digit_col = f'digit_{i+1}'
    fig = px.histogram(data_frame=df_digit, x=digit_col, color='Thu',
                       nbins=20,
                       histnorm='percent',
                       opacity=0.8,
                       # A title and readable labels let each figure stand on
                       # its own when the notebook is skimmed.
                       title=f'{GIAI}: distribution of digit {i+1} by day of week',
                       labels={digit_col: f'Digit {i+1}', 'Thu': 'Day of week'},
                       )
    fig.show()

Distribution with respect to Year

In [10]:
# One histogram per digit position, coloured by draw year ('Nam').
# histnorm='percent' normalises each colour group separately, so years with
# different numbers of draws can be compared.
for i in range(num_of_digits):
    digit_col = f'digit_{i+1}'
    fig = px.histogram(data_frame=df_digit, x=digit_col, color='Nam',
                       nbins=20,
                       histnorm='percent',
                       opacity=0.8,
                       # A title and readable labels let each figure stand on
                       # its own when the notebook is skimmed.
                       title=f'{GIAI}: distribution of digit {i+1} by year',
                       labels={digit_col: f'Digit {i+1}', 'Nam': 'Year'})
    fig.show()
