# wine.csv 데이터셋 EDA

* index : 구분자
* quality : 품질
* fixed acidity : 산도
* volatile acidity : 휘발성산
* citric acid : 시트르산
* residual sugar : 잔당( 발효 후 와인 속에 남아있는 당분 )
* chlorides : 염화물
* free sulfur dioxide : 독립 이산화황
* total sulfur dioxide : 총 이산화황
* density : 밀도
* pH : 수소 이온 농도
* sulphates : 황산염
* alcohol : 도수
* type : 종류

In [1]:
class Wine_Info:
    def __init__( self, index, quality, fixed_acidity, volatile_acidity, citric_acid,
                  residual_sugar, chlorides, free_sulfur_dioxide, total_sulfur_dioxide,
                  density, pH, sulphates, alcohol, wine_type ):
        self.__row = [ index, quality, fixed_acidity, volatile_acidity, citric_acid,
                       residual_sugar, chlorides, free_sulfur_dioxide, total_sulfur_dioxide,
                       density, pH, sulphates, alcohol, wine_type ]
           
    def get_column( self, index ):
        return self.__row[ index ]
    
    def __str__( self ):
        return self.__repr__()
    
    def __repr__( self ):
        string = ''
        string += f'{self.__row[ 0 ]:>4}{self.__row[ 1 ]:2}{self.__row[ 2 ]:5.2f}'
        string += f'{self.__row[ 3 ]:6.3f}{self.__row[ 4 ]:6.2f}{self.__row[ 5 ]:6.2f}'
        string += f'{self.__row[ 6 ]:6.3f}{self.__row[ 7 ]:6.1f}'
        string += f'{self.__row[ 8 ]:6.1f}{self.__row[ 9 ]:6.4f}{self.__row[ 10 ]:6.2f}'
        string += f'{self.__row[ 1 ]:6.2f}{self.__row[ 12 ]:6.1f} {self.__row[ 13 ]:5}'
        return string

In [2]:
import operator
import my_lib

class Wine_DataFrame:
    def __init__( self, dataset ):
        self.__dataset = dataset
        self.__wine_info_list = []
        self.__columns = [ 'index', 'quality', 'fixed acidity', 'volatile acidity', 'citric acid',
                           'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',
                           'density', 'pH', 'sulphates', 'alcohol', 'type' ]
        self.__column_sub = [ 1, 2, 10, 12, 13 ]
        self.__column_info = {}
        self.__count = 0
        self.__read_data()
    
    def __read_data( self ):
        with open( self.__dataset, 'r' ) as f:
            f.readline()
            record = f.readline()
            while record:
                record = record.split( ',' )

                wine_info = Wine_Info( int( record[ 0 ] ), 
                                       int( record[ 1 ] ),
                                       float( record[ 2 ] ),
                                       float( record[ 3 ] ),
                                       float( record[ 4 ] ),
                                       float( record[ 5 ] ),
                                       float( record[ 6 ] ),
                                       float( record[ 7 ] ),
                                       float( record[ 8 ] ),
                                       float( record[ 9 ] ),
                                       float( record[ 10 ] ),
                                       float( record[ 11 ] ),
                                       float( record[ 12 ] ),
                                       record[ 13 ] )
                
                self.__wine_info_list.append( wine_info )
                self.__count += 1 

                record = f.readline()
                
            wine_info = self.__wine_info_list[ 0 ]
            
            for i in range( len( self.__columns ) ):
                wine_info.get_column( i )
                self.__column_info[ self.__columns[ i ] ] = self.__count, type( wine_info.get_column( i ) ) 

    def __create_column_list( self, column, index ):
        for v in self.__wine_info_list:
            column.append( v.get_column( index ) )

    def get_count( self ):
        return self.__count
    
    def get_columns( self ):
        return self.__columns
    
    def get_column_sub( self ):
        return self.__column_sub
        
    def head( self, length = 5 ):
        for v in self.__columns:
            print( f'{v} ', end = '' )
        print()
        
        for i in range( length ):
            print( self.__wine_info_list[ i ] )
        print( f'\ntotal : {self.__count}개' )
        
    def tail( self, length = 5 ):
        for v in self.__columns:
            print( f'{v} ', end = '' )
        print()
        
        start = len( self.__wine_info_list ) - length 
        stop = len( self.__wine_info_list )
        for i in range( start, stop ):
            print( self.__wine_info_list[ i ] )
        print( f'\ntotal : {self.__count}개' )
        
    def info( self ):
        i = 0
        print( f'Data columns ( total {len( self.__columns )} columns )' )
        print( ' #  Column\t\tCount Dtype' )
        print( '--- ------\t\t----- -----' )
        for key, value in self.__column_info.items():
            print( f'{i:2} {key:<20}{value[ 0 ]:6} {value[ 1 ]}' )
            i += 1
        
    def describe( self, column, column_name, index, decimal_places = 6 ):
        self.__create_column_list( column, index )
        
        max_column = max( column )
        min_column = min( column )
        average, dispersion, standard_deviation = my_lib.average_standard_deviation( column )
        height_25, height_50, height_75 = my_lib.calculate_quartile( column )
        
        print( f'{column_name[ index ]} 기초통계' )
        print( f'count : {self.__count}' )
        print( f'mean  : {round( average, decimal_places )}' )
        print( f'std   : {round( standard_deviation, decimal_places )}' )
        print( f'min   : {round( min_column, decimal_places )}' )
        print( f'25%   : {round( height_25, decimal_places )}' )
        print( f'50%   : {round( height_50, decimal_places )}' )
        print( f'75%   : {round( height_75, decimal_places )}' )
        print( f'max   : {round( max_column, decimal_places )}' )
    
    def column_value_count( self, column, column_name, index, eigenvalues = 20 ):
        self.__create_column_list( column, index )
        
        column_value_count_list = my_lib.my_unique( column )
        column_value_count_dict = {}
        for v in column_value_count_list:
            column_value_count_dict[ v ] = column.count( v )

        if len( column_value_count_dict ) <= eigenvalues:
            sort_column_value_count_dict = sorted( column_value_count_dict.items(),
                                                   key = operator.itemgetter( 1 ),
                                                   reverse = True )

            print( f'{column_name[ index ]}'.center( 18 ), end = '' )
            print( f'\n\n{column_name[ index ]}\t빈도수( 비율 )' )
            for key, value in sort_column_value_count_dict:
                print( f'{key}\t{value:5} ( {round( value / self.__count * 100, 2 ):5.2f}% )' )
                
    def corr( self ):
        columns_list = []
        
        for i in range( 0, len( self.__columns ) - 1 ):
            column = []
            self.__create_column_list( column, i )
            columns_list.append( column )
        
        corr_table = []
        for i in range( len( columns_list ) ):
            corr_column = []
            for j in range( len( columns_list ) ):
                corr_column.append( my_lib.my_corr( columns_list[ i ], columns_list[ j ] ) )
            corr_table.append( corr_column )
            
        print( f'{"변수간 상관계수".center( 80 )}\n' )
        
        for i in range( len( self.__columns ) - 1 ):
            print( f'{self.__columns[ i ]}', end = '|' )
        print()
        
        for i in range( len( corr_table ) ):
            for j in range( len( corr_table[ i ] ) ):
                print( f'{corr_table[ i ][ j ]:6.2f}', end = '' )
            print()

In [3]:
df = Wine_DataFrame( '../data/wine.csv' )

In [4]:
df.head()

index quality fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol type 
   0 5 5.60 0.695  0.06  6.80 0.042   9.0  84.00.9943  3.44  5.00  10.2 white

   1 5 8.80 0.610  0.14  2.40 0.067  10.0  42.00.9969  3.19  5.00   9.5 red
 
   2 5 7.90 0.210  0.39  2.00 0.057  21.0 138.00.9918  3.05  5.00  10.9 white

   3 6 7.00 0.210  0.31  6.00 0.046  29.0 108.00.9939  3.26  6.00  10.8 white

   4 6 7.80 0.400  0.26  9.50 0.059  32.0 178.00.9955  3.04  6.00  10.9 white


total : 5497개


In [5]:
df.tail()

index quality fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol type 
5492 5 7.70 0.150  0.29  1.30 0.029  10.0  64.00.9932  3.35  5.00  10.1 white

5493 6 6.30 0.180  0.36  1.20 0.034  26.0 111.00.9907  3.16  6.00  11.0 white

5494 7 7.80 0.150  0.34  1.10 0.035  31.0  93.00.9910  3.07  7.00  11.3 white

5495 5 6.60 0.410  0.31  1.60 0.042  18.0 101.00.9919  3.13  5.00  10.5 white

5496 6 7.00 0.350  0.17  1.10 0.049   7.0 119.00.9930  3.13  6.00   9.7 white


total : 5497개


In [6]:
df.info()

Data columns ( total 14 columns )
 #  Column		Count Dtype
--- ------		----- -----
 0 index                 5497 <class 'int'>
 1 quality               5497 <class 'int'>
 2 fixed acidity         5497 <class 'float'>
 3 volatile acidity      5497 <class 'float'>
 4 citric acid           5497 <class 'float'>
 5 residual sugar        5497 <class 'float'>
 6 chlorides             5497 <class 'float'>
 7 free sulfur dioxide   5497 <class 'float'>
 8 total sulfur dioxide  5497 <class 'float'>
 9 density               5497 <class 'float'>
10 pH                    5497 <class 'float'>
11 sulphates             5497 <class 'float'>
12 alcohol               5497 <class 'float'>
13 type                  5497 <class 'str'>


In [7]:
columns = df.get_columns()
for index, column in enumerate( columns ):
    if index != 0 and index != len( columns ) - 1 :
        column_list = []
        df.describe( column_list, columns, index )
        print()

quality 기초통계
count : 5497
mean  : 5.818992
std   : 0.870232
min   : 3
25%   : 5
50%   : 6
75%   : 6
max   : 9

fixed acidity 기초통계
count : 5497
mean  : 7.210115
std   : 1.287462
min   : 3.8
25%   : 6.4
50%   : 7.0
75%   : 7.7
max   : 15.9

volatile acidity 기초통계
count : 5497
mean  : 0.338163
std   : 0.163209
min   : 0.08
25%   : 0.23
50%   : 0.29
75%   : 0.4
max   : 1.58

citric acid 기초통계
count : 5497
mean  : 0.318543
std   : 0.145091
min   : 0.0
25%   : 0.25
50%   : 0.31
75%   : 0.39
max   : 1.66

residual sugar 기초통계
count : 5497
mean  : 5.438075
std   : 4.756243
min   : 0.6
25%   : 1.8
50%   : 3.0
75%   : 8.1
max   : 65.8

chlorides 기초통계
count : 5497
mean  : 0.055808
std   : 0.03465
min   : 0.009
25%   : 0.038
50%   : 0.047
75%   : 0.064
max   : 0.61

free sulfur dioxide 기초통계
count : 5497
mean  : 30.417682
std   : 17.672274
min   : 1.0
25%   : 17.0
50%   : 29.0
75%   : 41.0
max   : 289.0

total sulfur dioxide 기초통계
count : 5497
mean  : 115.566491
std   : 56.283103
min   : 6.0
25%   : 78

In [8]:
print( f'주요 변수에 대한 고유값 개수 및 비율\n' )
for index in df.get_column_sub():
    column_list = []
    df.column_value_count( column_list, columns, index )

주요 변수에 대한 고유값 개수 및 비율

     quality      

quality	빈도수( 비율 )
6	 2416 ( 43.95% )
5	 1788 ( 32.53% )
7	  924 ( 16.81% )
4	  186 (  3.38% )
8	  152 (  2.77% )
3	   26 (  0.47% )
9	    5 (  0.09% )
       type       

type	빈도수( 비율 )
white
	 4159 ( 75.66% )
red
	 1338 ( 24.34% )


In [9]:
columns = df.get_columns()
print( f'전체 변수에 대한 고유값 개수 및 비율( index 변수 제외 )\n' )
for index, column in enumerate( columns ):
    if index != 0 :
        column_list = []
        df.column_value_count( column_list, columns, index ) #, eigenvalues = df.get_count() )

전체 변수에 대한 고유값 개수 및 비율( index 변수 제외 )

     quality      

quality	빈도수( 비율 )
6	 2416 ( 43.95% )
5	 1788 ( 32.53% )
7	  924 ( 16.81% )
4	  186 (  3.38% )
8	  152 (  2.77% )
3	   26 (  0.47% )
9	    5 (  0.09% )
       type       

type	빈도수( 비율 )
white
	 4159 ( 75.66% )
red
	 1338 ( 24.34% )


In [10]:
df.corr()

                                    변수간 상관계수                                    

index|quality|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|pH|sulphates|alcohol|
  1.00 -0.01 -0.01 -0.01 -0.02  0.00 -0.02  0.01  0.02 -0.01 -0.02 -0.01 -0.00
 -0.01  1.00 -0.08 -0.26  0.08 -0.03 -0.20  0.06 -0.04 -0.30  0.02  0.04  0.44
 -0.01 -0.08  1.00  0.21  0.33 -0.11  0.30 -0.28 -0.32  0.45 -0.25  0.30 -0.10
 -0.01 -0.26  0.21  1.00 -0.38 -0.19  0.39 -0.35 -0.42  0.27  0.26  0.23 -0.03
 -0.02  0.08  0.33 -0.38  1.00  0.14  0.03  0.13  0.19  0.09 -0.32  0.05 -0.01
  0.00 -0.03 -0.11 -0.19  0.14  1.00 -0.13  0.40  0.49  0.56 -0.26 -0.18 -0.36
 -0.02 -0.20  0.30  0.39  0.03 -0.13  1.00 -0.19 -0.28  0.36  0.05  0.40 -0.26
  0.01  0.06 -0.28 -0.35  0.13  0.40 -0.19  1.00  0.72  0.03 -0.15 -0.19 -0.18
  0.02 -0.04 -0.32 -0.42  0.19  0.49 -0.28  0.72  1.00  0.04 -0.23 -0.27 -0.27
 -0.01 -0.30  0.45  0.27  0.09  0.56  0.36  0.03  0.04

# bicycle.csv 데이터셋 EDA

* date_time : 일별 날짜
* wind_direction : 풍향 (degree)
* sky_condition : 하늘 상태 (하단 설명 참조)
* precipitation_form : 강수 형태 (하단 설명 참조)
* wind_speed : 풍속 (m/s)
* humidity : 습도 (%)
* low_temp : 최저 기온 ( `C)
* high_temp : 최고 기온 ( `C)
* Precipitation_Probability : 강수 확률 (%)
* number_of_rentals : 따릉이 대여량

In [27]:
class Bicycle_Info:
    def __init__( self, date_time, wind_direction, sky_condition, precipitation_form, wind_speed,
                  humidity, low_temp, high_temp, Precipitation_Probability, number_of_rentals ):
        self.__row = [ date_time, wind_direction, sky_condition, precipitation_form, wind_speed,
                  humidity, low_temp, high_temp, Precipitation_Probability, number_of_rentals ]
           
    def get_column( self, index ):
        return self.__row[ index ]
    
    def __str__( self ):
        return self.__repr__()
    
    def __repr__( self ):
        string = ''
        string += f'{self.__row[ 0 ]:>4}{self.__row[ 1 ]:8.3f}{self.__row[ 2 ]:6.3f}'
        string += f'{self.__row[ 3 ]:6.3f}{self.__row[ 4 ]:6.3f}{self.__row[ 5 ]:8.2f}'
        string += f'{self.__row[ 6 ]:7.3f}{self.__row[ 7 ]:7.3f}'
        string += f'{self.__row[ 8 ]:7.3f}{self.__row[ 9 ]:6}'

        return string

In [95]:
import operator
import my_lib

class Bicycle_DataFrame:
    def __init__( self, dataset ):
        self.__dataset = dataset
        self.__bicycle_info_list = []
        self.__columns = [ 'date_time', 'wind_direction', 'sky_condition', 'precipitation_form', 
                           'wind_speed', 'humidity', 'low_temp', 'high_temp', 
                           'Precipitation_Probability', 'number_of_rentals' ]
        self.__column_info = {}
        self.__count = 0
        self.__read_data()
    
    def __read_data( self ):
        with open( self.__dataset, 'r' ) as f:
            f.readline()
            record = f.readline()
            while record:
                record = record.split( ',' )

                bicycle_info = Bicycle_Info( record[ 0 ], 
                                       float( record[ 1 ] ),
                                       float( record[ 2 ] ),
                                       float( record[ 3 ] ),
                                       float( record[ 4 ] ),
                                       float( record[ 5 ] ),
                                       float( record[ 6 ] ),
                                       float( record[ 7 ] ),
                                       float( record[ 8 ] ),
                                       int( record[ 9 ] ) )
                
                self.__bicycle_info_list.append( bicycle_info )
                self.__count += 1 

                record = f.readline()
                
            wine_info = self.__bicycle_info_list[ 0 ]
            
            for i in range( len( self.__columns ) ):
                wine_info.get_column( i )
                self.__column_info[ self.__columns[ i ] ] = self.__count, type( wine_info.get_column( i ) ) 

    def __create_column_list( self, column, index ):
        for v in self.__bicycle_info_list:
            column.append( v.get_column( index ) )

    def get_count( self ):
        return self.__count
    
    def get_column( self, column, index ):
        return self.__create_column_list( column, index )
    
    def get_columns( self ):
        return self.__columns
       
    def head( self, length = 5 ):
        for v in self.__columns:
            print( f'{v} ', end = '' )
        print()
        
        for i in range( length ):
            print( self.__bicycle_info_list[ i ] )
        print( f'\ntotal : {self.__count}개' )
        
    def tail( self, length = 5 ):
        for v in self.__columns:
            print( f'{v} ', end = '' )
        print()
        
        start = len( self.__bicycle_info_list ) - length 
        stop = len( self.__bicycle_info_list )
        for i in range( start, stop ):
            print( self.__bicycle_info_list[ i ] )
        print( f'\ntotal : {self.__count}개' )
        
    def info( self ):
        i = 0
        print( f'Data columns ( total {len( self.__columns )} columns )' )
        print( ' #  Column\t\tCount Dtype' )
        print( '--- ------\t\t----- -----' )
        for key, value in self.__column_info.items():
            print( f'{i:2} {key:<20}{value[ 0 ]:6} {value[ 1 ]}' )
            i += 1
        
    def describe( self, column, column_name, index, decimal_places = 6 ):
        self.__create_column_list( column, index )
        
        max_column = max( column )
        min_column = min( column )
        average, dispersion, standard_deviation = my_lib.average_standard_deviation( column )
        height_25, height_50, height_75 = my_lib.calculate_quartile( column )
        
        print( f'{column_name[ index ]} 기초통계' )
        print( f'count : {self.__count}' )
        print( f'mean  : {round( average, decimal_places )}' )
        print( f'std   : {round( standard_deviation, decimal_places )}' )
        print( f'min   : {round( min_column, decimal_places )}' )
        print( f'25%   : {round( height_25, decimal_places )}' )
        print( f'50%   : {round( height_50, decimal_places )}' )
        print( f'75%   : {round( height_75, decimal_places )}' )
        print( f'max   : {round( max_column, decimal_places )}' )
    
    def column_value_count( self, column, column_name, index, eigenvalues = 20 ):
        self.__create_column_list( column, index )
        
        column_value_count_list = my_lib.my_unique( column )
        column_value_count_dict = {}
        for v in column_value_count_list:
            column_value_count_dict[ v ] = column.count( v )

        if len( column_value_count_dict ) <= eigenvalues:
            sort_column_value_count_dict = sorted( column_value_count_dict.items(),
                                                   key = operator.itemgetter( 1 ),
                                                   reverse = True )

            print( f'{column_name[ index ]}'.center( 18 ), end = '' )
            print( f'\n\n{column_name[ index ]}\t빈도수( 비율 )' )
            for key, value in sort_column_value_count_dict:
                print( f'{key}\t{value:5} ( {round( value / self.__count * 100, 2 ):5.2f}% )' )
                
    def max_number_of_rentals( self, length = 273 ):
        datetime_list = []
        rental_list = []
        self.__create_column_list( datetime_list, 0 )
        self.__create_column_list( rental_list, 9 )
        
        number_of_rentals_dict = {}
        for i in range( len( datetime_list ) ):
            number_of_rentals_dict[ datetime_list[ i ] ] = rental_list[ i ]
            
        max_number_of_rentals_dict = sorted( number_of_rentals_dict.items(), 
                                             key = operator.itemgetter( 1 ),
                                             reverse = True )
        
        count = 0
        for key, value in max_number_of_rentals_dict:
            print( f'{key:<16}{value:8}' )
            count += 1
            if count == length:
                break
        
    def corr( self ):
        columns_list = []
        
        for i in range( 1, len( self.__columns ) ):
            column = []
            self.__create_column_list( column, i )
            columns_list.append( column )
        
        corr_table = []
        for i in range( len( columns_list ) ):
            corr_column = []
            for j in range( len( columns_list ) ):
                corr_column.append( my_lib.my_corr( columns_list[ i ], columns_list[ j ] ) )
            corr_table.append( corr_column )
            
        print( f'{"변수간 상관계수".center( 80 )}\n' )
        
        for i in range( len( self.__columns ) - 1 ):
            print( f'{self.__columns[ i ]}', end = '|' )
        print()
        
        for i in range( len( corr_table ) ):
            for j in range( len( corr_table[ i ] ) ):
                print( f'{corr_table[ i ][ j ]:6.2f}', end = '' )
            print()

In [96]:
df = Bicycle_DataFrame( '../data/bicycle.csv' )

In [97]:
df.head()

date_time wind_direction sky_condition precipitation_form wind_speed humidity low_temp high_temp Precipitation_Probability number_of_rentals 
2018-04-01 207.500 4.000 0.000 3.050   75.00 12.600 21.000 30.000 22994
2018-04-02 208.317 2.950 0.000 3.278   69.83 12.812 19.000 19.500 28139
2018-04-03 213.516 2.911 0.000 2.690   74.88 10.312 15.316 19.113 26817
2018-04-04 143.836 3.692 0.425 3.138   71.85  8.312 12.368 43.493 26034
2018-04-05  95.905 4.000 0.723 3.186   73.78  5.875 10.421 63.378  2833

total : 273개


In [98]:
df.tail()

date_time wind_direction sky_condition precipitation_form wind_speed humidity low_temp high_temp Precipitation_Probability number_of_rentals 
2020-06-26 228.662 3.980 0.223 2.271   78.38 20.500 27.526 36.486 96150
2020-06-27 207.770 2.865 0.081 1.794   78.41 20.812 28.842 21.081107001
2020-06-28 282.568 1.730 0.000 1.820   72.74 21.000 29.053  7.297 98568
2020-06-29 137.027 2.257 0.088 2.043   70.47 19.625 26.000 15.541 70053
2020-06-30 120.797 3.622 0.432 5.574   77.06 19.125 26.053 41.284 38086

total : 273개


In [99]:
df.info()

Data columns ( total 10 columns )
 #  Column		Count Dtype
--- ------		----- -----
 0 date_time              273 <class 'str'>
 1 wind_direction         273 <class 'float'>
 2 sky_condition          273 <class 'float'>
 3 precipitation_form     273 <class 'float'>
 4 wind_speed             273 <class 'float'>
 5 humidity               273 <class 'float'>
 6 low_temp               273 <class 'float'>
 7 high_temp              273 <class 'float'>
 8 Precipitation_Probability   273 <class 'float'>
 9 number_of_rentals      273 <class 'int'>


In [100]:
# 전체 날짜 목록 생성
datetimes = []
df.get_column( datetimes, 0 )
datetime_unique = my_lib.my_unique( datetimes )

print( '데이터 수집 일 : ' )
count = 0
for value in datetime_unique:
    print( f'{value:<12}', end = '' )
    count += 1
    if count >= 5:
        print()
        count = 0   
print( f'\n\n데이터 수집 일수 : {len( datetime_unique )}일' )

데이터 수집 일 : 
2018-04-01  2018-04-02  2018-04-03  2018-04-04  2018-04-05  
2018-04-06  2018-04-07  2018-04-08  2018-04-09  2018-04-10  
2018-04-11  2018-04-12  2018-04-13  2018-04-14  2018-04-15  
2018-04-16  2018-04-17  2018-04-18  2018-04-19  2018-04-20  
2018-04-21  2018-04-22  2018-04-23  2018-04-24  2018-04-25  
2018-04-26  2018-04-27  2018-04-28  2018-04-29  2018-04-30  
2018-05-01  2018-05-02  2018-05-03  2018-05-04  2018-05-05  
2018-05-06  2018-05-07  2018-05-08  2018-05-09  2018-05-10  
2018-05-11  2018-05-12  2018-05-13  2018-05-14  2018-05-15  
2018-05-16  2018-05-17  2018-05-18  2018-05-19  2018-05-20  
2018-05-21  2018-05-22  2018-05-23  2018-05-24  2018-05-25  
2018-05-26  2018-05-27  2018-05-28  2018-05-29  2018-05-30  
2018-05-31  2018-06-01  2018-06-02  2018-06-03  2018-06-04  
2018-06-05  2018-06-06  2018-06-07  2018-06-08  2018-06-09  
2018-06-10  2018-06-11  2018-06-12  2018-06-13  2018-06-14  
2018-06-15  2018-06-16  2018-06-17  2018-06-18  2018-06-19  
2018-06-20  

In [101]:
for index, column in enumerate( columns ):
    if index != 0:
        column_list = []
        df.describe( column_list, columns, index )
        print()

wind_direction 기초통계
count : 273
mean  : 202.750967
std   : 56.555366
min   : 57.047
25%   : 171.541
50%   : 210.108
75%   : 240.047
max   : 321.622

sky_condition 기초통계
count : 273
mean  : 2.288256
std   : 0.960012
min   : 1.0
25%   : 1.405
50%   : 2.189
75%   : 3.014
max   : 4.0

precipitation_form 기초통계
count : 273
mean  : 0.100963
std   : 0.20282
min   : 0.0
25%   : 0.0
50%   : 0.0
75%   : 0.088
max   : 1.0

wind_speed 기초통계
count : 273
mean  : 2.480963
std   : 0.882776
min   : 0.753
25%   : 1.828
50%   : 2.412
75%   : 2.932
max   : 5.607

humidity 기초통계
count : 273
mean  : 56.745491
std   : 12.328626
min   : 24.831
25%   : 47.432
50%   : 56.115
75%   : 66.453
max   : 88.885

low_temp 기초통계
count : 273
mean  : 13.795249
std   : 5.098347
min   : 1.938
25%   : 10.062
50%   : 14.438
75%   : 18.0
max   : 22.312

high_temp 기초통계
count : 273
mean  : 23.384733
std   : 5.195064
min   : 9.895
25%   : 19.895
50%   : 24.167
75%   : 27.579
max   : 33.421

Precipitation_Probability 기초통계
count : 273
me

In [103]:
print( '대여량이 많은 날짜순 정보\n\n날짜\t\t  대여량' )
df.max_number_of_rentals( length = 10 )

대여량이 많은 날짜순 정보

날짜		  대여량
2020-06-18        110377
2020-06-12        109210
2020-06-19        109124
2020-06-15        108320
2020-06-17        107723
2020-06-05        107343
2020-06-27        107001
2020-06-16        106519
2020-05-30        105172
2020-06-06        105033


In [65]:
columns = df.get_columns()
print( f'전체 변수에 대한 고유값 개수 및 비율\n' )
for index, column in enumerate( columns ):
    if index != 0 :
        column_list = []
        df.column_value_count( column_list, columns, index, eigenvalues = df.get_count() )
    print()

전체 변수에 대한 고유값 개수 및 비율


  wind_direction  

wind_direction	빈도수( 비율 )
171.541	    2 (  0.73% )
218.574	    2 (  0.73% )
57.047	    1 (  0.37% )
57.149	    1 (  0.37% )
58.986	    1 (  0.37% )
68.824	    1 (  0.37% )
69.128	    1 (  0.37% )
71.831	    1 (  0.37% )
71.833	    1 (  0.37% )
73.811	    1 (  0.37% )
75.007	    1 (  0.37% )
76.797	    1 (  0.37% )
79.554	    1 (  0.37% )
82.703	    1 (  0.37% )
92.75	    1 (  0.37% )
93.851	    1 (  0.37% )
94.986	    1 (  0.37% )
95.905	    1 (  0.37% )
101.264	    1 (  0.37% )
105.838	    1 (  0.37% )
108.297	    1 (  0.37% )
108.298	    1 (  0.37% )
108.432	    1 (  0.37% )
111.088	    1 (  0.37% )
115.277	    1 (  0.37% )
116.818	    1 (  0.37% )
116.932	    1 (  0.37% )
119.872	    1 (  0.37% )
120.797	    1 (  0.37% )
122.0	    1 (  0.37% )
122.054	    1 (  0.37% )
124.77	    1 (  0.37% )
124.797	    1 (  0.37% )
125.514	    1 (  0.37% )
126.858	    1 (  0.37% )
127.453	    1 (  0.37% )
128.845	    1 (  0.37% )
130.142	    1 (  0.37% )
1

In [66]:
df.corr()

                                    변수간 상관계수                                    

date_time|wind_direction|sky_condition|precipitation_form|wind_speed|humidity|low_temp|high_temp|Precipitation_Probability|
  1.00 -0.39 -0.35  0.08 -0.22 -0.14 -0.03 -0.39  0.21
 -0.39  1.00  0.67  0.14  0.65  0.09 -0.15  0.91 -0.38
 -0.35  0.67  1.00  0.29  0.61  0.05 -0.14  0.91 -0.42
  0.08  0.14  0.29  1.00  0.04 -0.40 -0.44  0.24 -0.46
 -0.22  0.65  0.61  0.04  1.00  0.41  0.19  0.69 -0.15
 -0.14  0.09  0.05 -0.40  0.41  1.00  0.92  0.07  0.32
 -0.03 -0.15 -0.14 -0.44  0.19  0.92  1.00 -0.16  0.42
 -0.39  0.91  0.91  0.24  0.69  0.07 -0.16  1.00 -0.45
  0.21 -0.38 -0.42 -0.46 -0.15  0.32  0.42 -0.45  1.00
