In [50]:
import urllib.request
import os.path
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError 

In [158]:
# --- Data source -------------------------------------------------------------
# The trailing slash matters: other cells build the file path as DB_PATH + DB_FILE.
DB_PATH = "data/"
DB_FILE = "ds-plus-final.db"
DB_URL = "https://code.s3.yandex.net/data-scientist/ds-plus-final.db"

# --- Expected schema: table name -> expected column names --------------------
DB_TABLE_DICT = {
    "contract": [
        "customerID",
        "BeginDate",
        "EndDate",
        "Type",
        "PaperlessBilling",
        "PaymentMethod",
        "MonthlyCharges",
        "TotalCharges",
    ],
    "personal": [
        "customerID",
        "gender",
        "SeniorCitizen",
        "Partner",
        "Dependents",
    ],
    "internet": [
        "customerID",
        "InternetService",
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
    ],
    "phone": [
        "customerID",
        "MultipleLines",
    ],
}
# Derived from the schema (insertion order is preserved) so the list of table
# names can never drift away from the dictionary keys.
DB_TABLE_NAMES = list(DB_TABLE_DICT)

# --- Modelling constants -----------------------------------------------------
RANDOM_STATE = 110825
# NOTE(review): presumably the minimum acceptable value of the quality metric - confirm.
TARGET_METRIC = .85
# NOTE(review): presumably the `contract` column the target is built from - confirm.
TARGET = "EndDate"

['contract', 'personal', 'internet', 'phone']

In [54]:
# Download the database file once; later runs reuse the local copy.
db_file_path = DB_PATH + DB_FILE

# A zero-byte file cannot be a usable database, so it is treated the same way
# as a missing file and triggers a fresh download.
if not (os.path.isfile(db_file_path) and os.path.getsize(db_file_path) > 0):
    partial_path = db_file_path + ".part"
    try:
        # urlretrieve does not create missing directories.
        os.makedirs(os.path.dirname(db_file_path) or ".", exist_ok=True)
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer is never mistaken for a finished download.
        urllib.request.urlretrieve(DB_URL, partial_path)
        os.replace(partial_path, db_file_path)
        print(f"–ë–∞–∑–∞ –¥–∞–Ω–Ω—ã—Ö {DB_FILE} –∑–∞–≥—Ä—É–∂–µ–Ω–∞")
    except Exception as err:
        # Best effort, as before: report the problem instead of failing the cell,
        # but do not leave a partial file behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –ø–æ–ø—ã—Ç–∫–µ –∑–∞–≥—Ä—É–∑–∫–∏ –±–∞–∑—ã –¥–∞–Ω–Ω—ã—Ö {err}")
else:
    print(f"–ë–∞–∑–∞ –¥–∞–Ω–Ω—ã—Ö {DB_FILE} —É–∂–µ –∑–∞–≥—Ä—É–∂–µ–Ω–∞")

–ë–∞–∑–∞ –¥–∞–Ω–Ω—ã—Ö ds-plus-final.db —É–∂–µ –∑–∞–≥—Ä—É–∂–µ–Ω–∞


In [56]:
# SQLAlchemy engine for the local SQLite file located at DB_PATH + DB_FILE.
engine = create_engine(f'sqlite:///{DB_PATH+DB_FILE}', echo=False)

In [318]:
class TableStructureValidator:
    """Compare the actual structure of database tables with an expected layout.

    On construction the validator inspects every table listed in ``tables_dict``
    (through ``PRAGMA table_info``) and stores, per table, the missing and extra
    columns, the declared column types and the per-column NULL counts in
    ``self.results``. ``print_info`` prints that report.
    """

    def __init__(self, engine, tables_dict=None):
        """Store the inputs and run the validation immediately.

        Args:
            engine: object passed as ``con`` to ``pd.read_sql_query``.
            tables_dict: mapping ``table name -> list of expected column names``.

        Raises:
            ValueError: if ``tables_dict`` is not given or is empty.
        """
        if not tables_dict:
            raise ValueError('–°–ª–æ–≤–∞—Ä—å tables_dict –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å —É–∫–∞–∑–∞–Ω –∏ –Ω–µ –º–æ–∂–µ—Ç –±—ã—Ç—å –ø—É—Å—Ç—ã–º')
        self.engine = engine
        self.tables_dict = tables_dict
        self.results = {}
        self._validate_tables()

    def _get_table_info(self, table_name):
        """Return the ``PRAGMA table_info`` result for ``table_name`` as a DataFrame.

        Returns ``None`` (after printing the error) when the query fails.
        """
        try:
            query = text(f"PRAGMA table_info({table_name})")
            return pd.read_sql_query(query, con=self.engine)
        except SQLAlchemyError as e:
            print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –ø–æ–ª—É—á–µ–Ω–∏–∏ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏ –æ —Ç–∞–±–ª–∏—Ü–µ {table_name}: {e}")
            return None

    def _check_null_values(self, table_name, columns=None):
        """Count NULL values per column of ``table_name``.

        Args:
            table_name: table to query.
            columns: columns to check; defaults to the expected columns of the
                table from ``self.tables_dict``.

        Returns:
            dict ``column -> NULL count`` containing only columns with at least
            one NULL. A column whose query fails is reported and skipped, and
            the remaining columns are still checked.
        """
        if columns is None:
            columns = self.tables_dict.get(table_name, [])
        null_counts = {}
        for column in columns:
            # One try per column: a single failing query must not hide the
            # results of the other columns.
            try:
                query = text(f"SELECT COUNT(*) FROM {table_name} WHERE {column} IS NULL")
                null_count = int(pd.read_sql_query(query, self.engine).iloc[0, 0])
            except Exception as e:
                print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –ø—Ä–æ–≤–µ—Ä–∫–µ NULL-–∑–Ω–∞—á–µ–Ω–∏–π –≤ {table_name}.{column}: {e}")
                continue
            if null_count > 0:
                null_counts[column] = null_count
        return null_counts

    def _validate_tables(self):
        """Fill ``self.results`` with one report entry per expected table."""
        for table_name, expected_columns in self.tables_dict.items():
            actual_info = self._get_table_info(table_name)
            # _get_table_info returns None when the query failed; in that case the
            # table is reported with no actual columns instead of raising TypeError.
            has_info = (
                actual_info is not None
                and {'name', 'type'} <= set(actual_info.columns)
            )
            if has_info:
                actual_columns = actual_info['name'].tolist()
                column_types = dict(zip(actual_info['name'], actual_info['type']))
            else:
                actual_columns = []
                column_types = {}

            expected_set = set(expected_columns)
            actual_set = set(actual_columns)
            # List comprehensions keep a deterministic order (list(set) does not).
            missing_in_db = [col for col in expected_columns if col not in actual_set]
            extra_in_db = [col for col in actual_columns if col not in expected_set]
            # Only query columns that really exist; absent ones are already
            # reported as missing and would only produce query errors.
            present_columns = [col for col in expected_columns if col in actual_set]
            null_values = self._check_null_values(table_name, columns=present_columns)

            self.results[table_name] = {
                'status': 'ok' if not missing_in_db and not extra_in_db else 'mismatch',
                'actual_columns': actual_columns,
                'missing_columns': missing_in_db,
                'extra_columns': extra_in_db,
                'column_types': column_types,
                'null_values': null_values,
                'null_values_count': sum(null_values.values())
            }

    def print_info(self):
        """Print the validation report stored in ``self.results`` table by table."""
        for table_name, data in self.results.items():
            print(f"\n–¢–∞–±–ª–∏—Ü–∞: {table_name}")
            print("-" * 50)

            if data['status'] == 'ok':
                print("‚úÖ –°—Ç—Ä—É–∫—Ç—É—Ä–∞ –ø–æ–ª–Ω–æ—Å—Ç—å—é —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç –æ–∂–∏–¥–∞–µ–º–æ–π")
            else:
                if data['missing_columns']:
                    print(f"‚ùå –û—Ç—Å—É—Ç—Å—Ç–≤—É—é—â–∏–µ —Å—Ç–æ–ª–±—Ü—ã: {', '.join(data['missing_columns'])}")
                if data['extra_columns']:
                    print(f"‚ùå –õ–∏—à–Ω–∏–µ —Å—Ç–æ–ª–±—Ü—ã –≤ –ë–î: {', '.join(data['extra_columns'])}")

            if data['null_values']:
                print("\n–ü—Ä–æ–ø—É—â–µ–Ω–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è (NULL):")
                for col, count in data['null_values'].items():
                    print(f"  - {col}: {count} –ø—Ä–æ–ø—É—Å–∫–æ–≤")

            print("\n–ü—Ä–æ–≤–µ—Ä–∫–∞ —Å—Ç–æ–ª–±—Ü–æ–≤ –∏ —Ç–∏–ø–æ–≤:")
            for col in data['actual_columns']:
                # Only columns present in the table are listed here, so a column
                # can be flagged only for being unexpected (extra).
                status = "üî¥" if col in data['missing_columns'] or col in data['extra_columns'] else "üü¢"
                col_type = data['column_types'].get(col)
                print(f"{status} {col}: {col_type}")

            print("-" * 50)

In [320]:
# Build the validator (the validation itself runs inside the constructor)
# and print the resulting report for every table.
validator = TableStructureValidator(engine=engine, tables_dict=DB_TABLE_DICT)
validator.print_info()


–¢–∞–±–ª–∏—Ü–∞: contract
--------------------------------------------------
‚úÖ –°—Ç—Ä—É–∫—Ç—É—Ä–∞ –ø–æ–ª–Ω–æ—Å—Ç—å—é —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç –æ–∂–∏–¥–∞–µ–º–æ–π

–ü—Ä–æ–≤–µ—Ä–∫–∞ —Å—Ç–æ–ª–±—Ü–æ–≤ –∏ —Ç–∏–ø–æ–≤:
üü¢ customerID: TEXT
üü¢ BeginDate: TEXT
üü¢ EndDate: TEXT
üü¢ Type: TEXT
üü¢ PaperlessBilling: TEXT
üü¢ PaymentMethod: TEXT
üü¢ MonthlyCharges: TEXT
üü¢ TotalCharges: TEXT
--------------------------------------------------

–¢–∞–±–ª–∏—Ü–∞: personal
--------------------------------------------------
‚úÖ –°—Ç—Ä—É–∫—Ç—É—Ä–∞ –ø–æ–ª–Ω–æ—Å—Ç—å—é —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç –æ–∂–∏–¥–∞–µ–º–æ–π

–ü—Ä–æ–≤–µ—Ä–∫–∞ —Å—Ç–æ–ª–±—Ü–æ–≤ –∏ —Ç–∏–ø–æ–≤:
üü¢ customerID: TEXT
üü¢ gender: TEXT
üü¢ SeniorCitizen: TEXT
üü¢ Partner: TEXT
üü¢ Dependents: TEXT
--------------------------------------------------

–¢–∞–±–ª–∏—Ü–∞: internet
--------------------------------------------------
‚úÖ –°—Ç—Ä—É–∫—Ç—É—Ä–∞ –ø–æ–ª–Ω–æ—Å—Ç—å—é —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç –æ–∂–∏–¥–∞–µ–º–æ–π

–ü—Ä–æ–≤–µ—Ä–∫–

In [178]:
# # —Ñ—É–Ω–∫—Ü–∏—è –¥–ª—è –ø–æ–ª—É—á–µ–Ω–∏—è —Å—Ç—Ä—É–∫—Ç—É—Ä—ã —Ç–∞–±–ª–∏—Ü –∏–∑ –ë–î
# def get_table_info(table_name, engine=engine):
#     query = text(f"PRAGMA table_info({table_name})")
#     print(f'–°—Ç–æ–ª–±—Ü—ã —Ç–∞–±–ª–∏—Ü—ã {table_name}:')
#     try:
#         print(pd.read_sql_query(query, con=engine)['name'], '\n')
#     except SQLAlchemyError as e:
#         print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –≤—ã–ø–æ–ª–Ω–µ–Ω–∏–∏ –∑–∞–ø—Ä–æ—Å–∞: {e}")

def get_table_info(table_name, engine=engine):
    """Print the column layout of ``table_name``.

    Runs ``PRAGMA table_info`` and prints one line per column with its name,
    declared type and a primary-key marker, framed by separator lines.
    A failed query is reported with a printed message instead of raising.

    Args:
        table_name: name of the table to describe.
        engine: object passed as ``con`` to ``pd.read_sql_query``.
    """
    separator = '-' * 40
    pragma_query = text(f"PRAGMA table_info({table_name})")
    print(f'\n–°—Ç—Ä—É–∫—Ç—É—Ä–∞ —Ç–∞–±–ª–∏—Ü—ã {table_name}:')
    print(separator)
    try:
        columns_frame = pd.read_sql_query(pragma_query, con=engine)
        for _, column in columns_frame.iterrows():
            # A non-zero `pk` value marks the column as part of the primary key.
            pk_label = '(PK)' if column['pk'] else 'not PK'
            print(f"{column['name']}: {column['type']} {pk_label}")
        print(separator, '\n')
    except SQLAlchemyError as exc:
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –≤—ã–ø–æ–ª–Ω–µ–Ω–∏–∏ –∑–∞–ø—Ä–æ—Å–∞: {exc}")

In [180]:
# Function that prints the number of rows stored in a table
def get_table_size(table_name, engine=engine):
    """Print the number of rows in ``table_name`` followed by a separator line.

    A failed query is reported with a printed message instead of raising.

    Args:
        table_name: name of the table to count.
        engine: object passed as ``con`` to ``pd.read_sql_query``.
    """
    count_query = text(f'SELECT COUNT(*) FROM {table_name}')
    try:
        count_frame = pd.read_sql_query(count_query, con=engine)
        # COUNT(*) yields a single cell: first row, first column.
        row_count = count_frame.iloc[0, 0]
        print(f'–°—Ç—Ä–æ–∫ –≤ —Ç–∞–±–ª–∏—Ü–µ {table_name}: {row_count}', '\n', '-' * 50)
    except SQLAlchemyError as exc:
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –≤—ã–ø–æ–ª–Ω–µ–Ω–∏–∏ –∑–∞–ø—Ä–æ—Å–∞: {exc}")

In [182]:
# Print the column layout and the row count of every expected table.
for table in DB_TABLE_DICT:
    get_table_info(table)
    get_table_size(table)


–°—Ç—Ä—É–∫—Ç—É—Ä–∞ —Ç–∞–±–ª–∏—Ü—ã contract:
----------------------------------------
customerID: TEXT not PK
BeginDate: TEXT not PK
EndDate: TEXT not PK
Type: TEXT not PK
PaperlessBilling: TEXT not PK
PaymentMethod: TEXT not PK
MonthlyCharges: TEXT not PK
TotalCharges: TEXT not PK
---------------------------------------- 

–°—Ç—Ä–æ–∫ –≤ —Ç–∞–±–ª–∏—Ü–µ contract: 7043 
 --------------------------------------------------

–°—Ç—Ä—É–∫—Ç—É—Ä–∞ —Ç–∞–±–ª–∏—Ü—ã personal:
----------------------------------------
customerID: TEXT not PK
gender: TEXT not PK
SeniorCitizen: TEXT not PK
Partner: TEXT not PK
Dependents: TEXT not PK
---------------------------------------- 

–°—Ç—Ä–æ–∫ –≤ —Ç–∞–±–ª–∏—Ü–µ personal: 7043 
 --------------------------------------------------

–°—Ç—Ä—É–∫—Ç—É—Ä–∞ —Ç–∞–±–ª–∏—Ü—ã internet:
----------------------------------------
customerID: TEXT not PK
InternetService: TEXT not PK
OnlineSecurity: TEXT not PK
OnlineBackup: TEXT not PK
DeviceProtection: TEXT not PK
TechSupport:

In [130]:
# Function that builds per-feature statistics and sorts the features into categories
def external_info(df, threshold=10, density_threshold=.01):
    """Build a one-row-per-column summary of ``df``.

    For every column the summary holds: name, dtype, number of unique values
    (``Cardinality``), unique values divided by row count rounded to two
    decimals (``Card. density``), missing count and share, the first mode,
    a type label and max/min/median/mean.

    Type label rules:
        * numeric dtype: ``'discrete'`` when the unrounded density is below
          ``density_threshold`` or the cardinality is below ``threshold``,
          otherwise ``'continuous'``;
        * string dtype: ``'categorical'``;
        * anything else: ``'unknown'``.
    Non-numeric columns get ``"-"`` in the numeric statistics.

    Args:
        df: DataFrame to describe.
        threshold: cardinality below which a numeric column is discrete.
        density_threshold: density below which a numeric column is discrete.

    Returns:
        pd.DataFrame with one row per column of ``df`` (empty when ``df`` has
        no columns).
    """
    numeric_stats = ['max', 'min', 'median', 'mean']
    n_rows = len(df)
    stats = []
    for feature in df.columns:
        feature_series = df[feature]
        nunique = feature_series.nunique()
        # An empty frame has no rows to divide by: report zero density
        # instead of raising ZeroDivisionError.
        raw_density = nunique / n_rows if n_rows else 0.0
        missing_mask = feature_series.isna()
        # mode() is empty for a column without any non-missing value.
        modes = feature_series.mode()

        feature_stats = {
            'feature': feature,
            'dtype': feature_series.dtype,
            'Cardinality': nunique,
            'Card. density': round(raw_density, 2),
            'Missing': missing_mask.sum(),
            'Missing %': missing_mask.mean(),
            'Mode': modes.iloc[0] if not modes.empty else None
        }
        if pd.api.types.is_numeric_dtype(feature_series):
            # Compare the unrounded density: the value rounded for display would
            # turn e.g. 0.007 into 0.01 and flip the decision at the default threshold.
            is_discrete = raw_density < density_threshold or nunique < threshold
            feature_stats['Type'] = 'discrete' if is_discrete else 'continuous'
            for stat in numeric_stats:
                try:
                    feature_stats[stat.capitalize()] = getattr(feature_series, stat)()
                except (AttributeError, TypeError):
                    feature_stats[stat.capitalize()] = '–ù–µ –Ω–∞–π–¥–µ–Ω'
        else:
            is_string = pd.api.types.is_string_dtype(feature_series)
            feature_stats['Type'] = 'categorical' if is_string else 'unknown'
            for stat in numeric_stats:
                feature_stats[stat.capitalize()] = "-"
        stats.append(feature_stats)
    return pd.DataFrame(stats)

In [140]:
# Select every row and column of the first table in DB_TABLE_NAMES;
# `query` is reused by the next cell.
query = text(f"SELECT * FROM {DB_TABLE_NAMES[0]}")
pd.read_sql_query(query, con = engine)

Unnamed: 0,customerID,BeginDate,EndDate,Type,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,7590-VHVEG,2020-01-01,No,Month-to-month,Yes,Electronic check,29.85,31.04
1,5575-GNVDE,2017-04-01,No,One year,No,Mailed check,56.95,2071.84
2,3668-QPYBK,2019-10-01,No,Month-to-month,Yes,Mailed check,53.85,226.17
3,7795-CFOCW,2016-05-01,No,One year,No,Bank transfer (automatic),42.3,1960.6
4,9237-HQITU,2019-09-01,No,Month-to-month,Yes,Electronic check,70.7,353.5
...,...,...,...,...,...,...,...,...
7038,6840-RESVB,2018-02-01,No,One year,Yes,Mailed check,84.8,2035.2
7039,2234-XADUH,2014-02-01,No,One year,Yes,Credit card (automatic),103.2,7430.4
7040,4801-JZAZL,2019-03-01,No,Month-to-month,Yes,Electronic check,29.6,325.6
7041,8361-LTMKD,2019-07-01,No,Month-to-month,Yes,Mailed check,74.4,520.8


In [142]:
# Summarize the table selected by `query`.
# NOTE(review): every column comes back with dtype object (see the output below),
# so MonthlyCharges and TotalCharges are labeled 'categorical' and get no numeric
# statistics until they are cast to a numeric dtype.
external_info(pd.read_sql_query(query, con = engine))

Unnamed: 0,feature,dtype,Cardinality,Card. density,Missing,Missing %,Mode,Type,Max,Min,Median,Mean
0,customerID,object,7043,1.0,0,0.0,0002-ORFBO,categorical,-,-,-,-
1,BeginDate,object,77,0.01,0,0.0,2014-02-01,categorical,-,-,-,-
2,EndDate,object,67,0.01,0,0.0,No,categorical,-,-,-,-
3,Type,object,3,0.0,0,0.0,Month-to-month,categorical,-,-,-,-
4,PaperlessBilling,object,2,0.0,0,0.0,Yes,categorical,-,-,-,-
5,PaymentMethod,object,4,0.0,0,0.0,Electronic check,categorical,-,-,-,-
6,MonthlyCharges,object,1585,0.23,0,0.0,20.05,categorical,-,-,-,-
7,TotalCharges,object,6658,0.95,0,0.0,,categorical,-,-,-,-
