In [16]:
import pandas as pd 
import numpy as np 
from datetime import datetime, timedelta
import random
from tqdm import tqdm
import os
from desc_by_py import *

%matplotlib inline

In [None]:
def data_analysis(data, year, res_path):
    """Summarise the observation window and failure status of every
    (year, model, serial_number) group found in ``data``.

    :param data: frame with at least the columns ``date``, ``model``,
                 ``serial_number`` and ``failure``
    :type data: pd.DataFrame
    :param year: label written into the ``year`` column of ``data``
    :param res_path: output directory prefix; currently unused because the
                     CSV export below is commented out

    :return: one row per (year, model, serial_number) with the columns
             ``first_time_seen``, ``last_time_seen``, ``failure`` and
             ``failure_date`` (missing when the serial number never has
             ``failure == 1``)
    :rtype: pd.DataFrame
    """
    # NOTE: this adds/overwrites the 'year' column on the caller's frame,
    # exactly as the original code did.
    data['year'] = year

    # Extract the time features from the frame that was passed in (the
    # previous version always read the global `data_2015`).
    group_keys = ['year', 'model', 'serial_number']
    grouped = data.groupby(group_keys)
    df = pd.DataFrame({
        'first_time_seen': grouped['date'].min(),
        'last_time_seen': grouped['date'].max(),
        'failure': grouped['failure'].max(),
    }).reset_index()  # keep the group keys as columns: the merge needs them

    # Earliest date on which each serial number is flagged as failed.
    tmp_failure = (
        data[data.failure == 1]
        .groupby(['serial_number'])['date']
        .min()
        .rename('failure_date')
        .reset_index()
    )

    stats_data = pd.merge(df, tmp_failure, how='left', on='serial_number')

    # stats_data.to_csv(res_path+'stats_'+year+'.csv',index=False)

    return stats_data


def convert_str2date(line):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime`` object.

    :param line: date text such as ``"2015-01-31"``
    :return: the corresponding ``datetime`` (time part is midnight)
    :raises ValueError: if ``line`` does not match the expected layout
    """
    day_layout = "%Y-%m-%d"
    parsed = datetime.strptime(line, day_layout)
    return parsed


def convert_date2str(d):
    """Format a date-like object as a ``YYYY-MM-DD`` string.

    :param d: any object exposing ``strftime`` (e.g. ``datetime``)
    :return: the formatted date text
    """
    day_layout = "%Y-%m-%d"
    rendered = d.strftime(day_layout)
    return rendered

def remove_days(date, days):
    """Shift a ``YYYY-MM-DD`` date string back by ``days`` days.

    :param date: date text accepted by ``convert_str2date``
    :param days: number of days to subtract
    :return: the shifted date, formatted by ``convert_date2str``
    """
    shifted = convert_str2date(date) - timedelta(days=days)
    return convert_date2str(shifted)


In [None]:
# Build one summary table per year with data_analysis.
# NOTE(review): data_2015 .. data_2018 and res_path are not defined in the
# cells visible here - presumably an earlier cell loads them; confirm that
# the notebook survives "Restart & Run All".
stats_2015, stats_2016, stats_2017, stats_2018 = [
    data_analysis(yearly_frame, year_label, res_path)
    for yearly_frame, year_label in (
        (data_2015, '2015'),
        (data_2016, '2016'),
        (data_2017, '2017'),
        (data_2018, '2018'),
    )
]

In [26]:
import re
import math
import sys
import time
from operator import itemgetter
import copy

import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt


def _num_dd(varseries):
    """
        :param varseries: 
        :type varseries: pd.Series

        :return: 返回一个字典, 字典的 key 有 
                ks-pvalue, max-or-bot1, mean-or-top1, median-or-bot5, min-or-top2, n-miss, n-unique, n-valid
                 'p1-or-top3', 'p25-or-top5', 'p5-or-top4', 'p75-or-bot4': 3.0, 'p95-or-bot3', 'p99-or-bot2'
        :rtype: dict
    """
    statsDict = {}
    statsDict['n-valid'] = varseries.count()
    statsDict['n-miss'] = len(varseries) - varseries.count()
    statsDict['n-unique'] = len(varseries.unique())

    if len(varseries.unique()) == 1 and str(varseries.unique()[0]) == 'nan':
        statsDict['mean-or-top1'] = '.'
        statsDict['min-or-top2'] = '.'
        statsDict['p1-or-top3'] = '.'
        statsDict['p5-or-top4'] = '.'
        statsDict['p25-or-top5'] = '.'
        statsDict['median-or-bot5'] = '.'
        statsDict['p75-or-bot4'] = '.'
        statsDict['p95-or-bot3'] = '.'
        statsDict['p99-or-bot2'] = '.'
        statsDict['max-or-bot1'] = '.'
        statsDict['ks-pvalue'] = '.'
    else:
        statsDict['mean-or-top1'] = varseries.mean()
        statsDict['min-or-top2'] = varseries.min()

        temp = varseries[varseries.notnull()]
        statsDict['p1-or-top3'] = temp.quantile(0.01)
        statsDict['p5-or-top4'] = temp.quantile(0.05)
        statsDict['p25-or-top5'] = temp.quantile(0.25)
        statsDict['median-or-bot5'] = temp.quantile(0.5)
        statsDict['p75-or-bot4'] = temp.quantile(0.75)
        statsDict['p95-or-bot3'] = temp.quantile(0.95)
        statsDict['p99-or-bot2'] = temp.quantile(0.99)
        statsDict['max-or-bot1'] = varseries.max()
        mu = varseries.mean()
        sigma = np.std(varseries)
        stat_val, p_value = stats.kstest(temp, 'norm', (mu, sigma))
        statsDict['ks-pvalue'] = str(p_value)
    return statsDict


def _char_dd(varseries):
    """
        :param varseries: 
        :type varseries: pd.Series
    """
    statsDict = {}
    varseriesFillna = varseries.fillna('__NULL__')
    freqStatsDf = pd.DataFrame(varseriesFillna.value_counts())
    freqStatsDf.columns = ['count']
    freqStatsDf.sort_values(by='count',
                            ascending=False,
                            inplace=True)
    if '__NULL__' in freqStatsDf.index:
        statsDict['n-miss'] = freqStatsDf.iloc['__NULL__', 'count']
        statsDict['n-valid'] = freqStatsDf['count'].sum() - \
            freqStatsDf.iloc['__NULL__', 'count']
    else:
        statsDict['n-miss'] = 0
        statsDict['n-valid'] = freqStatsDf['count'].sum()

    statsDict['n-unique'] = len(freqStatsDf.index.unique())
    charStatList = ['mean-or-top1', 'min-or-top2', 'p1-or-top3', 'p5-or-top4', 'p25-or-top5',
                    'median-or-bot5', 'p75-or-bot4', 'p95-or-bot3', 'p99-or-bot2', 'max-or-bot1']
    topCatStats = freqStatsDf[:5]
    bottomCatStats = freqStatsDf[-5:]

    for cherry in range(len(topCatStats)):
        statsDict[charStatList[cherry]] = freqStatsDf.iloc[cherry].name + \
            "::" + str(freqStatsDf.iloc[cherry, 'count'])

    for cherry in range(len(bottomCatStats)):
        cherry -= len(bottomCatStats)
        statsDict[charStatList[cherry]] = freqStatsDf.iloc[cherry].name + \
            "::" + str(freqStatsDf.iloc[cherry, 'count'])

    statsDict['ks-pvalue'] = '.'
    return statsDict


def py_data_desc(df):
    """Build a descriptive summary with one row per column of ``df``.

    Columns whose dtype is ``object`` or ``datetime64[ns]`` are described by
    ``_char_dd`` (frequency based); every other column is described by
    ``_num_dd`` (moment / quantile based).

    :param df: the frame to describe
    :type df: pd.DataFrame

    :return: a frame with the columns variable, n-miss, n-unique, n-valid,
             mean-or-top1, min-or-top2, p1-or-top3, p5-or-top4, p25-or-top5,
             median-or-bot5, p75-or-bot4, p95-or-bot3, p99-or-bot2,
             max-or-bot1, ks-pvalue; text columns first, then numeric ones
    :rtype: pd.DataFrame

    todo:
    a stronger data type classification is needed
    """
    output_columns = ["variable", "n-miss", "n-unique", "n-valid", "mean-or-top1", "min-or-top2", "p1-or-top3", "p5-or-top4", "p25-or-top5", "median-or-bot5", "p75-or-bot4", "p95-or-bot3", "p99-or-bot2", "max-or-bot1", "ks-pvalue"]

    # df = df[np.isnan(df["serial_number"])==False]
    dtypes = df.dtypes
    is_text = dtypes.isin([np.dtype("O"), np.dtype("<M8[ns]")])

    char_lst = list(dtypes[is_text].index)
    num_lst = list(dtypes[~is_text].index)

    result = []
    for x in char_lst:
        column = df[x]
        # Stringify the values but keep missing entries missing; a plain
        # astype(str) turns them into literal text and hides them from the
        # n-miss count.
        tmp = _char_dd(column.astype(str).where(column.notna()))
        tmp["variable"] = x
        result.append(tmp)

    for x in num_lst:
        tmp = _num_dd(df[x])
        tmp["variable"] = x
        result.append(tmp)

    # reindex instead of [[...]] so a statistic that no column produced
    # (or a frame without columns) yields an empty column, not a KeyError.
    return pd.DataFrame(result).reindex(columns=output_columns)


######################################################################################

def ex_sinple():
    """Print the description of a tiny two-column demo frame
    (one text column ``a``, one integer column ``t``)."""
    letters = ["b", "c", "d", "e", "f"]
    demo = pd.DataFrame({"a": letters, "t": range(5)})
    summary = py_data_desc(demo)
    print(summary)


def app_ex(src_path="../res/train_data_2015_2020.csv",
           dst_path="../res/train_data_2015_2020_desc.csv"):
    """Describe a CSV data set, print the summary and save it as CSV.

    :param src_path: CSV file to read (UTF-8); defaults to the training set
    :param dst_path: CSV file the description is written to (UTF-8, the
                     frame index is written as well)
    """
    df = pd.read_csv(src_path, encoding="utf-8")
    # Compute the description once and reuse it for both the printout and
    # the export (it used to be computed twice).
    dft = py_data_desc(df)
    print(dft)
    dft.to_csv(dst_path, encoding="utf-8")


In [None]:
# Run the description on ../res/train_data_2015_2020.csv: prints the summary
# and writes it to ../res/train_data_2015_2020_desc.csv.
app_ex()