In [None]:
import pandas as pd
import numpy as np
from locale import atof, setlocale, LC_NUMERIC, LC_ALL
import matplotlib.pyplot as plt
import matplotlib_inline
import seaborn as sns
import scipy
from scipy.stats import norm
setlocale(LC_ALL, 'fr_FR.UTF-8')

'fr_FR.UTF-8'

In [74]:
df = pd.read_csv("aggregatedfile.csv",index_col=0, low_memory=False)

In [76]:
df = df[df.dependance.apply(lambda x: x != "['Dépendance']")].reset_index(drop=True)


In [77]:
df.shape

(498650, 17)

In [40]:
def feature_generation (df):
    # convert the 'Date' column to datetime format
    df['Date_YYYY-MM'] = pd.to_datetime(df['Date mutation'],
                                        format="%d/%m/%Y").dt.to_period('M')
    df= df.drop(['Date mutation'], axis = 1)
    ## attention à ne faire qu'après avoir enrichi avec variables insee
    dict_type_voie = dict()
    for value in df['type_de_voie'].value_counts()[
            df['type_de_voie'].value_counts() < 300].index.values:
        dict_type_voie[value] = 'Autres'
    df = df.replace({'type_de_voie': dict_type_voie})
    return df

In [41]:
df = feature_generation(df)

In [64]:
df['Valeur fonciere']= df['Valeur fonciere'].apply(lambda x: atof(x))

In [66]:
# Calculate the z-score from scratch
standard_deviation = df['Valeur fonciere'].std(ddof=0)
mean_value = df['Valeur fonciere'].mean()
zscores = [(value - mean_value) / standard_deviation
           for value in df['Valeur fonciere']]
df['zscores']= zscores


In [81]:
def outlier_inspect(df,
                    col,
                    min_z=1,
                    max_z=5,
                    step=0.2,
                    max_hist=None,
                    bins=50):
    fig = plt.figure(figsize=(20, 6))
    fig.suptitle(col, fontsize=16)
    plt.subplot(1, 3, 1)
    if max_hist == None:
        sns.histplot(df[col], kde=False, bins=50, color="r")
    else:
        sns.distplot(df[df[col] <= max_hist][col], kde=False, bins=50)
    plt.subplot(1, 3, 2)
    sns.boxplot(df[col])
    plt.subplot(1, 3, 3)
    z_score_inspect = outlier_zscore(df,
                                     col,
                                     min_z=min_z,
                                     max_z=max_z,
                                     step=step)
    plt.show()

In [82]:
def outlier_zscore(df, col, min_z=1, max_z=5, step=0.1, print_list=False):
    z_scores = df["zscores"]
    threshold_list = []
    for threshold in np.arange(min_z, max_z, step):
        threshold_list.append(
            (threshold, len(np.where(z_scores > threshold)[0])))
        df_outlier = pd.DataFrame(threshold_list,
                                  columns=['threshold', 'outlier_count'])
        df_outlier['pct'] = (df_outlier.outlier_count -
                             df_outlier.outlier_count.shift(-1)
                             ) / df_outlier.outlier_count * 100
    plt.plot(df_outlier.threshold, df_outlier.outlier_count)
    best_treshold = round(df_outlier.iloc[df_outlier.pct.argmax(), 0], 2)
    outlier_limit = int(df[col].dropna().mean() + (df[col].dropna().std()) *
                        df_outlier.iloc[df_outlier.pct.argmax(), 0])
    percentile_threshold = stats.percentileofscore(df[col].dropna(),
                                                   outlier_limit)
    plt.vlines(best_treshold,
               0,
               df_outlier.outlier_count.max(),
               colors="r",
               ls=":")
    plt.annotate(
        "Zscore : {}\nValue : {}\nPercentile : {}".format(
            best_treshold, outlier_limit,
            (np.round(percentile_threshold,
                      3), np.round(100 - percentile_threshold, 3))),
        (best_treshold, df_outlier.outlier_count.max() / 2))
    #plt.show()
    if print_list:
        print(df_outlier)
    return (plt, df_outlier, best_treshold, outlier_limit,
            percentile_threshold)


In [83]:
outlier_inspect(df, 'Valeur fonciere')



TypeError: Horizontal orientation requires numeric `x` variable.

Error in callback <function flush_figures at 0x7fb8acb048b0> (for post_execute):


In [71]:
df["outlier"] = (abs(df["zscores"]) > 2).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["outlier"] = (abs(df["zscores"]) > 2).astype(int)


In [72]:
df=df[df['outlier'] == 0]

In [73]:
df

Unnamed: 0,parcelle_cad_section,Valeur fonciere,B_T_Q,type_de_voie,voie,code_postal,commune,clean_code_departement,clean_code_commune,surface_carrez_lot_1,Nb_lots,surface_terrain,surface_reelle_bati,nb_pieces_principales,dependance,main_type_terrain,Date_YYYY-MM,zscores,outlier
0,01001000ZE,127000.0,,RTE,DU MONT,1400.0,L'ABERGEMENT-CLEMENCIAT,01,1,0.0,0,130.0,42.0,2.0,['Dépendance' 'Maison'],S,2021-07,-0.051438,0
1,01001000ZH,465225.0,,RTE,DE LA FONTAINE,1400.0,L'ABERGEMENT-CLEMENCIAT,01,1,0.0,0,3000.0,188.0,8.0,['Maison'],S,2021-02,0.034003,0
2,01001000ZH,120000.0,,RUE,DE MUNETVILLE,1400.0,L'ABERGEMENT-CLEMENCIAT,01,1,0.0,0,985.0,75.0,4.0,['Maison'],S,2021-07,-0.053206,0
3,01001000ZH,329000.0,,RUE,DU STADE,1400.0,L'ABERGEMENT-CLEMENCIAT,01,1,0.0,0,1078.0,118.0,5.0,['Dépendance' 'Maison'],S,2021-04,-0.000410,0
4,01001000ZH,198000.0,,RUE,DES MURIERS,1400.0,L'ABERGEMENT-CLEMENCIAT,01,1,0.0,0,755.0,76.0,4.0,['Maison'],S,2021-06,-0.033502,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498645,974024000A,180000.0,,IMP,ETHEVE,97413.0,CILAOS,974,24,0.0,0,936.0,78.0,4.0,['Maison'],S,2021-10,-0.038049,0
498646,974024000A,220000.0,,CHE,TERRE BLANCHE,97413.0,CILAOS,974,24,0.0,0,10500.0,90.0,4.0,['Maison'],T,2021-04,-0.027944,0
498647,974024000A,61000.0,,PAS,DES MARCHES,97413.0,CILAOS,974,24,0.0,0,375.0,30.0,4.0,['Dépendance' 'Maison'],S,2021-05,-0.068110,0
498648,974024000A,148850.0,,RUE,DES GLYCINES,97413.0,CILAOS,974,24,0.0,0,177.0,60.0,6.0,['Maison'],S,2021-05,-0.045918,0
