In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

In [3]:
# Colab-only: mount Google Drive at /content/drive so the files stored
# there can be read like local files by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Path of the input CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/ADSDB/exploitation/data.csv'

In [5]:
# Load the CSV into a DataFrame; `data` is the frame passed to the
# quality report and plotting calls further down.
data = pd.read_csv(file_path)

In [6]:
def check_data_quality(df, mild_outlier_threshold=1.5, severe_outlier_threshold=3):
    """Print a plain-text data-quality report for ``df``.

    The report is written to standard output in this order:

    1. ``DataFrame.info()`` summary.
    2. Number of missing values per column.
    3. Number of fully duplicated rows.
    4. Dtype of every column.
    5. ``DataFrame.describe()`` statistics.
    6. Unique values of every ``object`` column.
    7. IQR-based outlier counts for every numeric column.

    A value is counted as an outlier when it lies below
    ``Q1 - threshold * IQR`` or above ``Q3 + threshold * IQR``. The mild
    count uses ``mild_outlier_threshold`` and the severe count uses
    ``severe_outlier_threshold``; with the default thresholds every severe
    outlier is therefore also included in the mild count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    mild_outlier_threshold : float, default 1.5
        IQR multiplier used for the "Mild" outlier fences.
    severe_outlier_threshold : float, default 3
        IQR multiplier used for the "Severe" outlier fences.

    Returns
    -------
    None
        The function only prints.
    """
    print("DataFrame Info:")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() -- that would add a stray "None" line.
    df.info()

    print("\nMissing Values:")
    print(df.isnull().sum())

    print("\nDuplicate Rows:")
    duplicate_counts = df.duplicated().sum()
    print(f"Total number of duplicated rows: {duplicate_counts}")

    print("\nData Types:")
    print(df.dtypes)

    print("\nBasic Statistics:")
    print(df.describe())

    for column in df.select_dtypes(include=['object']).columns:
        unique_values = df[column].unique()
        print(f"\nUnique values in {column}: {unique_values}")

    # select_dtypes already restricts the loop to numeric columns, so no
    # further dtype check is needed inside it.
    for column in df.select_dtypes(include=['number']).columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        # Count via boolean masks instead of materialising filtered copies
        # of the whole frame just to take their length.
        mild_mask = ((values < q1 - mild_outlier_threshold * iqr) |
                     (values > q3 + mild_outlier_threshold * iqr))
        severe_mask = ((values < q1 - severe_outlier_threshold * iqr) |
                       (values > q3 + severe_outlier_threshold * iqr))

        print(f"\nOutliers in {column} (Mild): {int(mild_mask.sum())}")
        print(f"\nOutliers in {column} (Severe): {int(severe_mask.sum())}\n")

In [7]:
def plot_pie_chart(df, column_name):
    """Show a pie chart of how often each value occurs in ``df[column_name]``.

    Slice names are the distinct values returned by ``value_counts()`` and
    slice sizes are their counts. The legend is switched on explicitly and
    the figure is displayed immediately; nothing is returned.
    """
    counts = df[column_name].value_counts()
    chart_title = f'Distribution of Unique Values in {column_name}'

    pie = px.pie(names=counts.index, values=counts.values, title=chart_title)
    pie.update_layout(showlegend=True)
    pie.show()

In [8]:
def plot_pie_charts(df):
    """Call ``plot_pie_chart`` for every column of ``df`` whose dtype is ``object``.

    Columns are visited in frame order; columns of any other dtype are
    skipped.
    """
    column_dtypes = df.dtypes
    for name in df.columns:
        is_object_column = column_dtypes[name] == object
        if not is_object_column:
            continue
        plot_pie_chart(df, name)

In [9]:
def plot_histogram(df, column_name):
    """Show a histogram of ``df[column_name]`` with a box plot in the margin.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data to plot.
    column_name : str
        Name of the column placed on the x axis.

    Returns
    -------
    None
        The figure is displayed immediately.
    """
    # The x-axis title defaults to the column name, so no `labels` mapping is
    # needed. plotly express `labels` keys must be column names: the literal
    # keys 'x'/'y' never renamed the axes (and would mislabel a column that
    # is actually called 'x' or 'y').
    fig = px.histogram(df, x=column_name,
                       title=f'Histogram of {column_name}',
                       marginal='box')
    # Set the count-axis title on the layout, which is what the old
    # `'y': 'Frequency'` label entry was meant to do.
    fig.update_layout(bargap=0.1, yaxis_title_text='Frequency')
    fig.show()

In [10]:
def plot_histograms(df):
    """Call ``plot_histogram`` for every numeric column of ``df``.

    Numeric columns are chosen with ``select_dtypes(include=['number'])``,
    the same selection ``check_data_quality`` uses for its outlier report.
    Comparing each dtype against the builtins ``float``/``int`` only matched
    ``float64`` and the platform's default integer width, so columns such as
    ``int32``, ``float32`` or unsigned integers were silently skipped.

    Columns are visited in frame order. Returns None.
    """
    for column in df.select_dtypes(include=['number']).columns:
        plot_histogram(df, column)

In [11]:
def plot_visuals(df):
    """Render every chart for ``df``: histograms first, then pie charts."""
    # Histogram pass.
    plot_histograms(df)
    # Pie-chart pass.
    plot_pie_charts(df)

In [12]:
# Print the text data-quality report for the loaded dataset
# (default outlier thresholds).
check_data_quality(data)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27483 entries, 0 to 27482
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    27483 non-null  int64  
 1   Quarter                 27483 non-null  object 
 2   Provinces               27483 non-null  object 
 3   Sex                     27483 non-null  object 
 4   activity_rate           27483 non-null  float64
 5   inflation_rate          27483 non-null  float64
 6   house_price_index_type  27483 non-null  object 
 7   house_price_index       27483 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 1.7+ MB
None

Missing Values:
Year                      0
Quarter                   0
Provinces                 0
Sex                       0
activity_rate             0
inflation_rate            0
house_price_index_type    0
house_price_index         0
dtype: int64

Duplicate Rows:
Total nu

In [13]:
# Render the histogram and pie-chart overview of the loaded dataset.
plot_visuals(data)